[Pkg-ceph-commits] [ceph] 02/07: Imported Upstream version 10.1.1

James Downing Page jamespage at moszumanska.debian.org
Fri Apr 8 15:35:38 UTC 2016


This is an automated email from the git hooks/post-receive script.

jamespage pushed a commit to branch ubuntu-xenial
in repository ceph.

commit d0f518a027b607f737398e81da632cfbb7b4c454
Author: James Page <james.page at ubuntu.com>
Date:   Fri Apr 8 12:09:17 2016 +0100

    Imported Upstream version 10.1.1
---
 AUTHORS                                            |  17 +-
 ChangeLog                                          | 369 ++++++++-
 ceph.spec                                          |  59 +-
 ceph.spec.in                                       |  57 +-
 configure                                          |  48 +-
 configure.ac                                       |  10 +-
 doc/Makefile.am                                    |   3 +-
 doc/Makefile.in                                    |   3 +-
 doc/man/8/rbdmap.rst                               |  48 ++
 etc/sysconfig/ceph                                 |   2 +-
 man/Makefile-client.am                             |   3 +-
 man/Makefile.in                                    |   3 +-
 man/ceph-authtool.8                                |   2 +-
 man/ceph-clsinfo.8                                 |   2 +-
 man/ceph-conf.8                                    |   2 +-
 man/ceph-create-keys.8                             |   2 +-
 man/ceph-debugpack.8                               |   2 +-
 man/ceph-dencoder.8                                |   2 +-
 man/ceph-deploy.8                                  |   2 +-
 man/ceph-detect-init.8                             |   2 +-
 man/ceph-disk.8                                    |   2 +-
 man/ceph-fuse.8                                    |   2 +-
 man/ceph-mds.8                                     |   2 +-
 man/ceph-mon.8                                     |   2 +-
 man/ceph-osd.8                                     |   2 +-
 man/ceph-post-file.8                               |   2 +-
 man/ceph-rbdnamer.8                                |   2 +-
 man/ceph-rest-api.8                                |   2 +-
 man/ceph-run.8                                     |   2 +-
 man/ceph-syn.8                                     |   2 +-
 man/ceph.8                                         |   2 +-
 man/cephfs.8                                       |   2 +-
 man/crushtool.8                                    |   2 +-
 man/librados-config.8                              |   2 +-
 man/monmaptool.8                                   |   2 +-
 man/mount.ceph.8                                   |   2 +-
 man/osdmaptool.8                                   |   2 +-
 man/rados.8                                        |   2 +-
 man/radosgw-admin.8                                |   2 +-
 man/radosgw.8                                      |   2 +-
 man/rbd-fuse.8                                     |   2 +-
 man/rbd-mirror.8                                   |   2 +-
 man/rbd-nbd.8                                      |   2 +-
 man/rbd-replay-many.8                              |   2 +-
 man/rbd-replay-prep.8                              |   2 +-
 man/rbd-replay.8                                   |   2 +-
 man/rbd.8                                          |   2 +-
 man/{ceph-run.8 => rbdmap.8}                       |  31 +-
 src/.git_version                                   |   4 +-
 src/Makefile-env.am                                |   5 +-
 src/Makefile.am                                    |   5 +
 src/Makefile.in                                    | 286 +++++--
 src/bash_completion/ceph                           |  95 +--
 src/ceph-detect-init/Makefile.am                   |   2 +-
 .../ceph_detect_init/debian/__init__.py            |  21 +
 src/ceph-detect-init/run-tox.sh                    |   5 +
 src/ceph-disk/Makefile.am                          |   2 +-
 src/ceph.in                                        |  98 ++-
 src/client/Client.cc                               |  44 +-
 src/client/Client.h                                |   2 +-
 src/client/SyntheticClient.cc                      |   5 +-
 src/cls/journal/cls_journal.cc                     |  20 +-
 src/cls/rbd/cls_rbd.cc                             | 195 +++--
 src/cls/rbd/cls_rbd_client.cc                      |  42 +-
 src/cls/rbd/cls_rbd_client.h                       |  10 +-
 src/cls/rgw/cls_rgw.cc                             |  20 +-
 src/cls/rgw/cls_rgw_client.cc                      |   3 +-
 src/cls/rgw/cls_rgw_client.h                       |   2 +-
 src/cls/rgw/cls_rgw_ops.h                          |   9 +-
 src/common/Cycles.cc                               |  11 +-
 src/common/TrackedOp.cc                            |  26 +-
 src/common/TrackedOp.h                             |  36 +-
 src/common/config.h                                |   8 +-
 src/common/config_opts.h                           |  36 +-
 src/crush/CrushWrapper.cc                          |   2 +-
 src/global/global_init.cc                          |   9 +-
 src/global/signal_handler.cc                       |   9 +-
 src/include/ceph_fs.h                              |   2 +-
 src/include/rados/rgw_file.h                       |  27 +-
 src/journal/JournalMetadata.cc                     | 122 +++
 src/journal/JournalMetadata.h                      |   3 +
 src/journal/Journaler.cc                           |  10 +
 src/journal/Journaler.h                            |   5 +-
 src/libcephfs.cc                                   |   7 +-
 src/librbd/ExclusiveLock.cc                        |   3 +-
 src/librbd/ImageCtx.cc                             |  33 +
 src/librbd/ImageCtx.h                              |  22 +
 src/librbd/ImageWatcher.cc                         |   5 +-
 src/librbd/Journal.cc                              | 249 ++++--
 src/librbd/Journal.h                               |  29 +-
 src/librbd/Makefile.am                             |  13 +
 src/librbd/MirroringWatcher.cc                     | 121 +++
 src/librbd/MirroringWatcher.h                      |  72 ++
 src/librbd/ObjectWatcher.cc                        | 348 +++++++++
 src/librbd/ObjectWatcher.h                         | 155 ++++
 src/librbd/WatchNotifyTypes.cc                     |   9 +-
 src/librbd/WatchNotifyTypes.h                      |   5 +-
 src/librbd/exclusive_lock/AcquireRequest.cc        |  15 +-
 src/librbd/exclusive_lock/Policy.h                 |  20 +
 src/librbd/exclusive_lock/StandardPolicy.cc        |  21 +
 src/librbd/exclusive_lock/StandardPolicy.h         |  30 +
 src/librbd/internal.cc                             | 714 +++++++++++++-----
 src/librbd/internal.h                              |   7 +
 src/librbd/journal/Policy.h                        |  24 +
 src/librbd/journal/Replay.cc                       |   9 +
 src/librbd/journal/Replay.h                        |   2 +
 src/librbd/journal/StandardPolicy.cc               |  34 +
 src/librbd/journal/StandardPolicy.h                |  30 +
 src/librbd/journal/TypeTraits.h                    |  26 +
 src/librbd/journal/Types.cc                        |  40 +
 src/librbd/journal/Types.h                         |  20 +-
 src/librbd/mirroring_watcher/Types.cc              | 160 ++++
 src/librbd/mirroring_watcher/Types.h               | 102 +++
 src/mds/Beacon.cc                                  |   9 +-
 src/mds/Beacon.h                                   |   7 +-
 src/mds/CInode.cc                                  |  10 +-
 src/mds/CInode.h                                   |   4 +
 src/mds/FSMap.cc                                   |  56 +-
 src/mds/FSMap.h                                    |   6 +-
 src/mds/MDCache.cc                                 |   5 +
 src/mds/MDSDaemon.cc                               |   4 +-
 src/mds/MDSMap.cc                                  |  75 +-
 src/mds/MDSMap.h                                   |   5 +-
 src/mds/MDSRank.cc                                 |  14 +-
 src/mds/ScrubStack.cc                              |  10 +-
 src/mds/Server.cc                                  |  39 +-
 src/messages/MFSMap.h                              |  14 +-
 src/messages/MMDSBeacon.h                          |  16 +-
 src/messages/MOSDOp.h                              |  15 +-
 src/mon/MDSMonitor.cc                              |  43 +-
 src/mon/MDSMonitor.h                               |   4 +-
 src/mon/MonClient.cc                               |  36 +-
 src/mon/MonClient.h                                |   1 +
 src/mon/MonCommands.h                              |   7 +-
 src/mon/Monitor.cc                                 |   4 +-
 src/mon/OSDMonitor.cc                              |  44 +-
 src/mon/PGMap.cc                                   |   6 +-
 src/mon/PGMonitor.cc                               | 117 +--
 src/mon/PGMonitor.h                                |   5 +-
 src/mon/Paxos.cc                                   |  19 -
 src/mon/Paxos.h                                    |   5 -
 src/mon/PaxosService.cc                            |  22 -
 src/mon/PaxosService.h                             |   7 -
 src/msg/Messenger.cc                               |   3 +-
 src/msg/async/AsyncConnection.cc                   |   6 +-
 src/msg/xio/XioMessenger.cc                        | 242 +++---
 src/msg/xio/XioMessenger.h                         |  12 +-
 src/objclass/class_api.cc                          |  14 +
 src/objclass/objclass.h                            |   4 +
 src/os/ObjectStore.h                               |  22 +-
 src/os/Transaction.cc                              |  11 +
 src/os/bluestore/BlueStore.cc                      | 449 +++++++----
 src/os/bluestore/BlueStore.h                       |  57 +-
 src/os/bluestore/FreelistManager.cc                |   8 +
 src/os/bluestore/KernelDevice.cc                   |   8 +-
 src/os/bluestore/bluefs_tool.cc                    |  12 +-
 src/os/filestore/FileStore.cc                      |   6 +
 src/osd/ClassHandler.cc                            |   6 +-
 src/osd/OSD.cc                                     | 218 +++---
 src/osd/OSD.h                                      |   7 +-
 src/osd/OSDMap.cc                                  |   4 +-
 src/osd/PG.cc                                      | 100 ++-
 src/osd/PG.h                                       |  16 +-
 src/osd/ReplicatedPG.cc                            |  72 +-
 src/osd/osd_types.cc                               |  15 +-
 src/osd/osd_types.h                                |  10 +-
 src/osdc/Objecter.cc                               |  22 +-
 src/osdc/Striper.cc                                |   2 +-
 src/pybind/Makefile.am                             |   5 +-
 src/pybind/ceph_argparse.py                        |  22 +
 src/pybind/cephfs/cephfs.pyx                       |  54 +-
 src/pybind/cephfs/setup.py                         |   3 +-
 src/pybind/rados/rados.pyx                         |   2 +-
 src/pybind/rados/setup.py                          |   3 +-
 src/pybind/rbd/setup.py                            |   3 +-
 src/rbdmap                                         |  28 +-
 src/rgw/rgw_admin.cc                               |  76 +-
 src/rgw/rgw_civetweb.cc                            |   9 +-
 src/rgw/rgw_cr_rados.cc                            |   1 +
 src/rgw/rgw_data_sync.cc                           |  76 +-
 src/rgw/rgw_file.cc                                | 255 ++++---
 src/rgw/rgw_file.h                                 |  60 +-
 src/rgw/rgw_op.cc                                  |   3 +-
 src/rgw/rgw_rados.cc                               | 182 +++--
 src/rgw/rgw_rados.h                                |  15 +-
 src/rgw/rgw_rest_client.cc                         |  36 +-
 src/rgw/rgw_rest_client.h                          |   2 +-
 src/rgw/rgw_rest_realm.cc                          |  16 +-
 src/rgw/rgw_rest_s3.cc                             |  20 +-
 src/rgw/rgw_sync.cc                                |   6 +-
 src/rgw/rgw_sync.h                                 |  15 +-
 src/script/subman                                  |  20 +
 src/stop.sh                                        |   2 +-
 src/test/Makefile-client.am                        |  15 +-
 src/test/Makefile.am                               |   1 +
 src/test/centos-6/ceph.spec.in                     |  57 +-
 src/test/centos-7/ceph.spec.in                     |  57 +-
 src/test/ceph_objectstore_tool.py                  |  12 +-
 src/test/cli/crushtool/check-invalid-map.t         |   3 +
 src/test/cli/radosgw-admin/help.t                  |  11 +-
 src/test/cli/rbd/help.t                            |   5 +-
 src/test/cls_journal/test_cls_journal.cc           |   7 +
 src/test/cls_rbd/test_cls_rbd.cc                   |  37 +-
 src/test/encoding/check-generated.sh               |  40 +-
 src/test/encoding/readable.sh                      | 107 ++-
 src/test/encoding/types.h                          |   2 +
 src/test/erasure-code/TestErasureCodePlugin.cc     |   8 +
 src/test/erasure-code/test-erasure-code.sh         |   2 +-
 src/test/fedora-21/ceph.spec.in                    |  57 +-
 src/test/librados/list.cc                          |  19 +
 src/test/librados/misc.cc                          |   3 +
 src/test/librados/pool.cc                          |   3 +-
 src/test/librados/tier.cc                          |  43 +-
 src/test/librados_test_stub/LibradosTestStub.cc    |  10 +
 src/test/librados_test_stub/MockTestMemIoCtxImpl.h |  22 +
 .../exclusive_lock/test_mock_AcquireRequest.cc     |  66 +-
 src/test/librbd/mock/MockImageCtx.cc               |  10 +
 src/test/librbd/mock/MockImageCtx.h                |  15 +
 src/test/librbd/mock/MockImageState.h              |   3 +
 src/test/librbd/mock/MockJournal.h                 |   7 +-
 src/test/librbd/mock/MockJournalPolicy.h           |  21 +
 src/test/librbd/mock/MockOperations.h              |   1 +
 src/test/librbd/test_MirroringWatcher.cc           | 100 +++
 src/test/librbd/test_main.cc                       |   2 +
 src/test/librbd/test_mirroring.cc                  | 251 ++++++-
 src/test/librbd/test_mock_Journal.cc               |   1 +
 src/test/librbd/test_mock_ObjectWatcher.cc         | 405 ++++++++++
 src/test/librgw_file_aw.cc                         |  29 +-
 src/test/librgw_file_cd.cc                         |  36 +-
 src/test/librgw_file_nfsns.cc                      |  74 +-
 src/test/mon/test_pool_quota.sh                    |  61 ++
 src/test/msgr/perf_msgr_client.cc                  |   1 +
 src/test/msgr/perf_msgr_server.cc                  |   1 -
 .../objectstore/ObjectStoreTransactionBenchmark.cc |   1 +
 src/test/objectstore/store_test.cc                 | 518 +++++++++----
 src/test/opensuse-13.2/ceph.spec.in                |  57 +-
 src/test/osd/osd-scrub-repair.sh                   |  49 +-
 src/test/perf_local.cc                             |   1 +
 src/test/rbd_mirror/image_replay.cc                |  12 +-
 .../image_sync/test_mock_SyncPointPruneRequest.cc  |   2 +
 src/test/rbd_mirror/mock/MockJournaler.cc          |   5 +
 src/test/rbd_mirror/mock/MockJournaler.h           | 107 +++
 src/test/rbd_mirror/test_ImageReplayer.cc          | 120 +--
 src/test/rbd_mirror/test_ImageSync.cc              |   1 +
 src/test/rbd_mirror/test_PoolWatcher.cc            |  18 +-
 src/test/rbd_mirror/test_mock_ImageReplayer.cc     | 161 ++++
 src/test/system/rados_delete_pools_parallel.cc     |   2 +-
 src/test/system/rados_list_parallel.cc             |   2 +-
 src/test/system/rados_open_pools_parallel.cc       |  22 +-
 src/test/system/st_rados_create_pool.cc            |  19 +-
 src/test/system/st_rados_create_pool.h             |   2 +
 src/test/system/st_rados_list_objects.cc           |   5 +-
 src/test/system/systest_runnable.h                 |   2 +-
 src/test/test_pool_create.sh                       |   2 +-
 src/tools/cephfs/DataScan.cc                       |  38 +-
 src/tools/cephfs/DataScan.h                        |   4 +-
 src/tools/cephfs/Dumper.h                          |   2 -
 src/tools/cephfs/MDSUtility.cc                     |   6 +-
 src/tools/cephfs/MDSUtility.h                      |   2 +-
 src/tools/crushtool.cc                             |   7 +-
 src/tools/rbd/Shell.cc                             |  33 +-
 src/tools/rbd/Shell.h                              |   5 +-
 src/tools/rbd/action/MirrorImage.cc                |   8 +-
 src/tools/rbd/rbd.cc                               |   2 +-
 src/tools/rbd_mirror/ImageReplayer.cc              | 832 +++++++++++----------
 src/tools/rbd_mirror/ImageReplayer.h               | 227 +++---
 src/tools/rbd_mirror/ImageSync.h                   |  12 +-
 src/tools/rbd_mirror/Mirror.cc                     | 137 +++-
 src/tools/rbd_mirror/Mirror.h                      |   6 +
 src/tools/rbd_mirror/PoolWatcher.cc                |  37 +-
 src/tools/rbd_mirror/PoolWatcher.h                 |  24 +-
 src/tools/rbd_mirror/Replayer.cc                   | 214 +++++-
 src/tools/rbd_mirror/Replayer.h                    |  18 +-
 .../rbd_mirror/image_replayer/BootstrapRequest.cc  | 469 ++++++++++--
 .../rbd_mirror/image_replayer/BootstrapRequest.h   | 102 ++-
 .../rbd_mirror/image_replayer/CloseImageRequest.cc |  12 +-
 .../rbd_mirror/image_replayer/CloseImageRequest.h  |  10 +-
 .../image_replayer/OpenLocalImageRequest.cc        | 100 ++-
 .../image_replayer/OpenLocalImageRequest.h         |  14 +-
 .../rbd_mirror/image_sync/ImageCopyRequest.cc      |  31 +-
 src/tools/rbd_mirror/image_sync/ImageCopyRequest.h |   3 +-
 .../rbd_mirror/image_sync/ObjectCopyRequest.cc     |   6 +-
 .../rbd_mirror/image_sync/SnapshotCopyRequest.cc   |   9 +-
 .../rbd_mirror/image_sync/SnapshotCopyRequest.h    |   2 +-
 .../image_sync/SyncPointCreateRequest.cc           |   7 +-
 .../rbd_mirror/image_sync/SyncPointCreateRequest.h |   2 +-
 .../rbd_mirror/image_sync/SyncPointPruneRequest.cc |  10 +-
 .../rbd_mirror/image_sync/SyncPointPruneRequest.h  |   2 +-
 src/vstart.sh                                      | 100 +--
 systemd/rbdmap.service                             |   2 +
 290 files changed, 9251 insertions(+), 3035 deletions(-)

diff --git a/AUTHORS b/AUTHORS
index 192b334..f0a01f6 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -145,6 +145,7 @@ Eleanor Cawthon <eleanor.cawthon at inktank.com>
 Emile Snyder <emsnyder at ebay.com>
 Emily Popper <emily.popper at dreamhost.com>
 Eric Cook <llua at gmx.com>
+Eric Lee <eric.lee at hgst.com>
 Eric Mourgaya <eric.mourgaya at arkea.com>
 Erik Logtenberg <erik at logtenberg.eu>
 Erwan Velu <erwan at redhat.com>
@@ -207,6 +208,7 @@ Ilja Slepnev <islepnev at gmail.com>
 Ilya Dryomov <idryomov at redhat.com>
 Ilya Dryomov <ilya.dryomov at inktank.com>
 Ilya Shipitsin <ilia at localhost.localdomain>
+Ira Cooper <ira at redhat.com>
 Ira Cooper <ira at samba.org>
 Ismael Serrano <ismael.serrano at gmail.com>
 Ivan Grcic <igrcic at gmail.com>
@@ -226,7 +228,7 @@ Jean-Rémi Deveaux <jeanremi.deveaux at gmail.com>
 Jeff Epstein <jepst79 at gmail.com>
 Jeffrey Lu <lzhng2000 at aliyun.com>
 Jeff Weber <jweber at cofront.net>
-Jenkins Build Slave User <jenkins-build at trusty-small-unique--e64d6d03-305d-46bd-9a2c-9b546e06937e.localdomain>
+Jenkins Build Slave User <jenkins-build at trusty-small-unique--5c6e9c4e-81af-43d3-957d-c650c692c441.localdomain>
 Jenkins <jenkins at ceph.com>
 Jens-Christian Fischer <jens-christian.fischer at switch.ch>
 Jeremy Qian <vanpire110 at 163.com>
@@ -302,7 +304,6 @@ Li Peng <lip at dtdream.com>
 Li Tianqing <tianqing at unitedstack.com>
 Liu Peiyan <liu.peiyang at h3c.com>
 Li Wang <li.wang at kylin-cloud.com>
-liyankun <liyankun at unitedstack.com>
 Lluis Pamies-Juarez <lluis.pamies-juarez at hgst.com>
 Loic Dachary <ldachary at redhat.com>
 Loic Dachary <loic-201408 at dachary.org>
@@ -359,6 +360,7 @@ Nicolas Yong <nicolas.yong93 at gmail.com>
 Nikola Kotur <kotnick at gmail.com>
 Nilamdyuti Goswami <ngoswami at redhat.com>
 Ning Yao <yaoning at ruijie.com.cn>
+Ning Yao <yaoning at unitedstack.com>
 Nishtha Rai <nishtha3rai at gmail.com>
 Noah Watkins <nwatkins at redhat.com>
 (no author) <(no author)@29311d96-e01e-0410-9327-a35deaab8ce9>
@@ -383,6 +385,7 @@ Pierre Chaumont <pierre.chaumont31 at gmail.com>
 Pierre Rognant <prognant at oodrive.com>
 Piotr Dałek <piotr.dalek at ts.fujitsu.com>
 Qiankun Zheng <zheng.qiankun at h3c.com>
+Qinghua Jin <qhjin_dev at 163.com>
 Rachana Patel <rachana83.patel at gmail.com>
 Radoslaw Zarzynski <rzarzynski at mirantis.com>
 Rahul Aggarwal <rahul.1aggarwal at gmail.com>
@@ -395,10 +398,10 @@ Ren Huanwen <ren.huanwen at zte.com.cn>
 Ricardo Dias <rdias at suse.com>
 riccardo80 <riccardo80 at 29311d96-e01e-0410-9327-a35deaab8ce9>
 Riccardo Ferretti <rferrett at soe.ucsc.edu>
+Richard W.M. Jones <rjones at redhat.com>
 Roald J. van Loon <roald at roaldvanloon.nl>
 Robert Jansen <r.jansen at fairbanks.nl>
 Robert LeBlanc <robert.leblanc at endurance.com>
-Robert LeBlanc <robert at leblancnet.us>
 Robin Dehu <robindehu at gmail.com>
 Robin H. Johnson <robbat2 at gentoo.org>
 Robin H. Johnson <robin.johnson at dreamhost.com>
@@ -414,7 +417,7 @@ Ross Turk <rturk at redhat.com>
 Ruben Kerkhof <ruben at rubenkerkhof.com>
 Ruifeng Yang <yangruifeng.09209 at h3c.com>
 runsisi <runsisi at hust.edu.cn>
-RustShen <rustinpeace at 163.com>
+Rust Shen <rustinpeace at 163.com>
 Rutger ter Borg <rutger at terborg.net>
 Sage Weil <sage at inktank.com>
 Sage Weil <sweil at redhat.com>
@@ -427,6 +430,7 @@ Sandon Van Ness <sandon at inktank.com>
 Sandon Van Ness <svanness at redhat.com>
 Sangdi Xu <xu.sangdi at h3c.com>
 Sarthak Munshi <sarthakmunshi at gmail.com>
+scienceluo <luo.kexue at zte.com.cn>
 Scott A. Brandt <scott at cs.ucsc.edu>
 Scott Devoid <devoid at anl.gov>
 Sean Channel <pentabular at gmail.com>
@@ -443,13 +447,13 @@ Shun Song <song.shun3 at zte.com.cn>
 Shu, Xinxin <xinxin.shu at intel.com>
 Shylesh Kumar <shmohan at redhat.com>
 Siddharth Sharma <siddharth at redhat.com>
-Signed-off-by: Eric Lee <eric.lee at hgst.com>
 Simone Gotti <simone.gotti at gmail.com>
 Simon Leinen <simon.leinen at switch.ch>
 Somnath Roy <somnath.roy at sandisk.com>
 Sondra.Menthers <sondra.menthers at dreamhost.com>
 Song Baisen <song.baisen at zte.com.cn>
 Stanislav Sedov <stas at FreeBSD.org>
+Star Guo <star.guo at mevoco.com>
 Stefan Eilemann <Stefan.Eilemann at epfl.ch>
 Stephan Renatus <s.renatus at x-ion.de>
 Stephen F Taylor <steveftaylor at gmail.com>
@@ -460,6 +464,7 @@ Steve Stock <steve at technolope.org>
 Stratos Psomadakis <psomas at grnet.gr>
 Stuart Longland <stuartl at vrt.com.au>
 Subramanyam Varanasi <s.varanasi at ssi.samsung.com>
+sunspot <sunspot0105 at gmail.com>
 Sushma Gurram <sushma.gurram at sandisk.com>
 Swami Reddy <swami.reddy at ril.com>
 Sylvain Baubeau <sbaubeau at redhat.com>
@@ -498,6 +503,7 @@ Venky Shankar <vshankar at redhat.com>
 Vicente Cheng <freeze.bilsted at gmail.com>
 Vikhyat Umrao <vumrao at redhat.com>
 Viktor Suprun <popsul1993 at gmail.com>
+Vitja Makarov <vitja.makarov at gmail.com>
 Volker Assmann <volker at twisted-nerve.de>
 VRan Liu <gliuwr at gmail.com>
 Vu Pham <vu at mellanox.com>
@@ -521,6 +527,7 @@ Xan Peng <xanpeng at gmail.com>
 Xavier Roche <roche+git at exalead.com>
 Xiangwei Wu <wuxiangwei at h3c.com>
 Xiaowei Chen <chen.xiaowei at h3c.com>
+Xiaoxi Chen <xiaoxchen at ebay.com>
 Xiaoxi Chen <xiaoxi.chen at intel.com>
 Xie Rui <875016668 at qq.com>
 Xie Xingguo <xie.xingguo at zte.com.cn>
diff --git a/ChangeLog b/ChangeLog
index 1ed7bd1..4c75c68 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,41 +1,371 @@
-96ae8bd (HEAD, tag: v10.1.0, origin/jewel) 10.1.0
+ce50389 (HEAD, tag: v10.1.1, origin/jewel) 10.1.1
+02ab8a2 mrun: update path to cmake binaries
+d248128 config: fix setuser_match_path typo
+d5ec33f tests: Removing one ceph-dencoder call in check-generated.sh
+4af1aa6 tests: Fixing python statement in ceph_objectstore_tool.py
+c5fa83f tests: Avoiding a fixed 10sec sleep in test_mon_cephdf_commands()
+62bdde2 tests: Optmizing sleep sequence in cephtool/test.sh
+8a49a86 tests: Moving sleep call after action in ceph_watch_wait()
+1b7991e tests: Reducing sleep loops in ceph_objectstore_tool
+0d254d8 tests: Reducing sleep time for osd to be up
+0eea243 tests: Optimizing kill_daemons() sleep time
+0dccb6c tests: Making "objectstore" calls parallel in osd-scrub-repair.sh
+84197f1 tests: Optimizing wait_for_clean()
+b3f7392 tests: Reducing commands in get_num_active_clean()
+d8f07c3 tests: Killing daemons in parallel
+0ac3ac7 tests: Adding parallelism to check-generated.sh
+d66c852 tests: Adding parallelism for sequential ceph-dencoder calls
+8b6be11 tests: Adding parallelism to encoding/readable.sh
+db31cc6 tests: Adding parallelism helpers in ceph-helpers.sh
+93ace63 cmake: fix the build of test_rados_api_list
+b7a5f8b test: TestMirroringWatcher test cases were not closing images
+8231208 global/global_init: expand metavariables in setuser_match_path
+dd167cf crush: fix error log
+f47e06b tests: Fixing broken test/cephtool-test-mon.sh test
+9565a50 set 128MB tcmalloc cache size by bytes
+ff9843b Striper: reduce assemble_result log level
+f812199 qa/workunits/rbd: qemu tests need to wait for image to be created
+f713766 ceph_test_rados_api_tier: parse env
+769c0af ceph_test_rados_api_list: parse env
+b9b07c1 osd/ReplicatedPG: tolerate pgls commands with full hash
+5c612e8 osd/ReplicatedPG: discard pgnls op that is outside pg bounds
+f16187f osdc/Objecter: use full hash value for pg[n]ls ops
+9196a75 osd/ReplicatedPG: fix typo
+a92fa83 osdmap: rm nonused variable
+4eb8f77 CMake: For CMake version <= 2.8.11, use LINK_PRIVATE
+c432691 os/ObjectStore: add noexcept to ensure move ctor is used
+1c2831a common/Cycles: Do not initialize Cycles globally.
+ec79b64 unittest_erasure_code_plugin: fix deadlock caused by locked mutex in cancelled thread
+aedc529 test: Fix test to run with btrfs which has snap_### dirs
+3dd5249 librbd: avoid throwing error if mirroring is unsupported
+280b8a1 rgw: add exclusive flag to set_as_default()
+7567b45 rgw: add exclusive flag to RGWRealm::create_control
+27e4c46 rgw_admin: improve the orphans find help
+c4efef5 rgw: add a few missing cmdline switches in help
+09b5356 cls_journal: fix -EEXIST checking
+2c0f03a rgw_admin: remove unused parent_period arg
+a29b96a debian/rules: put init-ceph in /etc/init.d/ceph, not ceph-base
+602425a configure: Add -D_LARGEFILE64_SOURCE to Linux build.
+639f158 mon: remove unnecessary comment for update_from_paxos
+f5ef4d4 cmake: add missing librbd/MirrorWatcher.cc and librd/ObjectWatcher.cc
+919ff4f releases: what is merged where and when ?
+9f60e07 releases: update understanding the release cycle
+c1c71ec release-notes: v10.0.5 release notes
+535eb0e rgw: fix compiling error
+4e1e81a qa/workunits/rbd: use cluster1/2 instead local/remote as cluster names
+2226019 qa/workunits/rbd: add helper to determine rbd-mirror local cluster
+d034539 qa: update rest test cephfs calls (part 2)
+70b1303 qa/workunits/rbd: improvements for manual testing
+81a55d3 bash_completion: now bash completion uses improved "ceph --completion" Signed-off-by: Adam Kupczyk <a.kupczyk at mirantis.com>
+a891d94 tools: Auto complete feature for CLI.
+a58b774 rgw: Multipart ListPartsResult ETag quotes
+f6be50d os/filestore: fix a -Wunused-label warning in compiling.
+c97ce4f test/objectstore: fix a -Wsign-compare warning in compiling.
+565ab25 test/system: fix a -Wsign-compare warning in compiling.
+1f8f43d cls_rbd: fix -EEXIST checking in cls::rbd::image_set
+77391b3 librados_test_stub: add new cls_cxx_read2/write2 methods
+231edd7 cls_rbd: pass WILLNEED fadvise flag for object map update ops
+0510301 objclass: add cls_cxx_read2/cls_cxx_write2
+7e58045 OSD::handle_pg_create: check same_primary_since
+57db617 librbd: integrate listener for new mirroring notification payloads
+e95a383 librbd: send notifications on mirroring updates
+3748b88 librbd: helper methods for mirroring notifications
+934ce86 librbd: mirroring object notification types
+3145109 librbd: generic object watcher and mirroring watcher
+bb07a1b rbd: rbd-mirroring: Automatically disable image mirroring when image is removed
+f254486 os/bluestore: _do_write: fix _do_zero_tail_extent to handle shared extents
+2ed445e os/bluestore: _do_zero: simply truncate up if past eof
+dcc5cea os/bluestore: prevent rename src from getting trimmed from lru
+d9fc4aa os/bluestore: _do_zero: handle unzeroed tail blocks
+65e9252 os/bluestore: _do_write_zero: preserve onode size
+903365b os/bluestore: _do_write: handle gap from eof to offset in wal path
+874aad3 os/bluestore: _do_allocate: fix whitespace
+52314ca os/bluestore: _do_truncate: no need to zero trailing block on truncate down
+00a71a0 os/bluestore: _do_write: only use append case for aligned eof
+9aaf1b1 os/bluestore: _do_zero_tail_extent: do not assume tail block is zeroed
+5abd9ff os/bluestore: _do_zero_tail_extent helper
+a810951 os/bluestore: break _do_zero out of _zero
+4ea630a os/bluestore: do not do gift beyond the max ratio
+fcbf0e5 global/signal_handler: fix warning
+8b45b0d os/bluestore: release wal_cleaning extents in order
+5fe8e94 os/bluestore: avoid temp released interval_map for wal cleanup
+1bc7edb os/bluestore/FreelistManager: detect overlapping extents on init
+91faa7e os/bluestore: trim onode_map when adding new items
+99f8192 os/bluestore: keep onode_map max_size
+3084be5 os/bluestore: wait for wal op that wrote cached tail before using
+5cf6669 os/bluestore: note txc seq for tail cache
+e98d0b2 os/bluestore: assign per-Sequencer seq to each TransContext
+08bb1ca ceph_test_objectstore: add AppendWalVsTailCache test
+a58ffab os/bluestore: fix fsck vs enodes
+87be6b0 os/bluestore: bluestore_inject_wal_apply_delay
+360d956 os/bluestore: reorder onode_map vs enode_set ctor/dtor
+4a9c5d7 os/bluestore: only cache tail in _pad_tail if we have the whole block
+5a2b36f os/bluestore: only use tail if tail_offset matches
+95a5f56 os/bluestore: fix off-by-one on caching tail block
+c1b42e9 os/bluestore/KernelDevice: print buffered flag in debug line
+8dc2c21 ceph-detect-init/run-tox.sh: FreeBSD: No init detect
+3995caf cls::rbd: read_peers: update last_read on next cls_cxx_map_get_vals
+557955a qa: update rest test cephfs calls
+c298959 osd: skip heartbeat_check for osd which we haven't sent ping to
+5c5c8df osd: initialize last_pg_stats_ack correctly on first pg_stats message sending
+9cf5302 osd: update osd_stat_updated field if we force to report to monitor
+aacadad cmake: add StandardPolicy.cc to librbd
+383d48b rbd-mirror: fix missing increment of iterators
+c7a0223 test/pybind/test_rados: force setting readonly cache mode
+23336a3 vstart: fix up cmake paths when VSTART_DEST is given
+e3dc7c7 os/ObjectStore: fix _update_op for split dest_cid
+ded7a77 qa/workunits/rbd: add basic failover/fallback test case
+9665b5b rbd-mirror: use 32bit image creation bid to avoid overflow
+4c2747f rbd-mirror: initial support for primary/non-primary handling
+a6eaa83 librbd: accessor for journal tag data
+3f15853 librbd: link primary mirror uuid to non-primary journal tag
+882bc06 rbd-mirror: update mirror peer sync state after completion
+668b41d librbd: track mirror peer replay state
+276e2eb rbd-mirror: allocate local journal tags from mirror remote peer tags
+bdee02c journal: helper method for retrieving tag struct
+4adc57e rbd: use consistent "non-primary" verbiage for image mirroring
+0489296 librbd: image demotion should record new demote journal event
+970a173 librbd: blacklist journal error while attempting to acquire lock
+e1c61f5 librbd: new demote journal event
+f42e761 rbd-mirror: framework for replay allocating tags in local journal
+5de360d librbd: journal reset should re-register the local client
+d624e65 rbd-mirror: cleanup debug log messages
+476da10 rbd-mirror: cross-reference local image via global image id
+909eb43 cls_rbd: mirroring directory reverse lookup from global id to image id
+347349c rbd-mirror: bootstrap requires the global image id
+6541e35 rbd-mirror: fix state output stream formatter
+7a3ffc9 librbd: support creating images with non-primary journals
+f01efc8 rbd-mirror: don't acquire lock for primary local images
+511039e rbd-mirror: tweaks to support creating mock test cases
+8e8deb3 rbd-mirror: bootstrap now has initial support for existing images
+d9ee1e7 rbd-mirror: properly initialize image copy start/end objects
+18e849f rbd-mirror: only delete ImageCtx after open failure
+cb72ac1 cls_journal: disallow client registration against missing journal
+0b0b224 journal: added helper to query single client record from disk
+418b474 rbd-mirror: convert ImageReplayer into templated class
+7d10eb0 librbd: helper methods to allocate tags associated to remote journals
+0f99d88 rbd-mirror: initial integration with librbd lock/journal policies
+a9549d6 librbd: basic policy for journal handling
+38cf63d librbd: optional 'force' parameter for request lock payload
+2afb2c3 librbd: basic policy for controlling the release of the exclusive lock
+dae789d librbd: journal type traits should be standalone
+e3ad07b ceph.in: fix python libpath for automake as well
+a21cc54 test/test_pool_create.sh: fix port
+a041e5c ceph.in: correct dev python path for automake builds
+9d6de84 store_test: apply an appended transaction half the time
+5bf281e Transaction: handle OP_TRY_RENAME in dump
+9634cec store_test: add a TryMoveRename test
+eb9e80c ObjectStore::Transaction::_update_op: handle OP_TRY_RENAME
+f3ebe46 rbd: rbd-mirroring: Updated rbd mirroring unit tests to reflect the new conditions to enable/disable image mirroring
+872b1b1 rbd: rbd-mirroring: Disable image mirroring depends on pool mirroring mode
+bc254c8 rbd: rbd-mirroring: Enabling image mirroring depends on pool mirroring mode
+668c8f9 script: subscription-manager support (part 3)
+97b74bd osd/ClassHandler: only dlclose() the classes not missing
+349c81f ceph_test_rados_api_pool: fix command for readonly cache-mode
+ad2e6f4 ceph.in: update for cmake path changes
+5da6ae8 vstart: update for cmake build path changes
+5ca6e31 osd: fix dump type match of pg_stat_t, add more fields
+f02f1d1 cmake: Cython modules building with cmake
+e29bac6 osd: make rs_RepRecovering_latency fully lowercased
+ab0844c osd: drop l_osd_hb_from perf counter
+9570f54 osd: fix wrong dump format for bench command
+44bbdb6 mailmap: Adam C. Emerson name normalization
+4a6f8b3 mailmap: Jenkins name normalization
+24d6c9a mailmap: Yan Jun affiliation
+64eaea2 mailmap: Zhao Junwang affiliation
+69d9a4b mailmap: Sarthak Munshi affiliation
+c502ee1 mailmap: Rust Shen affiliation
+3205e0c mailmap: Eric Cook affiliation
+8445255 mailmap: Ricardo Dias affiliation
+5bbfa8a mailmap: Karol Mroz affiliation
+83cd403 mailmap: James Liu affiliation
+4e091e7 mailmap: Robert LeBlanc name normalization
+a2e6aa6 mailmap: Yankun Li name normalization
+79aca70 mailmap: Josh Durgin name normalization
+342ca5c osd: improve dump_missing command a little
+c61ae3e rbd: rbd-mirroring: Added unit tests to test enable/disable image mirroring when pool-level mirroring changes
+490dc4f rbd: rbd-mirroring: Disable image mirroring when pool-level mirroring is disabled
+5624323 rbd: rbd-mirroring: Enable image mirroring when pool-level mirroring is enabled
+322bc65 osd: fix failure report handling during ms_handle_connect()
+e5a4d21 doc/release-notes: fix indents
+92d1857 doc: fix typo, duplicated content etc. for Jewel release notes
+1c9332b pybind/Makefile.am: Prevent race creating CYTHON_BUILD_DIR
+359d832 qa/workunits/cephtool/test.sh: fix cache mode tests
+90fe8e3 mon/OSDMonitor: require force flag to use obscure cache modes
+d7da688 osd: add 'proxy' cache mode
+f22676b mon: remove unused variable
+8e78ed3 pybinding: python3 fixes
+13bd851 rgw_file tests: allow override of owner_uid and owner_gid
+5b44a34 mon/MonClient: fix shutdown race
+a5b4460 common: fix race during optracker switches between enabled/disabled mode
+1611dfb doc: Remove Ceph Monitors do lots of fsync()      and change the ligature of "fl" to "f" and "l"
+b3930c5 Fixes headline different font size and type
+00cb296 mon: remove remove_legacy_versions()
+81792b3 rgw_file: set owner uid, gid, and Unix mode on new objects
+fa45089 Makefile-env.am: set a default for CEPH_BUILD_VIRTUALENV (part 2)
+239f164 test/system/*: use dynamically generated pool name
+0ab4813 script: subscription-manager support (part 2)
+1cbe2bd ceph_test_rados_api_misc: debug LibRadosMiscConnectFailure.ConnectFailure
+bdca28d messages/MOSDOp: clear reqid inc for v6 encoding
+038d1b6 osd: duplicated clear for peer_missing peering_missing is also cleared in clear_primary_state
+6577005 doc: amend Fixes instructions in SubmittingPatches
+3b551e6 cmake: conditionally install man pages
+1287846 autotools: add rbdmap to dist_man_MANS
+9f47a5b xxHash: fix submodule commit
+294cab1 osd/pg: publish stats to osd if pg is currently marked as down
+c0ceb8b osd/pg: publish stats to osd if we recovered from degraded
+c6a4f34 osd/pg: avoid publishing pg_stat twice in a short interval
+be3ebd8 osd/pg: fold sanity check into one single "if" qualification
+81d6929 osd/pg: drop unnecessary update_heartbeat_peers()
+71c41be test/system/rados_list_parallel: print oid if rados_write fails
+0e1fb2a doc/release-notes: rgw typo
+a4e63b5 doc: Updated CloudStack RBD documentation
+5bf340e Update SSL support a bit.
+8e87ce7 cmake: add FindOpenSSL.cmake
+3e8bb4b doc/rados/operations/crush: fix the formatting
+c3fcd83 rgw: fetch_remote_obj() fix handling of ERR_NOT_MODIFIED
+103f4b8 rgw: parse mtime only when header exists and not error
+7fe2657 rgw: fix lockdep false positive
+206cf5a cls_rbd: mirror_image_list should return global image id
+ccdb0d1 rgw:Use count fn in RGWUserBuckets for quota check
+96ae8bd (tag: v10.1.0) 10.1.0
+ec6f108 os/bluestore: ceph-bluefs-tool fixes
+525a42b os/bluestore/BlueStore.cc: when do omapclear, it should clear omap_head.
+53cd642 osd/ReplicatedPG: when do omapclear, it clear FLAG_OMAP  at the same time.
+aeb8141 osd/ReplicatedPG: When do omapclear, it should check object whether is omap.
+98caba1 osd/ReplicatedPG: For omap read ops, it should check object wether has omap
+e29257e osd/pg: set dirty_info if we succeeding in updating log from master
+15366b7 osd/pg: set dirty_info if proc_master_log() dirties info
+81fa2ca vstart: make -k with optional mon_num.
+f45094f osd/pg: set dirty_info after we dirty history
 5b3da26 Makefile-env.am: set a default for CEPH_BUILD_VIRTUALENV
 fa05d80 debian/control: try installing virtualenv first, if it exists
+4f74856 rbd: rbd-mirroring: Replayer registers in remote journal with mirror_uuid
+9872dcc rbd: allow librados to prune the command-line for config overrides
 eb583cb build/ops: in jessie virtualenv is in package virtualenv
+cbc9d42 test/rgw: use bucket for data checkpoint instead of data
+d615eaf rgw: data sync, update and flush high marker
+9bd7bd8 rgw: update data log even if canceling index operation
+8cb1478 rgw: break out of run_sync() if going down
+09d3cc7 rgw: fix bucket_instance_meta_info mtime json decoding
+f1743db xio: fix compilation against latest accelio
+9c616d2 mon/PGMonitor: do not send pg_creates for split PGs
+369d54f mon/PGMonitor: minor optimization in send_pg_creates
+c6334d4 rgw: convert plain object to versioned (with null version) when removing
+087de86 radosgw-admin: add explanation for EACCESS on 'realm pull'
+d951ee5 rgw: 'period commit' supplies user-readable error messages
+292d896 radosgw-admin: parse error messages from failed requests
+d01d983 packaging: rbdmap manpage
+c290946 RPM: prefer UID/GID 167 when creating ceph user/group
+d19b91b Objecter: dout log after assign tid
 c540835 python: avoid long paths (part 2)
+418d9be doc: rgw_region_root_pool option should be in [global]      add new option rgw_zonegroup_root_pool from Jewel.
+d2e281d rgw: S3: set EncodingType in ListBucketResult
+c804416 osd/OSD.cc: finish full_map_request every MOSDMap message.
+f2a752d osd/pg: reset handle during add_batch_sources_info()
+bb7ebe9 osd/pg: add an option to cap loop before we reset tp handle
+ac17e66 osd: fix log info
+757f8a0 osd/pg: kill _report_snap_collection_errors() method
+27b5d78 osd/pg: fix dirtying info without correctly setting dirty_info field
+03c5a93 osd: fix rare race for pg relevant events
+6ed9c3f osd: drop duplicated put of message
+88aa26d osd: drop unnecessary transaction cleanup
+d9949e2 Revert "osd: drop create events based on same_primary_since, not interval"
+2d3fe67 Adding documentation on how to use new dynamic throttle scheme
 93c790d python: avoid long paths
+c335855 test/librados/tier.cc: Fix Whiteout tests to force promotion
+926f8ed qa/workunits/rados/test.sh: check subprocess return values
+811af31 test/test-erasure-code.sh: disable pg temp priming
+d296609 msg/async: avoid log spam on throttle
+ad0ad2d mds: enable standby_for_fscid without rank
+93d6c86 mds: add mds_standby_for_fscid config
+bcf4953 messages: add MMDSBeacon::standby_for_fscid
+b9a1431 mds: s/standby_for_ns/standby_for_fscid/
+d4503e9 vstart: support creating multiple cephfs filesystems
+ccef489 mon: s/mdsmap/fsmap/ in "ceph status"
+bb7a286 mds: plain text prints for FSMap/Filesystem
+c6623c0 mon: config setting to skip FSMap::sanity
+99dd53d mds: pass features through to MDSMap::encode
+4fcb26d mds: fix mds_info_t::dump
+401c186 mds: fix whitespace in is_cluster_available
+94fd423 mds/FSMap: pass by ref in update_compat
+d4f2807 tools: update naming s/handle_mds_map/handle_fs_map/
+980f684 mon: name cleanup s/mdsmap_bl/fsmap_bl/
+122e0d3 messages: support features in MFSMap
+55f4ade release-notes: draft v10.1.0 release notes (manual edits)
+fb4e5cc rgw: Do not send a Content-Length header on a 304 response
+7051335 mailmap: Eric Lee affiliation
+a8428a9 release-notes: draft v10.1.0 release notes
+471fa29 rgw: Do not send a Content-Type on a '304 Not Modified' response
+918c12c monclient: avoid key renew storm on clock skew
+14dc847 cmake: fix mrun to handle cmake build structure
+d51f564 doc: fix wrong type of hyphen
+ccc3955 doc/release-notes: known issues with 10.1.0
 c9245e7 rgw: fix error message for zone delete
+a67f0cf rbd-mirror: asok commands to get status and flush on Mirror and Replayer level
+d66e8f6 rbd-mirror: async flush for ImageReplayer
 88d35e9 rgw_ldap:  move update of s->perm_mask
 6ffa9c7 rgw_ldap:  two bug fixes
 acea6ef rgw: fix zone delete message
 127c26f cls_rbd: throw error if set_features mask included non-mutable feature
+b50caa4 global/signale_handler: print thread name in signal handle.
 ba1dd35 mon: fix mixed-version MDSMonitor
+a6adb88 mon/PGMonitor: do not clobber pg_stat update with map_pg_creates
+0effb9e qa: test_rbdmap_RBDMAPFILE.sh workunit
+57c5754 doc/dev: add "Deploy a cluster for manual testing" section
+098fea2 rgw: add zone delete to rgw-admin help
+3a9a60b xio: xio_init needs to be called before any other xio function
 c4364b1 packaging: align radosgw package description
+7da141d rbdmap: manpage
 bbac766 rbd-mirror: fix long termination due to 30sec wait in Mirror::run loop
+c3adc90 rbdmap: drop init- prefix from logger tag
+27bbf3e rbdmap: default RBDMAPFILE to reasonable value
+a7a3658 systemd: set up environment in rbdmap unit file
 9f6ec70 qa: add workunit to run ceph_test_rbd_mirror
 9722dee journal: prevent race injecting new records into overflowed object
+f6408ec rgw_admin: new command to get bilog status
+9a6bf6c config_opts: disable filestore throttle soft backoff by default
 f5375dc packaging: added rbd-mirror startup scripts
 58d4734 upstart: new rbd-mirror daemon scripts
 8a0e472 systemd: new ceph-rbd-mirror scripts
+f0143bb OSD: bail out of _committed_osd_maps if we are shutting down
 ebbfdc7 test_pool_create.sh: put test files in the test dir so they are cleaned up
+2b3f01e rgw: don't record fetch remote obj in new multisite
+55c90fc rgw/rgw_admin:fix bug about list and stats command result of bucket-list and bucket-stats are incorrect when The first character of bucket name is underline
 d9017fb qa/workunits/rbd: new online maintenance op tests
+3e61f96 doc/release-notes: v10.1.0 draft jewel notes
+0fd674b osd/OSD: fix build_past_intervals_parallel
+2da5054 Revert "osd: build_past_intervals_parallel() add diagnostics before assert"
 fab2144 cls/rgw: fix use of timespan
+65858fe osd/PG: indicate in pg query output whether ignore_history_les would help
+5a429e0 doc/release-notes: 9.2.1 notes
 959ae39 ceph_test_rados_misc: shorten mount timeout
 8b9ed00 os/filestore: fix warning
 811b8f5 qa/workunits/rest/test.py: don't use newfs
+67d9e01 osd: print debug line when we hit max markdown count
+b67ac8c mon/PGMonitor: remove unnecessary check_down_pgs call
+ad25029 mon/PGMonitor: check_down_pgs against the consumed OSDMap
+4a66652 mon/PGMonitor: pass const OSDMap* to _try_mark_pg_stale
+39317e8 doc: very basic doc on mstart
 a619468 rbd: rbd-mirroring: Added unit tests to test image mirroring behaviour
 6ebb4f3 rbd: rbd-mirroring: Fixed if clause in mirror_image_get_info
 c472493 rbd: rbd-mirroring: Disable image mirroring when journaling feature is disabled
 ba6c0c9 rbd: rbd-mirroring: Enable image mirroring when journaling feature is enabled
 7385d4a rbd: rbd-mirroring: Enable image mirroring upon create with journaling feature
+865eb89 mds: forbid fragmenting dirfrag while scrub in progress
+2b01f49 rgw: don't return error if can't find data sync status
+f2fd396 rgw: log message cleanup
 425546a rgw: use current period id when it is empty for mdlog list and shard lock/unlock
 4e3cf59 osd: assert that we don't try to use an OSDMap > superblock.newest_map
 b839a06 osd: commit osdmaps before exposing them to PGs
 30e0f92 os/bluestore/BlueFS: Before reap ioct, it should wait io complete.
 f8cca62 mon: ignore msg without session attached
 b0d9b0d mon: remove 'mds setmap' command
+cd14bcc tools/cephfs/DataScan.cc: fake non-empty dirstat for injected directory
 13c7ba4 test: rbd-mirror: different log and asok location for local/remote contexts
+9bdf337 mds: avoid scrubbing (CDir*)NULL
 21e127b rbd-mirror: make remote context respect env and argv config params
+1b0e639 script: subscription-manager support
 c399d31 doc/architecture.rst: remove redundant word "across"
 3800e2b rbd-mirror: command line options should override environment
 06d22ef os/bluestore: rename need_size to want_size in StupidAllocator.
@@ -75,10 +405,22 @@ e81c81b rbd: rbd-mirror: PoolWatcher watches for mirroring enabled images
 50b53ea qa/workunits/rbd: rbd_mirror was extracting the incorrect image id
 f2e3988 qa/workunits/rbd: use unique logs for each rbd-mirror daemon
 3cf8952 journal: refetch active object before defaulting to new tag
+b6101e9 vstart: support creating multiple cephfs filesystems
+e9b70e4 mon: s/mdsmap/fsmap/ in "ceph status"
+dd58719 mds: plain text prints for FSMap/Filesystem
+b296629 mon: config setting to skip FSMap::sanity
+6839825 mds: pass features through to MDSMap::encode
 3895878 rgw/rgw_common.h: fix the RGWBucketInfo decoding
 ff3f0f8 ceph.spec.in: Make ceph-common require libcephfs1
 cd4751a test: rbd-mirror: add "switch to the next tag" test
+7a50b3d xio: add prefix to xio msgr logs
 f03f99d cls/rgw: fix FTBFS
+70bf219 mds: fix mds_info_t::dump
+57fa912 mds: fix whitespace in is_cluster_available
+4683ce4 mds/FSMap: pass by ref in update_compat
+7c9fd58 tools: update naming s/handle_mds_map/handle_fs_map/
+07f112e mon: name cleanup s/mdsmap_bl/fsmap_bl/
+f0253f6 messages: support features in MFSMap
 4411179 librbd: implement mirror image resync request API
 d1a6c7c librbd: track if a mirror image resync has been requested
 353f895 journal: add placeholder for marking clients are disconnected
@@ -92,6 +434,8 @@ c8aa842 librbd: expose image mirroring primary/secondary status
 e925948 rgw: Allow an implicit tenant in case of Keystone
 70ca604 rgw: data shard sync doesn't exit on error
 a028989 rgw ldap: s/memberattr/dnattr/;
+4e8e617 common/TrackedOp: Move tracking_enabled check into register_inflight_op()
+ad13e05 common/TrackedOp: Handle dump racing with constructor
 1c5e225 rgw: propagate low time precision for user operations
 809cabf rgw: use higher precision time when encoding / decoding external apis
 6bfaa7e utime: add gmtime_nsec()
@@ -120,6 +464,8 @@ bbde2f0 os/filestore: exit if we fail to remove any xattr
 82419db osd: pg: drop get_nrep() method, which is never used by anyone
 ed02ce5 osd: fix misnamed macro OSD_SUPERBLOCK_POBJECT
 917e06b doc/dev: add section on interrupting a running suite
+b5315d2 tools/cephfs/DataScan.cc: don't set directory inode's size to non-zero
+13ae262 client: add debug hook for requesting caps on open/lookup/getattr
 22fe493 qa/workunits/rados/test.sh: test tmap_migrate
 9ea6569 osd: pg: skip over update_heartbeat_peers() on non-primary pg
 3ce61eb osd: pg: drop stray_purged field, which is never used.
@@ -130,6 +476,7 @@ a835b0e osd: make os_flags an option
 4734ff9 osd: pg: drop assert_locked() method
 0755915 object: generate file_object string in a safer way
 014fe9b osd: avoid implicit return type cast
+98cf431 rgw: accept data only at the first time in response to a request
 0e1cc01 os/bluestore: only bluestore_sync_transaction = false & bluestore_sync_submit_transaction = false, it submit transaction.
 b45a7b7 os/bluestore: make fm alloca/release w/ other kv ops in the same transaction
 22bcac9 os/bluestore: fix deadlock for bluestore_sync_transaction = true.
@@ -149,6 +496,10 @@ a60f38d test: remove broken negative test cases
 e1fa689 rbd-mirror: simple image bootstrap state machine
 3b1db9b rbd-mirror: helper state machine for opening local image
 d61293d rbd-mirror: helper state machine for closing librbd images
+be098c8 common/TrackedOp: Missed locking when examining "events"
+7a5fbeb messages: Use atomic<bool> since these are used without locking
+17c645f CLEANUP: Move locking into dump_ops_in_flight()/dump_historic_ops()
+be5bed8 mds, osd: Fix missing locking for "dump_blocked_ops"
 71dc051 test/cli/crushtool: default tunables are now firefly
 f945170 test/cli/osdmaptool: default tunables are now firefly
 7b1f1e8 ceph-detect-init: add test for squeeze
@@ -182,9 +533,16 @@ f41cb16 osd: drop create events based on same_primary_since, not interval
 4ed894d osd: drop unused primary arg to handle_pg_peering_evt
 439bdbe osd: use handle_pg_peering_evt for pg creations
 26d4f2e mon/PGMonitor: reliably mark PGs state
+1d03bc1 pybind: add flock to cephfs bindings
+7043b59 pybind: fix libcephfs getxattr
+85cfa4f libcephfs: fix crash on getpwd
+f9ab9cf pybind: fix cephfs python test
+aed1872 qa: add test_python for cephfs
 cc7e232 osd/OSDMap: fix typo in summarize_mapping_stats
 aec9cd2 SubProcess: Avoid buffer corruption when calling err()
 28982ca mds: change the 'fs remove_data_pool' to 'fs rm_data_pool'
+4ad2cfb crush: Test for crash on trying to load an invalid map
+3d61ba0 [rgw] Check return code in RGWFileHandle::write
 0dbcb41 osd: min_write_recency_for_promote & min_read_recency_for_promote are only used in cache pool
 e0983cd osd: fix typo
 e68d610 osd: skip over last queue_transaction() if the transaction is empty
@@ -236,6 +594,7 @@ bdcff15 mds: fix FSMap upgrade on mixed mon versions
 e52f7b4 mds: fix FSMap upgrade with daemons in the map
 1ea1735 osd: fix wrong counter for batch objects removal during remove_dir()
 12d151f osd: initialize last_recalibrate field at construction
+f1a4490 ceph.spec.in: disable lttng and babeltrace explicitly
 996be8e qa/workunits/rbd: use POSIX function definition
 d9af48a ReplicatedPG: be more careful about calling publish_stats_to_osd() correctly
 8616763 rgw: disable swift versioning by default
@@ -297,7 +656,9 @@ e2b297a osd: introduce command of 'osd tier rm'
 b0b1dfb mon: Introduce command of 'auth rm'
 439991a osd/ScrubStore: remove unused function
 223191f common/config_opts.h: add osd_crush_initial_weight
+cecdc49 osd: cleanup: Specify both template types for create_request()
 99ec183 mds: fix stray purging in 'stripe_count > 1' case
+7663b9f crushtool: Don't crash when called on a file that isn't a crushmap
 2408f8d rgw: store system object meta in cache when creating it
 4d59b1d rgw:bucket link now set the bucket.instance acl
 b988f79 rgw_admin: policy dump --xml backward compatibility
@@ -484,6 +845,7 @@ f29091b mds: check dirfrag rstat when dirfrag is fetched
 5810eb0 tools/rados: reduce rados put memory usage by op_size
 11222c5 cmake: Remove duplicate find_package libcurl line.
 d6a48d9 cls_hello: Fix grammatical error in description comment
+f4bd1fc mds: allow client to request caps when opening file
 5b57065 doc/dev: integrate testing into the narrative
 c823018 msg: remove duplicated code - local_delivery will now call 'enqueue'
 e904670 Event: fix clock skew problem
@@ -806,6 +1168,7 @@ df25867 journal: active and minimum set should always grow
 dff62e4 rgw: use pimpl pattern for RGWPeriodHistory
 39671f1 doc: final v9.2.1 release notes
 2a80042 rbd/run_cli_tests.sh: Reflect test failures
+614597b AsyncMessenger: remove experiment feature
 ca50f42 doc: batch small fixes, including typo, syntax etc.
 dc7e027 doc: fix typo
 0cee333 mds: remove stray dentry from delayed eval list after calling eval_stray()
@@ -861,6 +1224,8 @@ cd0389d ceph-helpers.sh: Deleting forgoten btrfs subvolumes
 ba70bdb config: increase default async op threads
 557c3bf osd: always cleanup the scrub results
 b0b4021 pybind: fix the FTBFS introduced by d0af316
+23f3d30 qa/workunits: add test for setting scrub priority
+3241011 osd: add the support of per pool scrub priority
 a7f3a65 mds: avoid creating unnecessary snap dentry/inode
 798ad64 mds: set multiversion inode's first
 f7fb2cb mds: fix open snap parents tracking
@@ -2384,6 +2749,7 @@ e85ffac scripts/run-coverity: fix upload process
 83ca830 bluestore/NVMEDevice: fix fd leak
 a5714fb bluestore/NVMEDevice: fix wrong remove_device logic
 06f3837 bluestore/NVMEDevice: fix compiling error
+0746470 Mon: show the pool quota info on ceph df detail command
 caed882 os/bluestore: insert new onode to the front position of onode LRU
 7c86775 bluestore/bluefs_types: fix imcomplete output message
 8167a22 mds: function parameter 'df' should be passed by reference
@@ -4661,6 +5027,7 @@ af8d6ec osd: add support of pin/unpin objects in cache tier
 28b7205 rados: add the support of pin/unpin object in cache tier
 fa3822c pybind/cephfs: add symlink and its unit test
 19d0a59 Fix Makefile in example/librados file.
+d5b3926 PG: pg down state blocked by osd.x, lost osd.x cannot solve peering stuck.
 904c0e9 pybind: Use basestring as string type for Python 2
 ab6b923 pybind: Add Python 3 support for rados and rbd modules
 0278f5f doc/release-notes: drop 0.94.4 plaintext
diff --git a/ceph.spec b/ceph.spec
index 84c1a29..5190979 100644
--- a/ceph.spec
+++ b/ceph.spec
@@ -27,6 +27,10 @@
 %bcond_with selinux
 %endif
 
+# LTTng-UST enabled on Fedora, RHEL 6+, and SLES 12
+%if 0%{?fedora} || 0%{?rhel} >= 6 || 0%{?suse_version} == 1315
+%bcond_without lttng
+%endif
 
 %if (0%{?el5} || (0%{?rhel_version} >= 500 && 0%{?rhel_version} <= 600))
 %{!?python_sitelib: %global python_sitelib %(%{__python} -c "from distutils.sysconfig import get_python_lib; print(get_python_lib())")}
@@ -62,11 +66,6 @@ restorecon -R /var/log/radosgw > /dev/null 2>&1;
 %{!?tmpfiles_create: %global tmpfiles_create systemd-tmpfiles --create}
 %endif
 
-# LTTng-UST enabled on Fedora, RHEL 6, and SLES 12
-%if 0%{?fedora} || 0%{?rhel} == 6 || 0%{?suse_version} == 1315
-%global _with_lttng 1
-%endif
-
 # unify libexec for all targets
 %global _libexecdir %{_exec_prefix}/lib
 
@@ -75,7 +74,7 @@ restorecon -R /var/log/radosgw > /dev/null 2>&1;
 # common
 #################################################################################
 Name:		ceph
-Version:	10.1.0
+Version:	10.1.1
 Release:	0%{?dist}
 Epoch:		1
 Summary:	User space components of the Ceph file system
@@ -186,7 +185,7 @@ BuildRequires:  boost-random
 BuildRequires:	python-argparse
 %endif
 # lttng and babeltrace for rbd-replay-prep
-%if 0%{?_with_lttng}
+%if %{with lttng}
 %if 0%{?fedora} || 0%{?rhel}
 BuildRequires:	lttng-ust-devel
 BuildRequires:	libbabeltrace-devel
@@ -685,6 +684,9 @@ export RPM_OPT_FLAGS=`echo $RPM_OPT_FLAGS | sed -e 's/i386/i486/'`
                 --libexecdir=%{_libexecdir} \
 		--localstatedir=/var \
 		--sysconfdir=/etc \
+%if 0%{?rhel} && ! 0%{?centos}
+                --enable-subman \
+%endif
 %if 0%{?_with_systemd}
 		--with-systemdsystemunitdir=%_unitdir \
 %endif
@@ -702,6 +704,10 @@ export RPM_OPT_FLAGS=`echo $RPM_OPT_FLAGS | sed -e 's/i386/i486/'`
 %endif
 		--with-librocksdb-static=check \
 		--with-radosgw \
+%if %{without lttng}
+		--without-lttng \
+		--without-babeltrace \
+%endif
 		$CEPH_EXTRA_CONFIGURE_ARGS \
 		%{?_with_ocf} \
 		%{?_with_tcmalloc} \
@@ -858,7 +864,7 @@ rm -rf $RPM_BUILD_ROOT
 %{_libdir}/ceph/erasure-code/libec_*.so*
 %dir %{_libdir}/ceph/compressor
 %{_libdir}/ceph/compressor/libceph_*.so*
-%if 0%{?_with_lttng}
+%if %{with lttng}
 %{_libdir}/libos_tp.so*
 %{_libdir}/libosd_tp.so*
 %endif
@@ -977,7 +983,7 @@ rm -rf $RPM_BUILD_ROOT
 %{_bindir}/rbd-replay
 %{_bindir}/rbd-replay-many
 %{_bindir}/rbdmap
-%if 0%{?_with_lttng}
+%if %{with lttng}
 %{_bindir}/rbd-replay-prep
 %endif
 %{_bindir}/ceph-post-file
@@ -994,6 +1000,7 @@ rm -rf $RPM_BUILD_ROOT
 %{_mandir}/man8/ceph.8*
 %{_mandir}/man8/rados.8*
 %{_mandir}/man8/rbd.8*
+%{_mandir}/man8/rbdmap.8*
 %{_mandir}/man8/rbd-replay.8*
 %{_mandir}/man8/rbd-replay-many.8*
 %{_mandir}/man8/rbd-replay-prep.8*
@@ -1017,19 +1024,22 @@ rm -rf $RPM_BUILD_ROOT
 %attr(750,ceph,ceph) %dir %{_localstatedir}/lib/ceph/
 
 %pre common
-CEPH_GROUP_ID=""
-CEPH_USER_ID=""
+CEPH_GROUP_ID=167
+CEPH_USER_ID=167
 %if 0%{?rhel} || 0%{?fedora}
-CEPH_GROUP_ID="-g 167"
-CEPH_USER_ID="-u 167"
-%endif
-%if 0%{?rhel} || 0%{?fedora}
-%{_sbindir}/groupadd ceph $CEPH_GROUP_ID -o -r 2>/dev/null || :
-%{_sbindir}/useradd ceph $CEPH_USER_ID -o -r -g ceph -s /sbin/nologin -c "Ceph daemons" -d %{_localstatedir}/lib/ceph 2> /dev/null || :
+%{_sbindir}/groupadd ceph -g $CEPH_GROUP_ID -o -r 2>/dev/null || :
+%{_sbindir}/useradd ceph -u $CEPH_USER_ID -o -r -g ceph -s /sbin/nologin -c "Ceph daemons" -d %{_localstatedir}/lib/ceph 2>/dev/null || :
 %endif
 %if 0%{?suse_version}
-getent group ceph >/dev/null || groupadd -r ceph
-getent passwd ceph >/dev/null || useradd -r -g ceph -d %{_localstatedir}/lib/ceph -s /sbin/nologin -c "Ceph daemons" ceph
+if ! getent group ceph >/dev/null ; then
+    CEPH_GROUP_ID_OPTION=""
+    getent group $CEPH_GROUP_ID >/dev/null || CEPH_GROUP_ID_OPTION="-g $CEPH_GROUP_ID"
+    groupadd ceph $CEPH_GROUP_ID_OPTION -r 2>/dev/null || :
+fi
+if ! getent passwd ceph >/dev/null ; then
+    CEPH_USER_ID_OPTION=""
+    getent passwd $CEPH_USER_ID >/dev/null || CEPH_USER_ID_OPTION="-u $CEPH_USER_ID"
+    useradd ceph $CEPH_USER_ID_OPTION -r -g ceph -s /sbin/nologin -c "Ceph daemons" -d %{_localstatedir}/lib/ceph 2>/dev/null || :
 %endif
 exit 0
 
@@ -1182,6 +1192,9 @@ fi
 %{_mandir}/man8/ceph-clsinfo.8*
 %{_mandir}/man8/ceph-disk.8*
 %{_mandir}/man8/ceph-osd.8*
+%if 0%{?rhel} && ! 0%{?centos}
+/etc/cron.hourly/subman
+%endif
 %if 0%{?_with_systemd}
 %{_unitdir}/ceph-osd@.service
 %{_unitdir}/ceph-osd.target
@@ -1220,7 +1233,7 @@ fi
 %files -n librados2
 %defattr(-,root,root,-)
 %{_libdir}/librados.so.*
-%if 0%{?_with_lttng}
+%if %{with lttng}
 %{_libdir}/librados_tp.so.*
 %endif
 
@@ -1244,7 +1257,7 @@ fi
 %{_includedir}/rados/rados_types.hpp
 %{_includedir}/rados/memory.h
 %{_libdir}/librados.so
-%if 0%{?_with_lttng}
+%if %{with lttng}
 %{_libdir}/librados_tp.so
 %endif
 %{_bindir}/librados-config
@@ -1279,7 +1292,7 @@ fi
 %files -n librbd1
 %defattr(-,root,root,-)
 %{_libdir}/librbd.so.*
-%if 0%{?_with_lttng}
+%if %{with lttng}
 %{_libdir}/librbd_tp.so.*
 %endif
 
@@ -1299,7 +1312,7 @@ ln -sf %{_libdir}/librbd.so.1 /usr/lib64/qemu/librbd.so.1
 %{_includedir}/rbd/librbd.hpp
 %{_includedir}/rbd/features.h
 %{_libdir}/librbd.so
-%if 0%{?_with_lttng}
+%if %{with lttng}
 %{_libdir}/librbd_tp.so
 %endif
 
diff --git a/ceph.spec.in b/ceph.spec.in
index b52d7e2..3a5a6f7 100644
--- a/ceph.spec.in
+++ b/ceph.spec.in
@@ -27,6 +27,10 @@
 %bcond_with selinux
 %endif
 
+# LTTng-UST enabled on Fedora, RHEL 6+, and SLES 12
+%if 0%{?fedora} || 0%{?rhel} >= 6 || 0%{?suse_version} == 1315
+%bcond_without lttng
+%endif
 
 %if (0%{?el5} || (0%{?rhel_version} >= 500 && 0%{?rhel_version} <= 600))
 %{!?python_sitelib: %global python_sitelib %(%{__python} -c "from distutils.sysconfig import get_python_lib; print(get_python_lib())")}
@@ -62,11 +66,6 @@ restorecon -R /var/log/radosgw > /dev/null 2>&1;
 %{!?tmpfiles_create: %global tmpfiles_create systemd-tmpfiles --create}
 %endif
 
-# LTTng-UST enabled on Fedora, RHEL 6, and SLES 12
-%if 0%{?fedora} || 0%{?rhel} == 6 || 0%{?suse_version} == 1315
-%global _with_lttng 1
-%endif
-
 # unify libexec for all targets
 %global _libexecdir %{_exec_prefix}/lib
 
@@ -186,7 +185,7 @@ BuildRequires:  boost-random
 BuildRequires:	python-argparse
 %endif
 # lttng and babeltrace for rbd-replay-prep
-%if 0%{?_with_lttng}
+%if %{with lttng}
 %if 0%{?fedora} || 0%{?rhel}
 BuildRequires:	lttng-ust-devel
 BuildRequires:	libbabeltrace-devel
@@ -685,6 +684,9 @@ export RPM_OPT_FLAGS=`echo $RPM_OPT_FLAGS | sed -e 's/i386/i486/'`
                 --libexecdir=%{_libexecdir} \
 		--localstatedir=/var \
 		--sysconfdir=/etc \
+%if 0%{?rhel} && ! 0%{?centos}
+                --enable-subman \
+%endif
 %if 0%{?_with_systemd}
 		--with-systemdsystemunitdir=%_unitdir \
 %endif
@@ -702,6 +704,10 @@ export RPM_OPT_FLAGS=`echo $RPM_OPT_FLAGS | sed -e 's/i386/i486/'`
 %endif
 		--with-librocksdb-static=check \
 		--with-radosgw \
+%if %{without lttng}
+		--without-lttng \
+		--without-babeltrace \
+%endif
 		$CEPH_EXTRA_CONFIGURE_ARGS \
 		%{?_with_ocf} \
 		%{?_with_tcmalloc} \
@@ -858,7 +864,7 @@ rm -rf $RPM_BUILD_ROOT
 %{_libdir}/ceph/erasure-code/libec_*.so*
 %dir %{_libdir}/ceph/compressor
 %{_libdir}/ceph/compressor/libceph_*.so*
-%if 0%{?_with_lttng}
+%if %{with lttng}
 %{_libdir}/libos_tp.so*
 %{_libdir}/libosd_tp.so*
 %endif
@@ -977,7 +983,7 @@ rm -rf $RPM_BUILD_ROOT
 %{_bindir}/rbd-replay
 %{_bindir}/rbd-replay-many
 %{_bindir}/rbdmap
-%if 0%{?_with_lttng}
+%if %{with lttng}
 %{_bindir}/rbd-replay-prep
 %endif
 %{_bindir}/ceph-post-file
@@ -994,6 +1000,7 @@ rm -rf $RPM_BUILD_ROOT
 %{_mandir}/man8/ceph.8*
 %{_mandir}/man8/rados.8*
 %{_mandir}/man8/rbd.8*
+%{_mandir}/man8/rbdmap.8*
 %{_mandir}/man8/rbd-replay.8*
 %{_mandir}/man8/rbd-replay-many.8*
 %{_mandir}/man8/rbd-replay-prep.8*
@@ -1017,19 +1024,22 @@ rm -rf $RPM_BUILD_ROOT
 %attr(750,ceph,ceph) %dir %{_localstatedir}/lib/ceph/
 
 %pre common
-CEPH_GROUP_ID=""
-CEPH_USER_ID=""
+CEPH_GROUP_ID=167
+CEPH_USER_ID=167
 %if 0%{?rhel} || 0%{?fedora}
-CEPH_GROUP_ID="-g 167"
-CEPH_USER_ID="-u 167"
-%endif
-%if 0%{?rhel} || 0%{?fedora}
-%{_sbindir}/groupadd ceph $CEPH_GROUP_ID -o -r 2>/dev/null || :
-%{_sbindir}/useradd ceph $CEPH_USER_ID -o -r -g ceph -s /sbin/nologin -c "Ceph daemons" -d %{_localstatedir}/lib/ceph 2> /dev/null || :
+%{_sbindir}/groupadd ceph -g $CEPH_GROUP_ID -o -r 2>/dev/null || :
+%{_sbindir}/useradd ceph -u $CEPH_USER_ID -o -r -g ceph -s /sbin/nologin -c "Ceph daemons" -d %{_localstatedir}/lib/ceph 2>/dev/null || :
 %endif
 %if 0%{?suse_version}
-getent group ceph >/dev/null || groupadd -r ceph
-getent passwd ceph >/dev/null || useradd -r -g ceph -d %{_localstatedir}/lib/ceph -s /sbin/nologin -c "Ceph daemons" ceph
+if ! getent group ceph >/dev/null ; then
+    CEPH_GROUP_ID_OPTION=""
+    getent group $CEPH_GROUP_ID >/dev/null || CEPH_GROUP_ID_OPTION="-g $CEPH_GROUP_ID"
+    groupadd ceph $CEPH_GROUP_ID_OPTION -r 2>/dev/null || :
+fi
+if ! getent passwd ceph >/dev/null ; then
+    CEPH_USER_ID_OPTION=""
+    getent passwd $CEPH_USER_ID >/dev/null || CEPH_USER_ID_OPTION="-u $CEPH_USER_ID"
+    useradd ceph $CEPH_USER_ID_OPTION -r -g ceph -s /sbin/nologin -c "Ceph daemons" -d %{_localstatedir}/lib/ceph 2>/dev/null || :
 %endif
 exit 0
 
@@ -1182,6 +1192,9 @@ fi
 %{_mandir}/man8/ceph-clsinfo.8*
 %{_mandir}/man8/ceph-disk.8*
 %{_mandir}/man8/ceph-osd.8*
+%if 0%{?rhel} && ! 0%{?centos}
+/etc/cron.hourly/subman
+%endif
 %if 0%{?_with_systemd}
 %{_unitdir}/ceph-osd@.service
 %{_unitdir}/ceph-osd.target
@@ -1220,7 +1233,7 @@ fi
 %files -n librados2
 %defattr(-,root,root,-)
 %{_libdir}/librados.so.*
-%if 0%{?_with_lttng}
+%if %{with lttng}
 %{_libdir}/librados_tp.so.*
 %endif
 
@@ -1244,7 +1257,7 @@ fi
 %{_includedir}/rados/rados_types.hpp
 %{_includedir}/rados/memory.h
 %{_libdir}/librados.so
-%if 0%{?_with_lttng}
+%if %{with lttng}
 %{_libdir}/librados_tp.so
 %endif
 %{_bindir}/librados-config
@@ -1279,7 +1292,7 @@ fi
 %files -n librbd1
 %defattr(-,root,root,-)
 %{_libdir}/librbd.so.*
-%if 0%{?_with_lttng}
+%if %{with lttng}
 %{_libdir}/librbd_tp.so.*
 %endif
 
@@ -1299,7 +1312,7 @@ ln -sf %{_libdir}/librbd.so.1 /usr/lib64/qemu/librbd.so.1
 %{_includedir}/rbd/librbd.hpp
 %{_includedir}/rbd/features.h
 %{_libdir}/librbd.so
-%if 0%{?_with_lttng}
+%if %{with lttng}
 %{_libdir}/librbd_tp.so
 %endif
 
diff --git a/configure b/configure
index 158bcb7..01072fd 100755
--- a/configure
+++ b/configure
@@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.69 for ceph 10.1.0.
+# Generated by GNU Autoconf 2.69 for ceph 10.1.1.
 #
 # Report bugs to <ceph-devel@vger.kernel.org>.
 #
@@ -590,8 +590,8 @@ MAKEFLAGS=
 # Identity of this package.
 PACKAGE_NAME='ceph'
 PACKAGE_TARNAME='ceph'
-PACKAGE_VERSION='10.1.0'
-PACKAGE_STRING='ceph 10.1.0'
+PACKAGE_VERSION='10.1.1'
+PACKAGE_STRING='ceph 10.1.1'
 PACKAGE_BUGREPORT='ceph-devel@vger.kernel.org'
 PACKAGE_URL=''
 
@@ -783,6 +783,8 @@ PYTHON_LDFLAGS
 PYTHON_CFLAGS
 PYTHON_CONFIG_CHECK
 CYTHON_CHECK
+ENABLE_SUBMAN_FALSE
+ENABLE_SUBMAN_TRUE
 ENABLE_SERVER_FALSE
 ENABLE_SERVER_TRUE
 ENABLE_CLIENT_FALSE
@@ -978,6 +980,7 @@ with_osd
 with_mds
 enable_client
 enable_server
+enable_subman
 with_cryptopp
 with_nss
 enable_gitversion
@@ -1579,7 +1582,7 @@ if test "$ac_init_help" = "long"; then
   # Omit some internal or obsolete options to make the list less imposing.
   # This message is too long to be a string in the A/UX 3.1 sh.
   cat <<_ACEOF
-\`configure' configures ceph 10.1.0 to adapt to many kinds of systems.
+\`configure' configures ceph 10.1.1 to adapt to many kinds of systems.
 
 Usage: $0 [OPTION]... [VAR=VALUE]...
 
@@ -1650,7 +1653,7 @@ fi
 
 if test -n "$ac_init_help"; then
   case $ac_init_help in
-     short | recursive ) echo "Configuration of ceph 10.1.0:";;
+     short | recursive ) echo "Configuration of ceph 10.1.1:";;
    esac
   cat <<\_ACEOF
 
@@ -1671,6 +1674,7 @@ Optional Features:
   --disable-silent-rules  verbose build output (undo: "make V=0")
   --enable-client         enable client-side build
   --enable-server         enable server-side build
+  --enable-subman         enable subman
   --enable-gitversion     build Ceph with git version string
   --enable-coverage       enable code coverage tracking
   --enable-pgrefdebugging enable pg ref debugging
@@ -1833,7 +1837,7 @@ fi
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
   cat <<\_ACEOF
-ceph configure 10.1.0
+ceph configure 10.1.1
 generated by GNU Autoconf 2.69
 
 Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2909,7 +2913,7 @@ cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.
 
-It was created by ceph $as_me 10.1.0, which was
+It was created by ceph $as_me 10.1.1, which was
 generated by GNU Autoconf 2.69.  Invocation command line was
 
   $ $0 $@
@@ -16404,7 +16408,7 @@ fi
 
 # Define the identity of the package.
  PACKAGE='ceph'
- VERSION='10.1.0'
+ VERSION='10.1.1'
 
 
 cat >>confdefs.h <<_ACEOF
@@ -16919,6 +16923,7 @@ $as_echo "#define DARWIN 1" >>confdefs.h
 	;;
 linux*)
 	linux="yes"
+	CFLAGS="-D_LARGEFILE64_SOURCE ${CFLAGS}"
 	;;
 freebsd*)
 	freebsd="yes"
@@ -17577,6 +17582,23 @@ fi
 
 #AS_IF([test "$enable_server" = "yes"], [AC_DEFINE([WITH_MON, WITH_OSD, WITH_MDS, ENABLE_SERVER])])
 
+# subscription manager?
+# Check whether --enable-subman was given.
+if test "${enable_subman+set}" = set; then :
+  enableval=$enable_subman;
+else
+  enable_subman=no
+fi
+
+ if test "x$enable_subman" != "xno"; then
+  ENABLE_SUBMAN_TRUE=
+  ENABLE_SUBMAN_FALSE='#'
+else
+  ENABLE_SUBMAN_TRUE='#'
+  ENABLE_SUBMAN_FALSE=
+fi
+
+
 # cython is required to build python bindings for libraries
 if test x"$with_cython" = xyes; then
     # Extract the first word of "cython", so it can be a program name with args.
@@ -20755,7 +20777,7 @@ else
 JAVA_TEST=Test.java
 CLASS_TEST=Test.class
 cat << \EOF > $JAVA_TEST
-/* #line 20758 "configure" */
+/* #line 20780 "configure" */
 public class Test {
 }
 EOF
@@ -25525,6 +25547,10 @@ if test -z "${ENABLE_SERVER_TRUE}" && test -z "${ENABLE_SERVER_FALSE}"; then
   as_fn_error $? "conditional \"ENABLE_SERVER\" was never defined.
 Usually this means the macro was only invoked conditionally." "$LINENO" 5
 fi
+if test -z "${ENABLE_SUBMAN_TRUE}" && test -z "${ENABLE_SUBMAN_FALSE}"; then
+  as_fn_error $? "conditional \"ENABLE_SUBMAN\" was never defined.
+Usually this means the macro was only invoked conditionally." "$LINENO" 5
+fi
 if test -z "${WITH_GOOD_YASM_ELF64_TRUE}" && test -z "${WITH_GOOD_YASM_ELF64_FALSE}"; then
   as_fn_error $? "conditional \"WITH_GOOD_YASM_ELF64\" was never defined.
 Usually this means the macro was only invoked conditionally." "$LINENO" 5
@@ -26074,7 +26100,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by ceph $as_me 10.1.0, which was
+This file was extended by ceph $as_me 10.1.1, which was
 generated by GNU Autoconf 2.69.  Invocation command line was
 
   CONFIG_FILES    = $CONFIG_FILES
@@ -26140,7 +26166,7 @@ _ACEOF
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
 ac_cs_version="\\
-ceph config.status 10.1.0
+ceph config.status 10.1.1
 configured by $0, generated by GNU Autoconf 2.69,
   with options \\"\$ac_cs_config\\"
 
diff --git a/configure.ac b/configure.ac
index 4db6d55..efd760a 100644
--- a/configure.ac
+++ b/configure.ac
@@ -8,7 +8,7 @@ AC_PREREQ(2.59)
 # VERSION define is not used by the code.  It gets a version string
 # from 'git describe'; see src/ceph_ver.[ch]
 
-AC_INIT([ceph], [10.1.0], [ceph-devel@vger.kernel.org])
+AC_INIT([ceph], [10.1.1], [ceph-devel@vger.kernel.org])
 
 AX_CXX_COMPILE_STDCXX_11(, mandatory)
 
@@ -63,6 +63,7 @@ darwin*)
 	;;
 linux*)
 	linux="yes"
+	CFLAGS="-D_LARGEFILE64_SOURCE ${CFLAGS}"
 	;;
 freebsd*)
 	freebsd="yes"
@@ -207,6 +208,13 @@ AC_ARG_ENABLE([server],
 AM_CONDITIONAL(ENABLE_SERVER, test "$enable_server" = "yes")
 #AS_IF([test "$enable_server" = "yes"], [AC_DEFINE([WITH_MON, WITH_OSD, WITH_MDS, ENABLE_SERVER])])
 
+# subscription manager?
+AC_ARG_ENABLE([subman],
+	[AS_HELP_STRING([--enable-subman], [enable subman])],
+	[],
+	[enable_subman=no])
+AM_CONDITIONAL([ENABLE_SUBMAN], test "x$enable_subman" != "xno")
+
 # cython is required to build python bindings for libraries
 if test x"$with_cython" = xyes; then
     AC_CHECK_PROG(CYTHON_CHECK, cython, yes)
diff --git a/doc/Makefile.am b/doc/Makefile.am
index 75c0ce7..c4bcfc7 100644
--- a/doc/Makefile.am
+++ b/doc/Makefile.am
@@ -33,4 +33,5 @@ EXTRA_DIST = \
 	man/8/rbd-replay-many.rst	\
 	man/8/rbd-replay-prep.rst	\
 	man/8/rbd-replay.rst	\
-	man/8/rbd.rst
+	man/8/rbd.rst           \
+	man/8/rbdmap.rst
diff --git a/doc/Makefile.in b/doc/Makefile.in
index 25b773c..c3bae77 100644
--- a/doc/Makefile.in
+++ b/doc/Makefile.in
@@ -361,7 +361,8 @@ EXTRA_DIST = \
 	man/8/rbd-replay-many.rst	\
 	man/8/rbd-replay-prep.rst	\
 	man/8/rbd-replay.rst	\
-	man/8/rbd.rst
+	man/8/rbd.rst           \
+	man/8/rbdmap.rst
 
 all: all-am
 
diff --git a/doc/man/8/rbdmap.rst b/doc/man/8/rbdmap.rst
new file mode 100644
index 0000000..145a1e2
--- /dev/null
+++ b/doc/man/8/rbdmap.rst
@@ -0,0 +1,48 @@
+:orphan:
+
+=========================================
+ rbdmap -- map RBD devices at boot time
+=========================================
+
+.. program:: rbdmap
+
+Synopsis
+========
+
+| **rbdmap map**
+| **rbdmap unmap**
+
+
+Description
+===========
+
+**rbdmap** is a shell script that can be run manually by the system
+administrator at any time, or automatically at boot time by the init system
+(sysvinit, upstart, systemd). The script looks for an environment variable
+``RBDMAPFILE``, which defaults to ``/etc/ceph/rbdmap``. This file is
+expected to contain a list of RBD images and, possibly, parameters to be
+passed to the underlying ``rbd`` command. The syntax of
+``/etc/ceph/rbdmap`` is described in the comments at the top of that file.
+
+The script mounts devices after mapping, and unmounts them before
+unmapping.
+
+
+Options
+=======
+
+None
+
+
+Availability
+============
+
+**rbdmap** is part of Ceph, a massively scalable, open-source, distributed
+storage system. Please refer to the Ceph documentation at
+http://ceph.com/docs for more information.
+
+
+See also
+========
+
+:doc:`rbd <rbd>`\(8),
diff --git a/etc/sysconfig/ceph b/etc/sysconfig/ceph
index 4068f29..61e941d 100644
--- a/etc/sysconfig/ceph
+++ b/etc/sysconfig/ceph
@@ -4,7 +4,7 @@
 #
 
 # Increase tcmalloc cache size
-TCMALLOC_MAX_TOTAL_THREAD_CACHE_BYTES=128MB
+TCMALLOC_MAX_TOTAL_THREAD_CACHE_BYTES=134217728
 
 ## use jemalloc instead of tcmalloc
 #
diff --git a/man/Makefile-client.am b/man/Makefile-client.am
index b03c9bf..ae3dd7a 100644
--- a/man/Makefile-client.am
+++ b/man/Makefile-client.am
@@ -20,7 +20,8 @@ dist_man_MANS += \
 	rbd-nbd.8 \
 	rbd-replay.8 \
 	rbd-replay-many.8 \
-	rbd-replay-prep.8
+	rbd-replay-prep.8 \
+	rbdmap.8
 endif
 
 if WITH_CEPHFS
diff --git a/man/Makefile.in b/man/Makefile.in
index b02835d..6a80e84 100644
--- a/man/Makefile.in
+++ b/man/Makefile.in
@@ -100,7 +100,8 @@ DIST_COMMON = $(srcdir)/Makefile-client.am \
 @ENABLE_CLIENT_TRUE@@WITH_MAN_PAGES_TRUE@@WITH_RBD_TRUE@	rbd-nbd.8 \
 @ENABLE_CLIENT_TRUE@@WITH_MAN_PAGES_TRUE@@WITH_RBD_TRUE@	rbd-replay.8 \
 @ENABLE_CLIENT_TRUE@@WITH_MAN_PAGES_TRUE@@WITH_RBD_TRUE@	rbd-replay-many.8 \
-@ENABLE_CLIENT_TRUE@@WITH_MAN_PAGES_TRUE@@WITH_RBD_TRUE@	rbd-replay-prep.8
+@ENABLE_CLIENT_TRUE@@WITH_MAN_PAGES_TRUE@@WITH_RBD_TRUE@	rbd-replay-prep.8 \
+@ENABLE_CLIENT_TRUE@@WITH_MAN_PAGES_TRUE@@WITH_RBD_TRUE@	rbdmap.8
 
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_MAN_PAGES_TRUE@am__append_4 = \
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_MAN_PAGES_TRUE@	cephfs.8
diff --git a/man/ceph-authtool.8 b/man/ceph-authtool.8
index e590560..075efe3 100644
--- a/man/ceph-authtool.8
+++ b/man/ceph-authtool.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CEPH-AUTHTOOL" "8" "March 24, 2016" "dev" "Ceph"
+.TH "CEPH-AUTHTOOL" "8" "April 06, 2016" "dev" "Ceph"
 .SH NAME
 ceph-authtool \- ceph keyring manipulation tool
 .
diff --git a/man/ceph-clsinfo.8 b/man/ceph-clsinfo.8
index 1f5133f..d1974b0 100644
--- a/man/ceph-clsinfo.8
+++ b/man/ceph-clsinfo.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CEPH-CLSINFO" "8" "March 24, 2016" "dev" "Ceph"
+.TH "CEPH-CLSINFO" "8" "April 06, 2016" "dev" "Ceph"
 .SH NAME
 ceph-clsinfo \- show class object information
 .
diff --git a/man/ceph-conf.8 b/man/ceph-conf.8
index c6fa297..73ea29c 100644
--- a/man/ceph-conf.8
+++ b/man/ceph-conf.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CEPH-CONF" "8" "March 24, 2016" "dev" "Ceph"
+.TH "CEPH-CONF" "8" "April 06, 2016" "dev" "Ceph"
 .SH NAME
 ceph-conf \- ceph conf file tool
 .
diff --git a/man/ceph-create-keys.8 b/man/ceph-create-keys.8
index 56dc669..d9c5ea4 100644
--- a/man/ceph-create-keys.8
+++ b/man/ceph-create-keys.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CEPH-CREATE-KEYS" "8" "March 24, 2016" "dev" "Ceph"
+.TH "CEPH-CREATE-KEYS" "8" "April 06, 2016" "dev" "Ceph"
 .SH NAME
 ceph-create-keys \- ceph keyring generate tool
 .
diff --git a/man/ceph-debugpack.8 b/man/ceph-debugpack.8
index c91b7b5..f6efc37 100644
--- a/man/ceph-debugpack.8
+++ b/man/ceph-debugpack.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CEPH-DEBUGPACK" "8" "March 24, 2016" "dev" "Ceph"
+.TH "CEPH-DEBUGPACK" "8" "April 06, 2016" "dev" "Ceph"
 .SH NAME
 ceph-debugpack \- ceph debug packer utility
 .
diff --git a/man/ceph-dencoder.8 b/man/ceph-dencoder.8
index 2d68f36..afe930d 100644
--- a/man/ceph-dencoder.8
+++ b/man/ceph-dencoder.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CEPH-DENCODER" "8" "March 24, 2016" "dev" "Ceph"
+.TH "CEPH-DENCODER" "8" "April 06, 2016" "dev" "Ceph"
 .SH NAME
 ceph-dencoder \- ceph encoder/decoder utility
 .
diff --git a/man/ceph-deploy.8 b/man/ceph-deploy.8
index 2fd80e2..0ca7600 100644
--- a/man/ceph-deploy.8
+++ b/man/ceph-deploy.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CEPH-DEPLOY" "8" "March 24, 2016" "dev" "Ceph"
+.TH "CEPH-DEPLOY" "8" "April 06, 2016" "dev" "Ceph"
 .SH NAME
 ceph-deploy \- Ceph deployment tool
 .
diff --git a/man/ceph-detect-init.8 b/man/ceph-detect-init.8
index a124107..21a65f4 100644
--- a/man/ceph-detect-init.8
+++ b/man/ceph-detect-init.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CEPH-DETECT-INIT" "8" "March 24, 2016" "dev" "Ceph"
+.TH "CEPH-DETECT-INIT" "8" "April 06, 2016" "dev" "Ceph"
 .SH NAME
 ceph-detect-init \- display the init system Ceph should use
 .
diff --git a/man/ceph-disk.8 b/man/ceph-disk.8
index 1a59976..fa4728e 100644
--- a/man/ceph-disk.8
+++ b/man/ceph-disk.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CEPH-DISK" "8" "March 24, 2016" "dev" "Ceph"
+.TH "CEPH-DISK" "8" "April 06, 2016" "dev" "Ceph"
 .SH NAME
 ceph-disk \- Ceph disk utility for OSD
 .
diff --git a/man/ceph-fuse.8 b/man/ceph-fuse.8
index a89df0a..0a03a44 100644
--- a/man/ceph-fuse.8
+++ b/man/ceph-fuse.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CEPH-FUSE" "8" "March 24, 2016" "dev" "Ceph"
+.TH "CEPH-FUSE" "8" "April 06, 2016" "dev" "Ceph"
 .SH NAME
 ceph-fuse \- FUSE-based client for ceph
 .
diff --git a/man/ceph-mds.8 b/man/ceph-mds.8
index 13fb217..ac0cc19 100644
--- a/man/ceph-mds.8
+++ b/man/ceph-mds.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CEPH-MDS" "8" "March 24, 2016" "dev" "Ceph"
+.TH "CEPH-MDS" "8" "April 06, 2016" "dev" "Ceph"
 .SH NAME
 ceph-mds \- ceph metadata server daemon
 .
diff --git a/man/ceph-mon.8 b/man/ceph-mon.8
index 6ab7488..d2a8707 100644
--- a/man/ceph-mon.8
+++ b/man/ceph-mon.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CEPH-MON" "8" "March 24, 2016" "dev" "Ceph"
+.TH "CEPH-MON" "8" "April 06, 2016" "dev" "Ceph"
 .SH NAME
 ceph-mon \- ceph monitor daemon
 .
diff --git a/man/ceph-osd.8 b/man/ceph-osd.8
index fff836f..a7c6345 100644
--- a/man/ceph-osd.8
+++ b/man/ceph-osd.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CEPH-OSD" "8" "March 24, 2016" "dev" "Ceph"
+.TH "CEPH-OSD" "8" "April 06, 2016" "dev" "Ceph"
 .SH NAME
 ceph-osd \- ceph object storage daemon
 .
diff --git a/man/ceph-post-file.8 b/man/ceph-post-file.8
index d94d568..b49d538 100644
--- a/man/ceph-post-file.8
+++ b/man/ceph-post-file.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CEPH-POST-FILE" "8" "March 24, 2016" "dev" "Ceph"
+.TH "CEPH-POST-FILE" "8" "April 06, 2016" "dev" "Ceph"
 .SH NAME
 ceph-post-file \- post files for ceph developers
 .
diff --git a/man/ceph-rbdnamer.8 b/man/ceph-rbdnamer.8
index 0919425..28278a7 100644
--- a/man/ceph-rbdnamer.8
+++ b/man/ceph-rbdnamer.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CEPH-RBDNAMER" "8" "March 24, 2016" "dev" "Ceph"
+.TH "CEPH-RBDNAMER" "8" "April 06, 2016" "dev" "Ceph"
 .SH NAME
 ceph-rbdnamer \- udev helper to name RBD devices
 .
diff --git a/man/ceph-rest-api.8 b/man/ceph-rest-api.8
index c431a6b..6d42a39 100644
--- a/man/ceph-rest-api.8
+++ b/man/ceph-rest-api.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CEPH-REST-API" "8" "March 24, 2016" "dev" "Ceph"
+.TH "CEPH-REST-API" "8" "April 06, 2016" "dev" "Ceph"
 .SH NAME
 ceph-rest-api \- ceph RESTlike administration server
 .
diff --git a/man/ceph-run.8 b/man/ceph-run.8
index 500f401..f1f9743 100644
--- a/man/ceph-run.8
+++ b/man/ceph-run.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CEPH-RUN" "8" "March 24, 2016" "dev" "Ceph"
+.TH "CEPH-RUN" "8" "April 06, 2016" "dev" "Ceph"
 .SH NAME
 ceph-run \- restart daemon on core dump
 .
diff --git a/man/ceph-syn.8 b/man/ceph-syn.8
index 61e790a..05f5a91 100644
--- a/man/ceph-syn.8
+++ b/man/ceph-syn.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CEPH-SYN" "8" "March 24, 2016" "dev" "Ceph"
+.TH "CEPH-SYN" "8" "April 06, 2016" "dev" "Ceph"
 .SH NAME
 ceph-syn \- ceph synthetic workload generator
 .
diff --git a/man/ceph.8 b/man/ceph.8
index 945a40f..9ee403b 100644
--- a/man/ceph.8
+++ b/man/ceph.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CEPH" "8" "March 24, 2016" "dev" "Ceph"
+.TH "CEPH" "8" "April 06, 2016" "dev" "Ceph"
 .SH NAME
 ceph \- ceph administration tool
 .
diff --git a/man/cephfs.8 b/man/cephfs.8
index d1f29ed..2772697 100644
--- a/man/cephfs.8
+++ b/man/cephfs.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CEPHFS" "8" "March 24, 2016" "dev" "Ceph"
+.TH "CEPHFS" "8" "April 06, 2016" "dev" "Ceph"
 .SH NAME
 cephfs \- ceph file system options utility
 .
diff --git a/man/crushtool.8 b/man/crushtool.8
index 793e820..c7ebbd7 100644
--- a/man/crushtool.8
+++ b/man/crushtool.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CRUSHTOOL" "8" "March 24, 2016" "dev" "Ceph"
+.TH "CRUSHTOOL" "8" "April 06, 2016" "dev" "Ceph"
 .SH NAME
 crushtool \- CRUSH map manipulation tool
 .
diff --git a/man/librados-config.8 b/man/librados-config.8
index 21c6884..d6567e5 100644
--- a/man/librados-config.8
+++ b/man/librados-config.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "LIBRADOS-CONFIG" "8" "March 24, 2016" "dev" "Ceph"
+.TH "LIBRADOS-CONFIG" "8" "April 06, 2016" "dev" "Ceph"
 .SH NAME
 librados-config \- display information about librados
 .
diff --git a/man/monmaptool.8 b/man/monmaptool.8
index 8b62123..ac6f7f2 100644
--- a/man/monmaptool.8
+++ b/man/monmaptool.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "MONMAPTOOL" "8" "March 24, 2016" "dev" "Ceph"
+.TH "MONMAPTOOL" "8" "April 06, 2016" "dev" "Ceph"
 .SH NAME
 monmaptool \- ceph monitor cluster map manipulation tool
 .
diff --git a/man/mount.ceph.8 b/man/mount.ceph.8
index 52de2de..eb9ba29 100644
--- a/man/mount.ceph.8
+++ b/man/mount.ceph.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "MOUNT.CEPH" "8" "March 24, 2016" "dev" "Ceph"
+.TH "MOUNT.CEPH" "8" "April 06, 2016" "dev" "Ceph"
 .SH NAME
 mount.ceph \- mount a ceph file system
 .
diff --git a/man/osdmaptool.8 b/man/osdmaptool.8
index 23f0138..c369b14 100644
--- a/man/osdmaptool.8
+++ b/man/osdmaptool.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "OSDMAPTOOL" "8" "March 24, 2016" "dev" "Ceph"
+.TH "OSDMAPTOOL" "8" "April 06, 2016" "dev" "Ceph"
 .SH NAME
 osdmaptool \- ceph osd cluster map manipulation tool
 .
diff --git a/man/rados.8 b/man/rados.8
index 37e2d3d..5cf4aac 100644
--- a/man/rados.8
+++ b/man/rados.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "RADOS" "8" "March 24, 2016" "dev" "Ceph"
+.TH "RADOS" "8" "April 06, 2016" "dev" "Ceph"
 .SH NAME
 rados \- rados object storage utility
 .
diff --git a/man/radosgw-admin.8 b/man/radosgw-admin.8
index 0efefb7..d77aff4 100644
--- a/man/radosgw-admin.8
+++ b/man/radosgw-admin.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "RADOSGW-ADMIN" "8" "March 24, 2016" "dev" "Ceph"
+.TH "RADOSGW-ADMIN" "8" "April 06, 2016" "dev" "Ceph"
 .SH NAME
 radosgw-admin \- rados REST gateway user administration utility
 .
diff --git a/man/radosgw.8 b/man/radosgw.8
index afaa588..1402859 100644
--- a/man/radosgw.8
+++ b/man/radosgw.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "RADOSGW" "8" "March 24, 2016" "dev" "Ceph"
+.TH "RADOSGW" "8" "April 06, 2016" "dev" "Ceph"
 .SH NAME
 radosgw \- rados REST gateway
 .
diff --git a/man/rbd-fuse.8 b/man/rbd-fuse.8
index 056abb9..744f90b 100644
--- a/man/rbd-fuse.8
+++ b/man/rbd-fuse.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "RBD-FUSE" "8" "March 24, 2016" "dev" "Ceph"
+.TH "RBD-FUSE" "8" "April 06, 2016" "dev" "Ceph"
 .SH NAME
 rbd-fuse \- expose rbd images as files
 .
diff --git a/man/rbd-mirror.8 b/man/rbd-mirror.8
index 289c5fd..7f99e32 100644
--- a/man/rbd-mirror.8
+++ b/man/rbd-mirror.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "RBD-MIRROR" "8" "March 24, 2016" "dev" "Ceph"
+.TH "RBD-MIRROR" "8" "April 06, 2016" "dev" "Ceph"
 .SH NAME
 rbd-mirror \- Ceph daemon for mirroring RBD images
 .
diff --git a/man/rbd-nbd.8 b/man/rbd-nbd.8
index 0211dc6..38b46eb 100644
--- a/man/rbd-nbd.8
+++ b/man/rbd-nbd.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "RBD-NBD" "8" "March 24, 2016" "dev" "Ceph"
+.TH "RBD-NBD" "8" "April 06, 2016" "dev" "Ceph"
 .SH NAME
 rbd-nbd \- map rbd images to nbd device
 .
diff --git a/man/rbd-replay-many.8 b/man/rbd-replay-many.8
index 8ae20a1..ea3f0dd 100644
--- a/man/rbd-replay-many.8
+++ b/man/rbd-replay-many.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "RBD-REPLAY-MANY" "8" "March 24, 2016" "dev" "Ceph"
+.TH "RBD-REPLAY-MANY" "8" "April 06, 2016" "dev" "Ceph"
 .SH NAME
 rbd-replay-many \- replay a rados block device (RBD) workload on several clients
 .
diff --git a/man/rbd-replay-prep.8 b/man/rbd-replay-prep.8
index ebb7c58..c0a080f 100644
--- a/man/rbd-replay-prep.8
+++ b/man/rbd-replay-prep.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "RBD-REPLAY-PREP" "8" "March 24, 2016" "dev" "Ceph"
+.TH "RBD-REPLAY-PREP" "8" "April 06, 2016" "dev" "Ceph"
 .SH NAME
 rbd-replay-prep \- prepare captured rados block device (RBD) workloads for replay
 .
diff --git a/man/rbd-replay.8 b/man/rbd-replay.8
index c02ea9f..593563b 100644
--- a/man/rbd-replay.8
+++ b/man/rbd-replay.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "RBD-REPLAY" "8" "March 24, 2016" "dev" "Ceph"
+.TH "RBD-REPLAY" "8" "April 06, 2016" "dev" "Ceph"
 .SH NAME
 rbd-replay \- replay rados block device (RBD) workloads
 .
diff --git a/man/rbd.8 b/man/rbd.8
index c2be8e6..a0e5603 100644
--- a/man/rbd.8
+++ b/man/rbd.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "RBD" "8" "March 24, 2016" "dev" "Ceph"
+.TH "RBD" "8" "April 06, 2016" "dev" "Ceph"
 .SH NAME
 rbd \- manage rados block device (RBD) images
 .
diff --git a/man/ceph-run.8 b/man/rbdmap.8
similarity index 51%
copy from man/ceph-run.8
copy to man/rbdmap.8
index 500f401..b746517 100644
--- a/man/ceph-run.8
+++ b/man/rbdmap.8
@@ -1,8 +1,8 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CEPH-RUN" "8" "March 24, 2016" "dev" "Ceph"
+.TH "RBDMAP" "8" "April 06, 2016" "dev" "Ceph"
 .SH NAME
-ceph-run \- restart daemon on core dump
+rbdmap \- map RBD devices at boot time
 .
 .nr rst2man-indent-level 0
 .
@@ -32,30 +32,33 @@ level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
 ..
 .SH SYNOPSIS
 .nf
-\fBceph\-run\fP \fIcommand\fP ...
+\fBrbdmap map\fP
+\fBrbdmap unmap\fP
 .fi
 .sp
 .SH DESCRIPTION
 .sp
-\fBceph\-run\fP is a simple wrapper that will restart a daemon if it exits
-with a signal indicating it crashed and possibly core dumped (that is,
-signals 3, 4, 5, 6, 8, or 11).
+\fBrbdmap\fP is a shell script that can be run manually by the system
+administrator at any time, or automatically at boot time by the init system
+(sysvinit, upstart, systemd). The script looks for an environment variable
+\fBRBDMAPFILE\fP, which defaults to \fB/etc/ceph/rbdmap\fP\&. This file is
+expected to contain a list of RBD images and, possibly, parameters to be
+passed to the underlying \fBrbd\fP command. The syntax of
+\fB/etc/ceph/rbdmap\fP is described in the comments at the top of that file.
 .sp
-The command should run the daemon in the foreground. For Ceph daemons,
-that means the \fB\-f\fP option.
+The script mounts devices after mapping, and unmounts them before
+unmapping.
 .SH OPTIONS
 .sp
 None
 .SH AVAILABILITY
 .sp
-\fBceph\-run\fP is part of Ceph, a massively scalable, open\-source, distributed storage system. Please refer to
-the Ceph documentation at \fI\%http://ceph.com/docs\fP for more information.
+\fBrbdmap\fP is part of Ceph, a massively scalable, open\-source, distributed
+storage system. Please refer to the Ceph documentation at
+\fI\%http://ceph.com/docs\fP for more information.
 .SH SEE ALSO
 .sp
-\fBceph\fP(8),
-\fBceph\-mon\fP(8),
-\fBceph\-mds\fP(8),
-\fBceph\-osd\fP(8)
+\fBrbd\fP(8),
 .SH COPYRIGHT
 2010-2014, Inktank Storage, Inc. and contributors. Licensed under Creative Commons BY-SA
 .\" Generated by docutils manpage writer.
diff --git a/src/.git_version b/src/.git_version
index a8085ff..2f3b43d 100644
--- a/src/.git_version
+++ b/src/.git_version
@@ -1,2 +1,2 @@
-96ae8bd25f31862dbd5302f304ebf8bf1166aba6
-v10.1.0
+ce50389b773fe7f72fca40a3dd69cfe6613eaeb1
+v10.1.1
diff --git a/src/Makefile-env.am b/src/Makefile-env.am
index 2044647..2fb22a6 100644
--- a/src/Makefile-env.am
+++ b/src/Makefile-env.am
@@ -297,7 +297,8 @@ LIBCIVETWEB_DEPS =
 DENCODER_SOURCES =
 DENCODER_DEPS =
 
-# put virtualenvs in this directory for build
-CEPH_BUILD_VIRTUALENV="/tmp/"
+# put virtualenvs in this directory
+# otherwise it may overflow #! 80 kernel limit
+export CEPH_BUILD_VIRTUALENV = /tmp
 
 radoslibdir = $(libdir)/rados-classes
diff --git a/src/Makefile.am b/src/Makefile.am
index cc973b3..fd84dd0 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -107,6 +107,7 @@ EXTRA_DIST += \
 	$(srcdir)/.git_version \
 	$(srcdir)/ceph-rbdnamer \
 	$(srcdir)/tools/ceph-monstore-update-crush.sh \
+	$(srcdir)/script/subman \
 	$(srcdir)/upstart/ceph-all.conf \
 	$(srcdir)/upstart/ceph-disk.conf \
 	$(srcdir)/upstart/ceph-mon.conf \
@@ -152,6 +153,10 @@ doc_DATA = $(srcdir)/sample.ceph.conf sample.fetch_config
 ceph_libexecdir = $(libexecdir)/ceph
 ceph_libexec_SCRIPTS = ceph_common.sh ceph-osd-prestart.sh
 
+if ENABLE_SUBMAN
+submandir = /etc/cron.hourly
+subman_DATA = script/subman
+endif
 
 # tests to actually run on "make check"; if you need extra, non-test,
 # executables built, you need to replace this with manual assignments
diff --git a/src/Makefile.in b/src/Makefile.in
index cdf135e..1c8f334 100644
--- a/src/Makefile.in
+++ b/src/Makefile.in
@@ -585,7 +585,9 @@ check_PROGRAMS = $(am__EXEEXT_63) $(am__EXEEXT_64) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/Journal.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/LibrbdAdminSocketHook.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/LibrbdWriteback.h \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/MirroringWatcher.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/ObjectMap.h \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/ObjectWatcher.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/Operations.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/parent_types.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/SnapInfo.h \
@@ -593,7 +595,9 @@ check_PROGRAMS = $(am__EXEEXT_63) $(am__EXEEXT_64) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/Utils.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/WatchNotifyTypes.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/exclusive_lock/AcquireRequest.h \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/exclusive_lock/Policy.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/exclusive_lock/ReleaseRequest.h \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/exclusive_lock/StandardPolicy.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/image/CloseRequest.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/image/OpenRequest.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/image/RefreshParentRequest.h \
@@ -601,8 +605,12 @@ check_PROGRAMS = $(am__EXEEXT_63) $(am__EXEEXT_64) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/image/SetSnapRequest.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/image_watcher/Notifier.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/image_watcher/NotifyLockOwner.h \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/journal/Policy.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/journal/Replay.h \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/journal/StandardPolicy.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/journal/Types.h \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/journal/TypeTraits.h \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/mirroring_watcher/Types.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/object_map/InvalidateRequest.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/object_map/LockRequest.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/object_map/Request.h \
@@ -1012,6 +1020,7 @@ check_PROGRAMS = $(am__EXEEXT_63) $(am__EXEEXT_64) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	ceph_test_rbd_mirror_image_replay
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@am__append_227 = unittest_rbd_replay
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@am__append_228 = librbd_test.la \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd_test_mock.la \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd_mirror_test.la
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@am__append_229 = unittest_librbd \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	unittest_rbd_mirror
@@ -1026,6 +1035,7 @@ check_PROGRAMS = $(am__EXEEXT_63) $(am__EXEEXT_64) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/mock/MockImageState.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/mock/MockImageWatcher.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/mock/MockJournal.h \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/mock/MockJournalPolicy.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/mock/MockObjectMap.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/mock/MockOperations.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/mock/MockReadahead.h \
@@ -1433,7 +1443,8 @@ am__installdirs = "$(DESTDIR)$(compressorlibdir)" \
 	"$(DESTDIR)$(docdir)" "$(DESTDIR)$(libcephfs_includedir)" \
 	"$(DESTDIR)$(librbd_includedir)" \
 	"$(DESTDIR)$(rados_includedir)" \
-	"$(DESTDIR)$(radosstriper_includedir)"
+	"$(DESTDIR)$(radosstriper_includedir)" \
+	"$(DESTDIR)$(submandir)"
 LTLIBRARIES = $(compressorlib_LTLIBRARIES) \
 	$(erasure_codelib_LTLIBRARIES) $(lib_LTLIBRARIES) \
 	$(noinst_LTLIBRARIES) $(radoslib_LTLIBRARIES)
@@ -2898,16 +2909,18 @@ am__librbd_internal_la_SOURCES_DIST = librbd/AioCompletion.cc \
 	librbd/ExclusiveLock.cc librbd/ImageCtx.cc \
 	librbd/ImageState.cc librbd/ImageWatcher.cc librbd/internal.cc \
 	librbd/Journal.cc librbd/LibrbdAdminSocketHook.cc \
-	librbd/LibrbdWriteback.cc librbd/ObjectMap.cc \
+	librbd/LibrbdWriteback.cc librbd/MirroringWatcher.cc \
+	librbd/ObjectMap.cc librbd/ObjectWatcher.cc \
 	librbd/Operations.cc librbd/Utils.cc \
 	librbd/exclusive_lock/AcquireRequest.cc \
 	librbd/exclusive_lock/ReleaseRequest.cc \
+	librbd/exclusive_lock/StandardPolicy.cc \
 	librbd/image/CloseRequest.cc librbd/image/OpenRequest.cc \
 	librbd/image/RefreshParentRequest.cc \
 	librbd/image/RefreshRequest.cc librbd/image/SetSnapRequest.cc \
 	librbd/image_watcher/Notifier.cc \
 	librbd/image_watcher/NotifyLockOwner.cc \
-	librbd/journal/Replay.cc \
+	librbd/journal/Replay.cc librbd/journal/StandardPolicy.cc \
 	librbd/object_map/InvalidateRequest.cc \
 	librbd/object_map/LockRequest.cc librbd/object_map/Request.cc \
 	librbd/object_map/RefreshRequest.cc \
@@ -2945,11 +2958,14 @@ am__librbd_internal_la_SOURCES_DIST = librbd/AioCompletion.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/Journal.lo \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/LibrbdAdminSocketHook.lo \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/LibrbdWriteback.lo \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/MirroringWatcher.lo \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/ObjectMap.lo \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/ObjectWatcher.lo \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/Operations.lo \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/Utils.lo \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/exclusive_lock/AcquireRequest.lo \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/exclusive_lock/ReleaseRequest.lo \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/exclusive_lock/StandardPolicy.lo \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/image/CloseRequest.lo \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/image/OpenRequest.lo \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/image/RefreshParentRequest.lo \
@@ -2958,6 +2974,7 @@ am__librbd_internal_la_SOURCES_DIST = librbd/AioCompletion.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/image_watcher/Notifier.lo \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/image_watcher/NotifyLockOwner.lo \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/journal/Replay.lo \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/journal/StandardPolicy.lo \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/object_map/InvalidateRequest.lo \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/object_map/LockRequest.lo \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/object_map/Request.lo \
@@ -3071,7 +3088,9 @@ librbd_test_la_LIBADD =
 am__librbd_test_la_SOURCES_DIST = test/librbd/test_fixture.cc \
 	test/librbd/test_support.cc test/librbd/test_librbd.cc \
 	test/librbd/test_ImageWatcher.cc test/librbd/test_internal.cc \
-	test/librbd/test_mirroring.cc test/librbd/test_ObjectMap.cc \
+	test/librbd/test_mirroring.cc \
+	test/librbd/test_MirroringWatcher.cc \
+	test/librbd/test_ObjectMap.cc \
 	test/librbd/journal/test_Entries.cc \
 	test/librbd/journal/test_Replay.cc
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@am_librbd_test_la_OBJECTS = test/librbd/librbd_test_la-test_fixture.lo \
@@ -3080,6 +3099,7 @@ am__librbd_test_la_SOURCES_DIST = test/librbd/test_fixture.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/librbd_test_la-test_ImageWatcher.lo \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/librbd_test_la-test_internal.lo \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/librbd_test_la-test_mirroring.lo \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/librbd_test_la-test_MirroringWatcher.lo \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/librbd_test_la-test_ObjectMap.lo \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/journal/librbd_test_la-test_Entries.lo \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/journal/librbd_test_la-test_Replay.lo
@@ -3089,6 +3109,16 @@ librbd_test_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(librbd_test_la_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
 	$(LDFLAGS) -o $@
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@am_librbd_test_la_rpath =
+librbd_test_mock_la_LIBADD =
+am__librbd_test_mock_la_SOURCES_DIST =  \
+	test/librbd/mock/MockImageCtx.cc
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@am_librbd_test_mock_la_OBJECTS = test/librbd/mock/librbd_test_mock_la-MockImageCtx.lo
+librbd_test_mock_la_OBJECTS = $(am_librbd_test_mock_la_OBJECTS)
+librbd_test_mock_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
+	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
+	$(librbd_test_mock_la_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
+	$(LDFLAGS) -o $@
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@am_librbd_test_mock_la_rpath =
 librbd_tp_la_DEPENDENCIES =
 am__librbd_tp_la_SOURCES_DIST = tracing/librbd.c
 @WITH_LTTNG_TRUE@am_librbd_tp_la_OBJECTS =  \
@@ -3102,7 +3132,7 @@ librbd_tp_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
 @WITH_LTTNG_TRUE@am_librbd_tp_la_rpath = -rpath $(libdir)
 librbd_types_la_LIBADD =
 am_librbd_types_la_OBJECTS = librbd/journal/Types.lo \
-	librbd/WatchNotifyTypes.lo
+	librbd/mirroring_watcher/Types.lo librbd/WatchNotifyTypes.lo
 librbd_types_la_OBJECTS = $(am_librbd_types_la_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@am__DEPENDENCIES_11 = $(LIBRADOS) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	libcls_rgw_client.la \
@@ -4500,7 +4530,8 @@ ceph_test_rados_api_list_OBJECTS =  \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@ceph_test_rados_api_list_DEPENDENCIES =  \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(LIBRADOS) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_24) \
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_9)
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_9) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_10)
 ceph_test_rados_api_list_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(ceph_test_rados_api_list_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
@@ -6330,6 +6361,7 @@ am__unittest_librbd_SOURCES_DIST = test/librbd/test_main.cc \
 	test/librbd/test_mock_fixture.cc \
 	test/librbd/test_mock_ExclusiveLock.cc \
 	test/librbd/test_mock_Journal.cc \
+	test/librbd/test_mock_ObjectWatcher.cc \
 	test/librbd/exclusive_lock/test_mock_AcquireRequest.cc \
 	test/librbd/exclusive_lock/test_mock_ReleaseRequest.cc \
 	test/librbd/image/test_mock_RefreshRequest.cc \
@@ -6353,6 +6385,7 @@ am__unittest_librbd_SOURCES_DIST = test/librbd/test_main.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/unittest_librbd-test_mock_fixture.$(OBJEXT) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/unittest_librbd-test_mock_ExclusiveLock.$(OBJEXT) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/unittest_librbd-test_mock_Journal.$(OBJEXT) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/unittest_librbd-test_mock_ObjectWatcher.$(OBJEXT) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/exclusive_lock/unittest_librbd-test_mock_AcquireRequest.$(OBJEXT) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/exclusive_lock/unittest_librbd-test_mock_ReleaseRequest.$(OBJEXT) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/image/unittest_librbd-test_mock_RefreshRequest.$(OBJEXT) \
@@ -6374,6 +6407,7 @@ am__unittest_librbd_SOURCES_DIST = test/librbd/test_main.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/operation/unittest_librbd-test_mock_SnapshotUnprotectRequest.$(OBJEXT)
 unittest_librbd_OBJECTS = $(am_unittest_librbd_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@unittest_librbd_DEPENDENCIES = librbd_test.la \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd_test_mock.la \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd_api.la \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd_internal.la \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(LIBRBD_TYPES) \
@@ -6559,6 +6593,7 @@ unittest_prioritized_queue_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LDFLAGS) $(LDFLAGS) -o $@
 am__unittest_rbd_mirror_SOURCES_DIST = test/rbd_mirror/test_main.cc \
 	test/rbd_mirror/test_mock_fixture.cc \
+	test/rbd_mirror/test_mock_ImageReplayer.cc \
 	test/rbd_mirror/test_mock_ImageSync.cc \
 	test/rbd_mirror/image_sync/test_mock_ImageCopyRequest.cc \
 	test/rbd_mirror/image_sync/test_mock_ObjectCopyRequest.cc \
@@ -6568,6 +6603,7 @@ am__unittest_rbd_mirror_SOURCES_DIST = test/rbd_mirror/test_main.cc \
 	test/rbd_mirror/mock/MockJournaler.cc
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@am_unittest_rbd_mirror_OBJECTS = test/rbd_mirror/unittest_rbd_mirror-test_main.$(OBJEXT) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/rbd_mirror/unittest_rbd_mirror-test_mock_fixture.$(OBJEXT) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/rbd_mirror/unittest_rbd_mirror-test_mock_ImageReplayer.$(OBJEXT) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/rbd_mirror/unittest_rbd_mirror-test_mock_ImageSync.$(OBJEXT) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/rbd_mirror/image_sync/unittest_rbd_mirror-test_mock_ImageCopyRequest.$(OBJEXT) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/rbd_mirror/image_sync/unittest_rbd_mirror-test_mock_ObjectCopyRequest.$(OBJEXT) \
@@ -6577,6 +6613,7 @@ am__unittest_rbd_mirror_SOURCES_DIST = test/rbd_mirror/test_main.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/rbd_mirror/mock/unittest_rbd_mirror-MockJournaler.$(OBJEXT)
 unittest_rbd_mirror_OBJECTS = $(am_unittest_rbd_mirror_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@unittest_rbd_mirror_DEPENDENCIES = librbd_mirror_test.la \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd_test_mock.la \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librados_test_stub.la \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd_mirror_internal.la \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd_internal.la \
@@ -7024,17 +7061,18 @@ SOURCES = $(libkv_a_SOURCES) $(libmon_a_SOURCES) $(libos_a_SOURCES) \
 	$(librbd_mirror_test_la_SOURCES) $(librbd_replay_la_SOURCES) \
 	$(librbd_replay_ios_la_SOURCES) \
 	$(librbd_replay_types_la_SOURCES) $(librbd_test_la_SOURCES) \
-	$(librbd_tp_la_SOURCES) $(nodist_librbd_tp_la_SOURCES) \
-	$(librbd_types_la_SOURCES) $(librgw_la_SOURCES) \
-	$(libsecret_la_SOURCES) $(libsystest_la_SOURCES) \
-	$(ceph_authtool_SOURCES) $(ceph_bluefs_tool_SOURCES) \
-	$(ceph_client_debug_SOURCES) $(ceph_conf_SOURCES) \
-	$(ceph_dencoder_SOURCES) $(ceph_fuse_SOURCES) \
-	$(ceph_kvstore_tool_SOURCES) $(ceph_mds_SOURCES) \
-	$(ceph_mon_SOURCES) $(ceph_monstore_tool_SOURCES) \
-	$(ceph_objectstore_tool_SOURCES) $(ceph_osd_SOURCES) \
-	$(ceph_osdomap_tool_SOURCES) $(ceph_syn_SOURCES) \
-	$(ceph_bench_log_SOURCES) $(ceph_erasure_code_SOURCES) \
+	$(librbd_test_mock_la_SOURCES) $(librbd_tp_la_SOURCES) \
+	$(nodist_librbd_tp_la_SOURCES) $(librbd_types_la_SOURCES) \
+	$(librgw_la_SOURCES) $(libsecret_la_SOURCES) \
+	$(libsystest_la_SOURCES) $(ceph_authtool_SOURCES) \
+	$(ceph_bluefs_tool_SOURCES) $(ceph_client_debug_SOURCES) \
+	$(ceph_conf_SOURCES) $(ceph_dencoder_SOURCES) \
+	$(ceph_fuse_SOURCES) $(ceph_kvstore_tool_SOURCES) \
+	$(ceph_mds_SOURCES) $(ceph_mon_SOURCES) \
+	$(ceph_monstore_tool_SOURCES) $(ceph_objectstore_tool_SOURCES) \
+	$(ceph_osd_SOURCES) $(ceph_osdomap_tool_SOURCES) \
+	$(ceph_syn_SOURCES) $(ceph_bench_log_SOURCES) \
+	$(ceph_erasure_code_SOURCES) \
 	$(ceph_erasure_code_benchmark_SOURCES) \
 	$(ceph_erasure_code_non_regression_SOURCES) \
 	$(ceph_kvstorebench_SOURCES) \
@@ -7291,6 +7329,7 @@ DIST_SOURCES = $(am__libkv_a_SOURCES_DIST) \
 	$(am__librbd_replay_ios_la_SOURCES_DIST) \
 	$(am__librbd_replay_types_la_SOURCES_DIST) \
 	$(am__librbd_test_la_SOURCES_DIST) \
+	$(am__librbd_test_mock_la_SOURCES_DIST) \
 	$(am__librbd_tp_la_SOURCES_DIST) $(librbd_types_la_SOURCES) \
 	$(am__librgw_la_SOURCES_DIST) $(libsecret_la_SOURCES) \
 	$(am__libsystest_la_SOURCES_DIST) $(ceph_authtool_SOURCES) \
@@ -7534,7 +7573,8 @@ am__pep3147_tweak = \
 py_compile = $(top_srcdir)/py-compile
 DATA = $(bash_completion_DATA) $(dist_noinst_DATA) $(doc_DATA) \
 	$(libcephfs_include_DATA) $(librbd_include_DATA) \
-	$(rados_include_DATA) $(radosstriper_include_DATA)
+	$(rados_include_DATA) $(radosstriper_include_DATA) \
+	$(subman_DATA)
 am__noinst_HEADERS_DIST = arch/intel.h arch/arm.h arch/probe.h \
 	auth/cephx/CephxAuthorizeHandler.h auth/cephx/CephxKeyServer.h \
 	auth/cephx/CephxProtocol.h auth/cephx/CephxClientHandler.h \
@@ -7836,17 +7876,23 @@ am__noinst_HEADERS_DIST = arch/intel.h arch/arm.h arch/probe.h \
 	librbd/DiffIterate.h librbd/ExclusiveLock.h librbd/ImageCtx.h \
 	librbd/ImageState.h librbd/ImageWatcher.h librbd/internal.h \
 	librbd/Journal.h librbd/LibrbdAdminSocketHook.h \
-	librbd/LibrbdWriteback.h librbd/ObjectMap.h \
-	librbd/Operations.h librbd/parent_types.h librbd/SnapInfo.h \
-	librbd/TaskFinisher.h librbd/Utils.h librbd/WatchNotifyTypes.h \
+	librbd/LibrbdWriteback.h librbd/MirroringWatcher.h \
+	librbd/ObjectMap.h librbd/ObjectWatcher.h librbd/Operations.h \
+	librbd/parent_types.h librbd/SnapInfo.h librbd/TaskFinisher.h \
+	librbd/Utils.h librbd/WatchNotifyTypes.h \
 	librbd/exclusive_lock/AcquireRequest.h \
+	librbd/exclusive_lock/Policy.h \
 	librbd/exclusive_lock/ReleaseRequest.h \
+	librbd/exclusive_lock/StandardPolicy.h \
 	librbd/image/CloseRequest.h librbd/image/OpenRequest.h \
 	librbd/image/RefreshParentRequest.h \
 	librbd/image/RefreshRequest.h librbd/image/SetSnapRequest.h \
 	librbd/image_watcher/Notifier.h \
-	librbd/image_watcher/NotifyLockOwner.h librbd/journal/Replay.h \
-	librbd/journal/Types.h librbd/object_map/InvalidateRequest.h \
+	librbd/image_watcher/NotifyLockOwner.h librbd/journal/Policy.h \
+	librbd/journal/Replay.h librbd/journal/StandardPolicy.h \
+	librbd/journal/Types.h librbd/journal/TypeTraits.h \
+	librbd/mirroring_watcher/Types.h \
+	librbd/object_map/InvalidateRequest.h \
 	librbd/object_map/LockRequest.h librbd/object_map/Request.h \
 	librbd/object_map/RefreshRequest.h \
 	librbd/object_map/ResizeRequest.h \
@@ -7953,6 +7999,7 @@ am__noinst_HEADERS_DIST = arch/intel.h arch/arm.h arch/probe.h \
 	test/librbd/mock/MockImageState.h \
 	test/librbd/mock/MockImageWatcher.h \
 	test/librbd/mock/MockJournal.h \
+	test/librbd/mock/MockJournalPolicy.h \
 	test/librbd/mock/MockObjectMap.h \
 	test/librbd/mock/MockOperations.h \
 	test/librbd/mock/MockReadahead.h \
@@ -9166,7 +9213,7 @@ EXTRA_DIST = $(am__append_31) ceph-detect-init/AUTHORS.rst \
 	$(srcdir)/make_version $(srcdir)/.git_version \
 	$(srcdir)/ceph-rbdnamer \
 	$(srcdir)/tools/ceph-monstore-update-crush.sh \
-	$(srcdir)/upstart/ceph-all.conf \
+	$(srcdir)/script/subman $(srcdir)/upstart/ceph-all.conf \
 	$(srcdir)/upstart/ceph-disk.conf \
 	$(srcdir)/upstart/ceph-mon.conf \
 	$(srcdir)/upstart/ceph-mon-all.conf \
@@ -9585,11 +9632,12 @@ check_SCRIPTS = ceph-detect-init/run-tox.sh ceph-disk/run-tox.sh \
 	test/mon/misc.sh test/mon/osd-crush.sh test/mon/mon-ping.sh \
 	test/mon/mon-created-time.sh \
 	test/mon/osd-erasure-code-profile.sh test/mon/mkfs.sh \
-	test/mon/mon-scrub.sh test/osd/osd-scrub-repair.sh \
-	test/osd/osd-scrub-snaps.sh test/osd/osd-config.sh \
-	test/osd/osd-reuse-id.sh test/osd/osd-bench.sh \
-	test/osd/osd-reactivate.sh test/osd/osd-copy-from.sh \
-	test/osd/osd-markdown.sh test/mon/mon-handle-forward.sh \
+	test/mon/mon-scrub.sh test/mon/test_pool_quota.sh \
+	test/osd/osd-scrub-repair.sh test/osd/osd-scrub-snaps.sh \
+	test/osd/osd-config.sh test/osd/osd-reuse-id.sh \
+	test/osd/osd-bench.sh test/osd/osd-reactivate.sh \
+	test/osd/osd-copy-from.sh test/osd/osd-markdown.sh \
+	test/mon/mon-handle-forward.sh \
 	test/libradosstriper/rados-striper.sh \
 	test/test_objectstore_memstore.sh test/test_pidfile.sh \
 	test/pybind/test_ceph_argparse.py \
@@ -9710,9 +9758,6 @@ DENCODER_SOURCES = $(am__append_47) perfglue/disabled_heap_profiler.cc \
 	perfglue/disabled_stubs.cc $(am__append_142)
 DENCODER_DEPS = $(am__append_48) $(am__append_137) $(am__append_143) \
 	$(am__append_159)
-
-# put virtualenvs in this directory for build
-CEPH_BUILD_VIRTUALENV = "/tmp/"
 radoslibdir = $(libdir)/rados-classes
 LOCAL_ALL = ceph-detect-init-all ceph-disk-all $(am__append_282) \
 	$(am__append_286) $(am__append_290)
@@ -10408,6 +10453,7 @@ libmsg_la_SOURCES = msg/Message.cc msg/Messenger.cc msg/msg_types.cc \
 
 librbd_types_la_SOURCES = \
 	librbd/journal/Types.cc \
+	librbd/mirroring_watcher/Types.cc \
 	librbd/WatchNotifyTypes.cc
 
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@librbd_internal_la_SOURCES = \
@@ -10428,11 +10474,14 @@ librbd_types_la_SOURCES = \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/Journal.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/LibrbdAdminSocketHook.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/LibrbdWriteback.cc \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/MirroringWatcher.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/ObjectMap.cc \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/ObjectWatcher.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/Operations.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/Utils.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/exclusive_lock/AcquireRequest.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/exclusive_lock/ReleaseRequest.cc \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/exclusive_lock/StandardPolicy.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/image/CloseRequest.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/image/OpenRequest.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/image/RefreshParentRequest.cc \
@@ -10441,6 +10490,7 @@ librbd_types_la_SOURCES = \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/image_watcher/Notifier.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/image_watcher/NotifyLockOwner.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/journal/Replay.cc \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/journal/StandardPolicy.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/object_map/InvalidateRequest.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/object_map/LockRequest.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/object_map/Request.cc \
@@ -11357,7 +11407,7 @@ librbd_types_la_SOURCES = \
 
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@ceph_test_rados_api_aio_CXXFLAGS = $(UNITTEST_CXXFLAGS)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@ceph_test_rados_api_list_SOURCES = test/librados/list.cc
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@ceph_test_rados_api_list_LDADD = $(LIBRADOS) $(UNITTEST_LDADD) $(RADOS_TEST_LDADD)
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@ceph_test_rados_api_list_LDADD = $(LIBRADOS) $(UNITTEST_LDADD) $(RADOS_TEST_LDADD) $(CEPH_GLOBAL)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@ceph_test_rados_api_list_CXXFLAGS = $(UNITTEST_CXXFLAGS)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@ceph_test_rados_api_nlist_SOURCES = test/librados/nlist.cc
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@ceph_test_rados_api_nlist_LDADD = $(LIBRADOS) $(UNITTEST_LDADD) $(RADOS_TEST_LDADD)
@@ -11448,16 +11498,22 @@ librbd_types_la_SOURCES = \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/test_ImageWatcher.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/test_internal.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/test_mirroring.cc \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/test_MirroringWatcher.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/test_ObjectMap.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/journal/test_Entries.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/journal/test_Replay.cc
 
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@librbd_test_la_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@librbd_test_mock_la_SOURCES = \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/mock/MockImageCtx.cc
+
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@librbd_test_mock_la_CXXFLAGS = $(UNITTEST_CXXFLAGS)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@unittest_librbd_SOURCES = \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@        test/librbd/test_main.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/test_mock_fixture.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/test_mock_ExclusiveLock.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/test_mock_Journal.cc \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/test_mock_ObjectWatcher.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/exclusive_lock/test_mock_AcquireRequest.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/exclusive_lock/test_mock_ReleaseRequest.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/image/test_mock_RefreshRequest.cc \
@@ -11480,7 +11536,8 @@ librbd_types_la_SOURCES = \
 
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@unittest_librbd_CXXFLAGS = $(UNITTEST_CXXFLAGS) -DTEST_LIBRBD_INTERNALS
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@unittest_librbd_LDADD = \
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd_test.la librbd_api.la librbd_internal.la $(LIBRBD_TYPES) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd_test.la librbd_test_mock.la \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd_api.la librbd_internal.la $(LIBRBD_TYPES) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	libcls_rbd_client.la libcls_lock_client.la \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	libjournal.la libcls_journal_client.la \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librados_test_stub.la librados_internal.la \
@@ -11518,6 +11575,7 @@ librbd_types_la_SOURCES = \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@unittest_rbd_mirror_SOURCES = \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/rbd_mirror/test_main.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/rbd_mirror/test_mock_fixture.cc \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/rbd_mirror/test_mock_ImageReplayer.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/rbd_mirror/test_mock_ImageSync.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/rbd_mirror/image_sync/test_mock_ImageCopyRequest.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/rbd_mirror/image_sync/test_mock_ObjectCopyRequest.cc \
@@ -11529,6 +11587,7 @@ librbd_types_la_SOURCES = \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at unittest_rbd_mirror_CXXFLAGS = $(UNITTEST_CXXFLAGS)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at unittest_rbd_mirror_LDADD = \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd_mirror_test.la \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd_test_mock.la \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librados_test_stub.la \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd_mirror_internal.la \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd_internal.la \
@@ -12343,11 +12402,13 @@ librbd_tp_la_LDFLAGS = -version-info 1:0:0
 libos_tp_la_LIBADD = -ldl -llttng-ust
 libos_tp_la_CFLAGS = -I$(top_srcdir)/src/tracing -I$(top_srcdir)/src $(AM_CFLAGS) -fpic
 libos_tp_la_LDFLAGS = -version-info 1:0:0
+ at ENABLE_CLIENT_TRUE@@WITH_CYTHON_TRUE at CYTHON_BUILD_DIR = "$(shell readlink -f $(builddir))/build"
 @ENABLE_CLIENT_TRUE@@WITH_CYTHON_TRUE at PY_DISTUTILS = \
+ at ENABLE_CLIENT_TRUE@@WITH_CYTHON_TRUE@	mkdir -p $(CYTHON_BUILD_DIR); \
 @ENABLE_CLIENT_TRUE@@WITH_CYTHON_TRUE@	CPPFLAGS="-iquote \${abs_srcdir}/include ${AM_CPPFLAGS} ${CPPFLAGS}" \
 @ENABLE_CLIENT_TRUE@@WITH_CYTHON_TRUE@	CFLAGS="-iquote \${abs_srcdir}/include ${AM_CFLAGS} ${PYTHON_CFLAGS}" \
 @ENABLE_CLIENT_TRUE@@WITH_CYTHON_TRUE@	LDFLAGS="-L\${abs_builddir}/.libs $(subst -pie,,${AM_LDFLAGS}) ${PYTHON_LDFLAGS}" \
- at ENABLE_CLIENT_TRUE@@WITH_CYTHON_TRUE@	CYTHON_BUILD_DIR="$(shell readlink -f $(builddir))/build" \
+ at ENABLE_CLIENT_TRUE@@WITH_CYTHON_TRUE@	CYTHON_BUILD_DIR=$(CYTHON_BUILD_DIR) \
 @ENABLE_CLIENT_TRUE@@WITH_CYTHON_TRUE@	${PYTHON} ./setup.py
 
 
@@ -12371,6 +12432,8 @@ doc_DATA = $(srcdir)/sample.ceph.conf sample.fetch_config
 # various scripts in $(libexecdir)
 ceph_libexecdir = $(libexecdir)/ceph
 ceph_libexec_SCRIPTS = ceph_common.sh ceph-osd-prestart.sh
+ at ENABLE_SUBMAN_TRUE@submandir = /etc/cron.hourly
+ at ENABLE_SUBMAN_TRUE@subman_DATA = script/subman
 
 # TODO: If we're running the parallel test harness (the preferred harness), this should be AM_TESTS_ENVIRONMENT instead.
 # See: https://www.gnu.org/software/automake/manual/html_node/Scripts_002dbased-Testsuites.html
@@ -14863,8 +14926,12 @@ librbd/LibrbdAdminSocketHook.lo: librbd/$(am__dirstamp) \
 	librbd/$(DEPDIR)/$(am__dirstamp)
 librbd/LibrbdWriteback.lo: librbd/$(am__dirstamp) \
 	librbd/$(DEPDIR)/$(am__dirstamp)
+librbd/MirroringWatcher.lo: librbd/$(am__dirstamp) \
+	librbd/$(DEPDIR)/$(am__dirstamp)
 librbd/ObjectMap.lo: librbd/$(am__dirstamp) \
 	librbd/$(DEPDIR)/$(am__dirstamp)
+librbd/ObjectWatcher.lo: librbd/$(am__dirstamp) \
+	librbd/$(DEPDIR)/$(am__dirstamp)
 librbd/Operations.lo: librbd/$(am__dirstamp) \
 	librbd/$(DEPDIR)/$(am__dirstamp)
 librbd/Utils.lo: librbd/$(am__dirstamp) \
@@ -14881,6 +14948,9 @@ librbd/exclusive_lock/AcquireRequest.lo:  \
 librbd/exclusive_lock/ReleaseRequest.lo:  \
 	librbd/exclusive_lock/$(am__dirstamp) \
 	librbd/exclusive_lock/$(DEPDIR)/$(am__dirstamp)
+librbd/exclusive_lock/StandardPolicy.lo:  \
+	librbd/exclusive_lock/$(am__dirstamp) \
+	librbd/exclusive_lock/$(DEPDIR)/$(am__dirstamp)
 librbd/image/$(am__dirstamp):
 	@$(MKDIR_P) librbd/image
 	@: > librbd/image/$(am__dirstamp)
@@ -14917,6 +14987,8 @@ librbd/journal/$(DEPDIR)/$(am__dirstamp):
 	@: > librbd/journal/$(DEPDIR)/$(am__dirstamp)
 librbd/journal/Replay.lo: librbd/journal/$(am__dirstamp) \
 	librbd/journal/$(DEPDIR)/$(am__dirstamp)
+librbd/journal/StandardPolicy.lo: librbd/journal/$(am__dirstamp) \
+	librbd/journal/$(DEPDIR)/$(am__dirstamp)
 librbd/object_map/$(am__dirstamp):
 	@$(MKDIR_P) librbd/object_map
 	@: > librbd/object_map/$(am__dirstamp)
@@ -15128,6 +15200,9 @@ test/librbd/librbd_test_la-test_internal.lo:  \
 test/librbd/librbd_test_la-test_mirroring.lo:  \
 	test/librbd/$(am__dirstamp) \
 	test/librbd/$(DEPDIR)/$(am__dirstamp)
+test/librbd/librbd_test_la-test_MirroringWatcher.lo:  \
+	test/librbd/$(am__dirstamp) \
+	test/librbd/$(DEPDIR)/$(am__dirstamp)
 test/librbd/librbd_test_la-test_ObjectMap.lo:  \
 	test/librbd/$(am__dirstamp) \
 	test/librbd/$(DEPDIR)/$(am__dirstamp)
@@ -15146,6 +15221,18 @@ test/librbd/journal/librbd_test_la-test_Replay.lo:  \
 
 librbd_test.la: $(librbd_test_la_OBJECTS) $(librbd_test_la_DEPENDENCIES) $(EXTRA_librbd_test_la_DEPENDENCIES) 
 	$(AM_V_CXXLD)$(librbd_test_la_LINK) $(am_librbd_test_la_rpath) $(librbd_test_la_OBJECTS) $(librbd_test_la_LIBADD) $(LIBS)
+test/librbd/mock/$(am__dirstamp):
+	@$(MKDIR_P) test/librbd/mock
+	@: > test/librbd/mock/$(am__dirstamp)
+test/librbd/mock/$(DEPDIR)/$(am__dirstamp):
+	@$(MKDIR_P) test/librbd/mock/$(DEPDIR)
+	@: > test/librbd/mock/$(DEPDIR)/$(am__dirstamp)
+test/librbd/mock/librbd_test_mock_la-MockImageCtx.lo:  \
+	test/librbd/mock/$(am__dirstamp) \
+	test/librbd/mock/$(DEPDIR)/$(am__dirstamp)
+
+librbd_test_mock.la: $(librbd_test_mock_la_OBJECTS) $(librbd_test_mock_la_DEPENDENCIES) $(EXTRA_librbd_test_mock_la_DEPENDENCIES) 
+	$(AM_V_CXXLD)$(librbd_test_mock_la_LINK) $(am_librbd_test_mock_la_rpath) $(librbd_test_mock_la_OBJECTS) $(librbd_test_mock_la_LIBADD) $(LIBS)
 tracing/librbd_tp_la-librbd.lo: tracing/$(am__dirstamp) \
 	tracing/$(DEPDIR)/$(am__dirstamp)
 
@@ -15153,6 +15240,15 @@ librbd_tp.la: $(librbd_tp_la_OBJECTS) $(librbd_tp_la_DEPENDENCIES) $(EXTRA_librb
 	$(AM_V_CCLD)$(librbd_tp_la_LINK) $(am_librbd_tp_la_rpath) $(librbd_tp_la_OBJECTS) $(librbd_tp_la_LIBADD) $(LIBS)
 librbd/journal/Types.lo: librbd/journal/$(am__dirstamp) \
 	librbd/journal/$(DEPDIR)/$(am__dirstamp)
+librbd/mirroring_watcher/$(am__dirstamp):
+	@$(MKDIR_P) librbd/mirroring_watcher
+	@: > librbd/mirroring_watcher/$(am__dirstamp)
+librbd/mirroring_watcher/$(DEPDIR)/$(am__dirstamp):
+	@$(MKDIR_P) librbd/mirroring_watcher/$(DEPDIR)
+	@: > librbd/mirroring_watcher/$(DEPDIR)/$(am__dirstamp)
+librbd/mirroring_watcher/Types.lo:  \
+	librbd/mirroring_watcher/$(am__dirstamp) \
+	librbd/mirroring_watcher/$(DEPDIR)/$(am__dirstamp)
 librbd/WatchNotifyTypes.lo: librbd/$(am__dirstamp) \
 	librbd/$(DEPDIR)/$(am__dirstamp)
 
@@ -17891,6 +17987,9 @@ test/librbd/unittest_librbd-test_mock_ExclusiveLock.$(OBJEXT):  \
 test/librbd/unittest_librbd-test_mock_Journal.$(OBJEXT):  \
 	test/librbd/$(am__dirstamp) \
 	test/librbd/$(DEPDIR)/$(am__dirstamp)
+test/librbd/unittest_librbd-test_mock_ObjectWatcher.$(OBJEXT):  \
+	test/librbd/$(am__dirstamp) \
+	test/librbd/$(DEPDIR)/$(am__dirstamp)
 test/librbd/exclusive_lock/$(am__dirstamp):
 	@$(MKDIR_P) test/librbd/exclusive_lock
 	@: > test/librbd/exclusive_lock/$(am__dirstamp)
@@ -18105,6 +18204,9 @@ test/rbd_mirror/unittest_rbd_mirror-test_main.$(OBJEXT):  \
 test/rbd_mirror/unittest_rbd_mirror-test_mock_fixture.$(OBJEXT):  \
 	test/rbd_mirror/$(am__dirstamp) \
 	test/rbd_mirror/$(DEPDIR)/$(am__dirstamp)
+test/rbd_mirror/unittest_rbd_mirror-test_mock_ImageReplayer.$(OBJEXT):  \
+	test/rbd_mirror/$(am__dirstamp) \
+	test/rbd_mirror/$(DEPDIR)/$(am__dirstamp)
 test/rbd_mirror/unittest_rbd_mirror-test_mock_ImageSync.$(OBJEXT):  \
 	test/rbd_mirror/$(am__dirstamp) \
 	test/rbd_mirror/$(DEPDIR)/$(am__dirstamp)
@@ -18682,6 +18784,8 @@ mostlyclean-compile:
 	-rm -f librbd/image_watcher/*.lo
 	-rm -f librbd/journal/*.$(OBJEXT)
 	-rm -f librbd/journal/*.lo
+	-rm -f librbd/mirroring_watcher/*.$(OBJEXT)
+	-rm -f librbd/mirroring_watcher/*.lo
 	-rm -f librbd/object_map/*.$(OBJEXT)
 	-rm -f librbd/object_map/*.lo
 	-rm -f librbd/operation/*.$(OBJEXT)
@@ -18756,6 +18860,8 @@ mostlyclean-compile:
 	-rm -f test/librbd/image/*.$(OBJEXT)
 	-rm -f test/librbd/journal/*.$(OBJEXT)
 	-rm -f test/librbd/journal/*.lo
+	-rm -f test/librbd/mock/*.$(OBJEXT)
+	-rm -f test/librbd/mock/*.lo
 	-rm -f test/librbd/object_map/*.$(OBJEXT)
 	-rm -f test/librbd/operation/*.$(OBJEXT)
 	-rm -f test/mds/*.$(OBJEXT)
@@ -19349,7 +19455,9 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote at librbd/$(DEPDIR)/Journal.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at librbd/$(DEPDIR)/LibrbdAdminSocketHook.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at librbd/$(DEPDIR)/LibrbdWriteback.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at librbd/$(DEPDIR)/MirroringWatcher.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at librbd/$(DEPDIR)/ObjectMap.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at librbd/$(DEPDIR)/ObjectWatcher.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at librbd/$(DEPDIR)/Operations.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at librbd/$(DEPDIR)/Utils.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at librbd/$(DEPDIR)/WatchNotifyTypes.Plo at am__quote@
@@ -19358,6 +19466,7 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote at librbd/$(DEPDIR)/librbd_la-librbd.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at librbd/exclusive_lock/$(DEPDIR)/AcquireRequest.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at librbd/exclusive_lock/$(DEPDIR)/ReleaseRequest.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at librbd/exclusive_lock/$(DEPDIR)/StandardPolicy.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at librbd/image/$(DEPDIR)/CloseRequest.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at librbd/image/$(DEPDIR)/OpenRequest.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at librbd/image/$(DEPDIR)/RefreshParentRequest.Plo at am__quote@
@@ -19366,7 +19475,9 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote at librbd/image_watcher/$(DEPDIR)/Notifier.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at librbd/image_watcher/$(DEPDIR)/NotifyLockOwner.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at librbd/journal/$(DEPDIR)/Replay.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at librbd/journal/$(DEPDIR)/StandardPolicy.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at librbd/journal/$(DEPDIR)/Types.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at librbd/mirroring_watcher/$(DEPDIR)/Types.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at librbd/object_map/$(DEPDIR)/InvalidateRequest.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at librbd/object_map/$(DEPDIR)/LockRequest.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at librbd/object_map/$(DEPDIR)/RefreshRequest.Plo at am__quote@
@@ -19974,6 +20085,7 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote at test/librbd/$(DEPDIR)/ceph_test_librbd_api-test_support.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/librbd/$(DEPDIR)/ceph_test_librbd_fsx-fsx.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/librbd/$(DEPDIR)/librbd_test_la-test_ImageWatcher.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at test/librbd/$(DEPDIR)/librbd_test_la-test_MirroringWatcher.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/librbd/$(DEPDIR)/librbd_test_la-test_ObjectMap.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/librbd/$(DEPDIR)/librbd_test_la-test_fixture.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/librbd/$(DEPDIR)/librbd_test_la-test_internal.Plo at am__quote@
@@ -19983,6 +20095,7 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote at test/librbd/$(DEPDIR)/unittest_librbd-test_main.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/librbd/$(DEPDIR)/unittest_librbd-test_mock_ExclusiveLock.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/librbd/$(DEPDIR)/unittest_librbd-test_mock_Journal.Po at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at test/librbd/$(DEPDIR)/unittest_librbd-test_mock_ObjectWatcher.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/librbd/$(DEPDIR)/unittest_librbd-test_mock_fixture.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/librbd/exclusive_lock/$(DEPDIR)/unittest_librbd-test_mock_AcquireRequest.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/librbd/exclusive_lock/$(DEPDIR)/unittest_librbd-test_mock_ReleaseRequest.Po at am__quote@
@@ -19990,6 +20103,7 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote at test/librbd/journal/$(DEPDIR)/librbd_test_la-test_Entries.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/librbd/journal/$(DEPDIR)/librbd_test_la-test_Replay.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/librbd/journal/$(DEPDIR)/unittest_librbd-test_mock_Replay.Po at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at test/librbd/mock/$(DEPDIR)/librbd_test_mock_la-MockImageCtx.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/librbd/object_map/$(DEPDIR)/unittest_librbd-test_mock_InvalidateRequest.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/librbd/object_map/$(DEPDIR)/unittest_librbd-test_mock_LockRequest.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/librbd/object_map/$(DEPDIR)/unittest_librbd-test_mock_RefreshRequest.Po at am__quote@
@@ -20060,6 +20174,7 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote at test/rbd_mirror/$(DEPDIR)/librbd_mirror_test_la-test_PoolWatcher.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/rbd_mirror/$(DEPDIR)/librbd_mirror_test_la-test_fixture.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/rbd_mirror/$(DEPDIR)/unittest_rbd_mirror-test_main.Po at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at test/rbd_mirror/$(DEPDIR)/unittest_rbd_mirror-test_mock_ImageReplayer.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/rbd_mirror/$(DEPDIR)/unittest_rbd_mirror-test_mock_ImageSync.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/rbd_mirror/$(DEPDIR)/unittest_rbd_mirror-test_mock_fixture.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/rbd_mirror/image_sync/$(DEPDIR)/unittest_rbd_mirror-test_mock_ImageCopyRequest.Po at am__quote@
@@ -23982,6 +24097,13 @@ test/librbd/librbd_test_la-test_mirroring.lo: test/librbd/test_mirroring.cc
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(librbd_test_la_CXXFLAGS) $(CXXFLAGS) -c -o test/librbd/librbd_test_la-test_mirroring.lo `test -f 'test/librbd/test_mirroring.cc' || echo '$(srcdir)/'`test/librbd/test_mirroring.cc
 
+test/librbd/librbd_test_la-test_MirroringWatcher.lo: test/librbd/test_MirroringWatcher.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(librbd_test_la_CXXFLAGS) $(CXXFLAGS) -MT test/librbd/librbd_test_la-test_MirroringWatcher.lo -MD -MP -MF test/librbd/$(DEPDIR)/librbd_test_la-test_MirroringWatcher.Tpo -c -o test/librbd/librbd_test_la-test_MirroringWatcher.lo `test -f 'test/librbd/test_MirroringWatcher.cc' || echo '$(srcdir)/'`test/l [...]
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/librbd/$(DEPDIR)/librbd_test_la-test_MirroringWatcher.Tpo test/librbd/$(DEPDIR)/librbd_test_la-test_MirroringWatcher.Plo
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/librbd/test_MirroringWatcher.cc' object='test/librbd/librbd_test_la-test_MirroringWatcher.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(librbd_test_la_CXXFLAGS) $(CXXFLAGS) -c -o test/librbd/librbd_test_la-test_MirroringWatcher.lo `test -f 'test/librbd/test_MirroringWatcher.cc' || echo '$(srcdir)/'`test/librbd/test_MirroringWatcher.cc
+
 test/librbd/librbd_test_la-test_ObjectMap.lo: test/librbd/test_ObjectMap.cc
 @am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(librbd_test_la_CXXFLAGS) $(CXXFLAGS) -MT test/librbd/librbd_test_la-test_ObjectMap.lo -MD -MP -MF test/librbd/$(DEPDIR)/librbd_test_la-test_ObjectMap.Tpo -c -o test/librbd/librbd_test_la-test_ObjectMap.lo `test -f 'test/librbd/test_ObjectMap.cc' || echo '$(srcdir)/'`test/librbd/test_ObjectMap.cc
 @am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/librbd/$(DEPDIR)/librbd_test_la-test_ObjectMap.Tpo test/librbd/$(DEPDIR)/librbd_test_la-test_ObjectMap.Plo
@@ -24003,6 +24125,13 @@ test/librbd/journal/librbd_test_la-test_Replay.lo: test/librbd/journal/test_Repl
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(librbd_test_la_CXXFLAGS) $(CXXFLAGS) -c -o test/librbd/journal/librbd_test_la-test_Replay.lo `test -f 'test/librbd/journal/test_Replay.cc' || echo '$(srcdir)/'`test/librbd/journal/test_Replay.cc
 
+test/librbd/mock/librbd_test_mock_la-MockImageCtx.lo: test/librbd/mock/MockImageCtx.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(librbd_test_mock_la_CXXFLAGS) $(CXXFLAGS) -MT test/librbd/mock/librbd_test_mock_la-MockImageCtx.lo -MD -MP -MF test/librbd/mock/$(DEPDIR)/librbd_test_mock_la-MockImageCtx.Tpo -c -o test/librbd/mock/librbd_test_mock_la-MockImageCtx.lo `test -f 'test/librbd/mock/MockImageCtx.cc' || echo '$(srcdir)/'`te [...]
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/librbd/mock/$(DEPDIR)/librbd_test_mock_la-MockImageCtx.Tpo test/librbd/mock/$(DEPDIR)/librbd_test_mock_la-MockImageCtx.Plo
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/librbd/mock/MockImageCtx.cc' object='test/librbd/mock/librbd_test_mock_la-MockImageCtx.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(librbd_test_mock_la_CXXFLAGS) $(CXXFLAGS) -c -o test/librbd/mock/librbd_test_mock_la-MockImageCtx.lo `test -f 'test/librbd/mock/MockImageCtx.cc' || echo '$(srcdir)/'`test/librbd/mock/MockImageCtx.cc
+
 rgw/librgw_la-rgw_acl.lo: rgw/rgw_acl.cc
 @am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(librgw_la_CXXFLAGS) $(CXXFLAGS) -MT rgw/librgw_la-rgw_acl.lo -MD -MP -MF rgw/$(DEPDIR)/librgw_la-rgw_acl.Tpo -c -o rgw/librgw_la-rgw_acl.lo `test -f 'rgw/rgw_acl.cc' || echo '$(srcdir)/'`rgw/rgw_acl.cc
 @am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) rgw/$(DEPDIR)/librgw_la-rgw_acl.Tpo rgw/$(DEPDIR)/librgw_la-rgw_acl.Plo
@@ -28966,6 +29095,20 @@ test/librbd/unittest_librbd-test_mock_Journal.obj: test/librbd/test_mock_Journal
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_librbd_CXXFLAGS) $(CXXFLAGS) -c -o test/librbd/unittest_librbd-test_mock_Journal.obj `if test -f 'test/librbd/test_mock_Journal.cc'; then $(CYGPATH_W) 'test/librbd/test_mock_Journal.cc'; else $(CYGPATH_W) '$(srcdir)/test/librbd/test_mock_Journal.cc'; fi`
 
+test/librbd/unittest_librbd-test_mock_ObjectWatcher.o: test/librbd/test_mock_ObjectWatcher.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_librbd_CXXFLAGS) $(CXXFLAGS) -MT test/librbd/unittest_librbd-test_mock_ObjectWatcher.o -MD -MP -MF test/librbd/$(DEPDIR)/unittest_librbd-test_mock_ObjectWatcher.Tpo -c -o test/librbd/unittest_librbd-test_mock_ObjectWatcher.o `test -f 'test/librbd/test_mock_ObjectWatcher.cc' || echo '$(srcdir)/'`test/librbd/test_mock_ObjectWatcher.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/librbd/$(DEPDIR)/unittest_librbd-test_mock_ObjectWatcher.Tpo test/librbd/$(DEPDIR)/unittest_librbd-test_mock_ObjectWatcher.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/librbd/test_mock_ObjectWatcher.cc' object='test/librbd/unittest_librbd-test_mock_ObjectWatcher.o' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_librbd_CXXFLAGS) $(CXXFLAGS) -c -o test/librbd/unittest_librbd-test_mock_ObjectWatcher.o `test -f 'test/librbd/test_mock_ObjectWatcher.cc' || echo '$(srcdir)/'`test/librbd/test_mock_ObjectWatcher.cc
+
+test/librbd/unittest_librbd-test_mock_ObjectWatcher.obj: test/librbd/test_mock_ObjectWatcher.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_librbd_CXXFLAGS) $(CXXFLAGS) -MT test/librbd/unittest_librbd-test_mock_ObjectWatcher.obj -MD -MP -MF test/librbd/$(DEPDIR)/unittest_librbd-test_mock_ObjectWatcher.Tpo -c -o test/librbd/unittest_librbd-test_mock_ObjectWatcher.obj `if test -f 'test/librbd/test_mock_ObjectWatcher.cc'; then $(CYGPATH_W) 'test/librbd/test_mock_ObjectWatcher.cc'; else $(CYGPATH_W) '$(srcdir)/te [...]
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/librbd/$(DEPDIR)/unittest_librbd-test_mock_ObjectWatcher.Tpo test/librbd/$(DEPDIR)/unittest_librbd-test_mock_ObjectWatcher.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/librbd/test_mock_ObjectWatcher.cc' object='test/librbd/unittest_librbd-test_mock_ObjectWatcher.obj' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_librbd_CXXFLAGS) $(CXXFLAGS) -c -o test/librbd/unittest_librbd-test_mock_ObjectWatcher.obj `if test -f 'test/librbd/test_mock_ObjectWatcher.cc'; then $(CYGPATH_W) 'test/librbd/test_mock_ObjectWatcher.cc'; else $(CYGPATH_W) '$(srcdir)/test/librbd/test_mock_ObjectWatcher.cc'; fi`
+
 test/librbd/exclusive_lock/unittest_librbd-test_mock_AcquireRequest.o: test/librbd/exclusive_lock/test_mock_AcquireRequest.cc
 @am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_librbd_CXXFLAGS) $(CXXFLAGS) -MT test/librbd/exclusive_lock/unittest_librbd-test_mock_AcquireRequest.o -MD -MP -MF test/librbd/exclusive_lock/$(DEPDIR)/unittest_librbd-test_mock_AcquireRequest.Tpo -c -o test/librbd/exclusive_lock/unittest_librbd-test_mock_AcquireRequest.o `test -f 'test/librbd/exclusive_lock/test_mock_AcquireRequest.cc' || echo '$(srcdir)/'`test/librbd/ex [...]
 @am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/librbd/exclusive_lock/$(DEPDIR)/unittest_librbd-test_mock_AcquireRequest.Tpo test/librbd/exclusive_lock/$(DEPDIR)/unittest_librbd-test_mock_AcquireRequest.Po
@@ -29498,6 +29641,20 @@ test/rbd_mirror/unittest_rbd_mirror-test_mock_fixture.obj: test/rbd_mirror/test_
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_rbd_mirror_CXXFLAGS) $(CXXFLAGS) -c -o test/rbd_mirror/unittest_rbd_mirror-test_mock_fixture.obj `if test -f 'test/rbd_mirror/test_mock_fixture.cc'; then $(CYGPATH_W) 'test/rbd_mirror/test_mock_fixture.cc'; else $(CYGPATH_W) '$(srcdir)/test/rbd_mirror/test_mock_fixture.cc'; fi`
 
+test/rbd_mirror/unittest_rbd_mirror-test_mock_ImageReplayer.o: test/rbd_mirror/test_mock_ImageReplayer.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_rbd_mirror_CXXFLAGS) $(CXXFLAGS) -MT test/rbd_mirror/unittest_rbd_mirror-test_mock_ImageReplayer.o -MD -MP -MF test/rbd_mirror/$(DEPDIR)/unittest_rbd_mirror-test_mock_ImageReplayer.Tpo -c -o test/rbd_mirror/unittest_rbd_mirror-test_mock_ImageReplayer.o `test -f 'test/rbd_mirror/test_mock_ImageReplayer.cc' || echo '$(srcdir)/'`test/rbd_mirror/test_mock_ImageReplayer.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/rbd_mirror/$(DEPDIR)/unittest_rbd_mirror-test_mock_ImageReplayer.Tpo test/rbd_mirror/$(DEPDIR)/unittest_rbd_mirror-test_mock_ImageReplayer.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/rbd_mirror/test_mock_ImageReplayer.cc' object='test/rbd_mirror/unittest_rbd_mirror-test_mock_ImageReplayer.o' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_rbd_mirror_CXXFLAGS) $(CXXFLAGS) -c -o test/rbd_mirror/unittest_rbd_mirror-test_mock_ImageReplayer.o `test -f 'test/rbd_mirror/test_mock_ImageReplayer.cc' || echo '$(srcdir)/'`test/rbd_mirror/test_mock_ImageReplayer.cc
+
+test/rbd_mirror/unittest_rbd_mirror-test_mock_ImageReplayer.obj: test/rbd_mirror/test_mock_ImageReplayer.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_rbd_mirror_CXXFLAGS) $(CXXFLAGS) -MT test/rbd_mirror/unittest_rbd_mirror-test_mock_ImageReplayer.obj -MD -MP -MF test/rbd_mirror/$(DEPDIR)/unittest_rbd_mirror-test_mock_ImageReplayer.Tpo -c -o test/rbd_mirror/unittest_rbd_mirror-test_mock_ImageReplayer.obj `if test -f 'test/rbd_mirror/test_mock_ImageReplayer.cc'; then $(CYGPATH_W) 'test/rbd_mirror/test_mock_ImageReplayer. [...]
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/rbd_mirror/$(DEPDIR)/unittest_rbd_mirror-test_mock_ImageReplayer.Tpo test/rbd_mirror/$(DEPDIR)/unittest_rbd_mirror-test_mock_ImageReplayer.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/rbd_mirror/test_mock_ImageReplayer.cc' object='test/rbd_mirror/unittest_rbd_mirror-test_mock_ImageReplayer.obj' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_rbd_mirror_CXXFLAGS) $(CXXFLAGS) -c -o test/rbd_mirror/unittest_rbd_mirror-test_mock_ImageReplayer.obj `if test -f 'test/rbd_mirror/test_mock_ImageReplayer.cc'; then $(CYGPATH_W) 'test/rbd_mirror/test_mock_ImageReplayer.cc'; else $(CYGPATH_W) '$(srcdir)/test/rbd_mirror/test_mock_ImageReplayer.cc'; fi`
+
 test/rbd_mirror/unittest_rbd_mirror-test_mock_ImageSync.o: test/rbd_mirror/test_mock_ImageSync.cc
 @am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_rbd_mirror_CXXFLAGS) $(CXXFLAGS) -MT test/rbd_mirror/unittest_rbd_mirror-test_mock_ImageSync.o -MD -MP -MF test/rbd_mirror/$(DEPDIR)/unittest_rbd_mirror-test_mock_ImageSync.Tpo -c -o test/rbd_mirror/unittest_rbd_mirror-test_mock_ImageSync.o `test -f 'test/rbd_mirror/test_mock_ImageSync.cc' || echo '$(srcdir)/'`test/rbd_mirror/test_mock_ImageSync.cc
 @am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/rbd_mirror/$(DEPDIR)/unittest_rbd_mirror-test_mock_ImageSync.Tpo test/rbd_mirror/$(DEPDIR)/unittest_rbd_mirror-test_mock_ImageSync.Po
@@ -30232,6 +30389,7 @@ clean-libtool:
 	-rm -rf librbd/image/.libs librbd/image/_libs
 	-rm -rf librbd/image_watcher/.libs librbd/image_watcher/_libs
 	-rm -rf librbd/journal/.libs librbd/journal/_libs
+	-rm -rf librbd/mirroring_watcher/.libs librbd/mirroring_watcher/_libs
 	-rm -rf librbd/object_map/.libs librbd/object_map/_libs
 	-rm -rf librbd/operation/.libs librbd/operation/_libs
 	-rm -rf log/.libs log/_libs
@@ -30253,6 +30411,7 @@ clean-libtool:
 	-rm -rf test/libradosstriper/.libs test/libradosstriper/_libs
 	-rm -rf test/librbd/.libs test/librbd/_libs
 	-rm -rf test/librbd/journal/.libs test/librbd/journal/_libs
+	-rm -rf test/librbd/mock/.libs test/librbd/mock/_libs
 	-rm -rf test/rbd_mirror/.libs test/rbd_mirror/_libs
 	-rm -rf test/system/.libs test/system/_libs
 	-rm -rf tools/rbd_mirror/.libs tools/rbd_mirror/_libs
@@ -30434,6 +30593,27 @@ uninstall-radosstriper_includeDATA:
 	@list='$(radosstriper_include_DATA)'; test -n "$(radosstriper_includedir)" || list=; \
 	files=`for p in $$list; do echo $$p; done | sed -e 's|^.*/||'`; \
 	dir='$(DESTDIR)$(radosstriper_includedir)'; $(am__uninstall_files_from_dir)
+install-submanDATA: $(subman_DATA)
+	@$(NORMAL_INSTALL)
+	@list='$(subman_DATA)'; test -n "$(submandir)" || list=; \
+	if test -n "$$list"; then \
+	  echo " $(MKDIR_P) '$(DESTDIR)$(submandir)'"; \
+	  $(MKDIR_P) "$(DESTDIR)$(submandir)" || exit 1; \
+	fi; \
+	for p in $$list; do \
+	  if test -f "$$p"; then d=; else d="$(srcdir)/"; fi; \
+	  echo "$$d$$p"; \
+	done | $(am__base_list) | \
+	while read files; do \
+	  echo " $(INSTALL_DATA) $$files '$(DESTDIR)$(submandir)'"; \
+	  $(INSTALL_DATA) $$files "$(DESTDIR)$(submandir)" || exit $$?; \
+	done
+
+uninstall-submanDATA:
+	@$(NORMAL_UNINSTALL)
+	@list='$(subman_DATA)'; test -n "$(submandir)" || list=; \
+	files=`for p in $$list; do echo $$p; done | sed -e 's|^.*/||'`; \
+	dir='$(DESTDIR)$(submandir)'; $(am__uninstall_files_from_dir)
 
 # This directory's subdirectories are mostly independent; you can cd
 # into them and run 'make' without going through this Makefile.
@@ -31543,6 +31723,13 @@ test/mon/mon-scrub.sh.log: test/mon/mon-scrub.sh
 	--log-file $$b.log --trs-file $$b.trs \
 	$(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \
 	"$$tst" $(AM_TESTS_FD_REDIRECT)
+test/mon/test_pool_quota.sh.log: test/mon/test_pool_quota.sh
+	@p='test/mon/test_pool_quota.sh'; \
+	b='test/mon/test_pool_quota.sh'; \
+	$(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \
+	--log-file $$b.log --trs-file $$b.trs \
+	$(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \
+	"$$tst" $(AM_TESTS_FD_REDIRECT)
 test/osd/osd-scrub-repair.sh.log: test/osd/osd-scrub-repair.sh
 	@p='test/osd/osd-scrub-repair.sh'; \
 	b='test/osd/osd-scrub-repair.sh'; \
@@ -31739,7 +31926,7 @@ install-binPROGRAMS: install-libLTLIBRARIES
 
 installdirs: installdirs-recursive
 installdirs-am:
-	for dir in "$(DESTDIR)$(compressorlibdir)" "$(DESTDIR)$(erasure_codelibdir)" "$(DESTDIR)$(libdir)" "$(DESTDIR)$(radoslibdir)" "$(DESTDIR)$(bindir)" "$(DESTDIR)$(sbindir)" "$(DESTDIR)$(su_sbindir)" "$(DESTDIR)$(bindir)" "$(DESTDIR)$(ceph_libexecdir)" "$(DESTDIR)$(ceph_monstore_update_crushdir)" "$(DESTDIR)$(ceph_sbindir)" "$(DESTDIR)$(bindir)" "$(DESTDIR)$(sbindir)" "$(DESTDIR)$(su_sbindir)" "$(DESTDIR)$(pythondir)" "$(DESTDIR)$(bash_completiondir)" "$(DESTDIR)$(docdir)" "$(DESTDIR)$(lib [...]
+	for dir in "$(DESTDIR)$(compressorlibdir)" "$(DESTDIR)$(erasure_codelibdir)" "$(DESTDIR)$(libdir)" "$(DESTDIR)$(radoslibdir)" "$(DESTDIR)$(bindir)" "$(DESTDIR)$(sbindir)" "$(DESTDIR)$(su_sbindir)" "$(DESTDIR)$(bindir)" "$(DESTDIR)$(ceph_libexecdir)" "$(DESTDIR)$(ceph_monstore_update_crushdir)" "$(DESTDIR)$(ceph_sbindir)" "$(DESTDIR)$(bindir)" "$(DESTDIR)$(sbindir)" "$(DESTDIR)$(su_sbindir)" "$(DESTDIR)$(pythondir)" "$(DESTDIR)$(bash_completiondir)" "$(DESTDIR)$(docdir)" "$(DESTDIR)$(lib [...]
 	  test -z "$$dir" || $(MKDIR_P) "$$dir"; \
 	done
 install: $(BUILT_SOURCES)
@@ -31869,6 +32056,8 @@ distclean-generic:
 	-rm -f librbd/image_watcher/$(am__dirstamp)
 	-rm -f librbd/journal/$(DEPDIR)/$(am__dirstamp)
 	-rm -f librbd/journal/$(am__dirstamp)
+	-rm -f librbd/mirroring_watcher/$(DEPDIR)/$(am__dirstamp)
+	-rm -f librbd/mirroring_watcher/$(am__dirstamp)
 	-rm -f librbd/object_map/$(DEPDIR)/$(am__dirstamp)
 	-rm -f librbd/object_map/$(am__dirstamp)
 	-rm -f librbd/operation/$(DEPDIR)/$(am__dirstamp)
@@ -31975,6 +32164,8 @@ distclean-generic:
 	-rm -f test/librbd/image/$(am__dirstamp)
 	-rm -f test/librbd/journal/$(DEPDIR)/$(am__dirstamp)
 	-rm -f test/librbd/journal/$(am__dirstamp)
+	-rm -f test/librbd/mock/$(DEPDIR)/$(am__dirstamp)
+	-rm -f test/librbd/mock/$(am__dirstamp)
 	-rm -f test/librbd/object_map/$(DEPDIR)/$(am__dirstamp)
 	-rm -f test/librbd/object_map/$(am__dirstamp)
 	-rm -f test/librbd/operation/$(DEPDIR)/$(am__dirstamp)
@@ -32043,7 +32234,7 @@ clean-am: clean-binPROGRAMS clean-checkPROGRAMS \
 	clean-sbinPROGRAMS clean-su_sbinPROGRAMS mostlyclean-am
 
 distclean: distclean-recursive
-	-rm -rf ./$(DEPDIR) arch/$(DEPDIR) auth/$(DEPDIR) auth/cephx/$(DEPDIR) auth/none/$(DEPDIR) auth/unknown/$(DEPDIR) civetweb/src/$(DEPDIR) client/$(DEPDIR) cls/cephfs/$(DEPDIR) cls/hello/$(DEPDIR) cls/journal/$(DEPDIR) cls/lock/$(DEPDIR) cls/log/$(DEPDIR) cls/numops/$(DEPDIR) cls/rbd/$(DEPDIR) cls/refcount/$(DEPDIR) cls/replica_log/$(DEPDIR) cls/rgw/$(DEPDIR) cls/statelog/$(DEPDIR) cls/timeindex/$(DEPDIR) cls/user/$(DEPDIR) cls/version/$(DEPDIR) common/$(DEPDIR) compressor/$(DEPDIR) compr [...]
+	-rm -rf ./$(DEPDIR) arch/$(DEPDIR) auth/$(DEPDIR) auth/cephx/$(DEPDIR) auth/none/$(DEPDIR) auth/unknown/$(DEPDIR) civetweb/src/$(DEPDIR) client/$(DEPDIR) cls/cephfs/$(DEPDIR) cls/hello/$(DEPDIR) cls/journal/$(DEPDIR) cls/lock/$(DEPDIR) cls/log/$(DEPDIR) cls/numops/$(DEPDIR) cls/rbd/$(DEPDIR) cls/refcount/$(DEPDIR) cls/replica_log/$(DEPDIR) cls/rgw/$(DEPDIR) cls/statelog/$(DEPDIR) cls/timeindex/$(DEPDIR) cls/user/$(DEPDIR) cls/version/$(DEPDIR) common/$(DEPDIR) compressor/$(DEPDIR) compr [...]
 	-rm -f Makefile
 distclean-am: clean-am distclean-compile distclean-generic \
 	distclean-hdr distclean-tags
@@ -32068,7 +32259,8 @@ install-data-am: install-bash_completionDATA \
 	install-libcephfs_includeDATA install-librbd_includeDATA \
 	install-pythonPYTHON install-rados_includeDATA \
 	install-radoslibLTLIBRARIES install-radosstriper_includeDATA \
-	install-su_sbinPROGRAMS install-su_sbinSCRIPTS
+	install-su_sbinPROGRAMS install-su_sbinSCRIPTS \
+	install-submanDATA
 
 install-dvi: install-dvi-recursive
 
@@ -32100,7 +32292,7 @@ install-ps-am:
 installcheck-am:
 
 maintainer-clean: maintainer-clean-recursive
-	-rm -rf ./$(DEPDIR) arch/$(DEPDIR) auth/$(DEPDIR) auth/cephx/$(DEPDIR) auth/none/$(DEPDIR) auth/unknown/$(DEPDIR) civetweb/src/$(DEPDIR) client/$(DEPDIR) cls/cephfs/$(DEPDIR) cls/hello/$(DEPDIR) cls/journal/$(DEPDIR) cls/lock/$(DEPDIR) cls/log/$(DEPDIR) cls/numops/$(DEPDIR) cls/rbd/$(DEPDIR) cls/refcount/$(DEPDIR) cls/replica_log/$(DEPDIR) cls/rgw/$(DEPDIR) cls/statelog/$(DEPDIR) cls/timeindex/$(DEPDIR) cls/user/$(DEPDIR) cls/version/$(DEPDIR) common/$(DEPDIR) compressor/$(DEPDIR) compr [...]
+	-rm -rf ./$(DEPDIR) arch/$(DEPDIR) auth/$(DEPDIR) auth/cephx/$(DEPDIR) auth/none/$(DEPDIR) auth/unknown/$(DEPDIR) civetweb/src/$(DEPDIR) client/$(DEPDIR) cls/cephfs/$(DEPDIR) cls/hello/$(DEPDIR) cls/journal/$(DEPDIR) cls/lock/$(DEPDIR) cls/log/$(DEPDIR) cls/numops/$(DEPDIR) cls/rbd/$(DEPDIR) cls/refcount/$(DEPDIR) cls/replica_log/$(DEPDIR) cls/rgw/$(DEPDIR) cls/statelog/$(DEPDIR) cls/timeindex/$(DEPDIR) cls/user/$(DEPDIR) cls/version/$(DEPDIR) common/$(DEPDIR) compressor/$(DEPDIR) compr [...]
 	-rm -f Makefile
 maintainer-clean-am: distclean-am maintainer-clean-generic
 
@@ -32128,7 +32320,7 @@ uninstall-am: uninstall-bash_completionDATA uninstall-binPROGRAMS \
 	uninstall-rados_includeDATA uninstall-radoslibLTLIBRARIES \
 	uninstall-radosstriper_includeDATA uninstall-sbinPROGRAMS \
 	uninstall-sbinSCRIPTS uninstall-su_sbinPROGRAMS \
-	uninstall-su_sbinSCRIPTS
+	uninstall-su_sbinSCRIPTS uninstall-submanDATA
 
 .MAKE: $(am__recursive_targets) all check check-am install install-am \
 	install-strip
@@ -32160,8 +32352,8 @@ uninstall-am: uninstall-bash_completionDATA uninstall-binPROGRAMS \
 	install-rados_includeDATA install-radoslibLTLIBRARIES \
 	install-radosstriper_includeDATA install-sbinPROGRAMS \
 	install-sbinSCRIPTS install-strip install-su_sbinPROGRAMS \
-	install-su_sbinSCRIPTS installcheck installcheck-am \
-	installdirs installdirs-am maintainer-clean \
+	install-su_sbinSCRIPTS install-submanDATA installcheck \
+	installcheck-am installdirs installdirs-am maintainer-clean \
 	maintainer-clean-generic mostlyclean mostlyclean-compile \
 	mostlyclean-generic mostlyclean-libtool pdf pdf-am ps ps-am \
 	recheck tags tags-am uninstall uninstall-am \
@@ -32176,7 +32368,7 @@ uninstall-am: uninstall-bash_completionDATA uninstall-binPROGRAMS \
 	uninstall-rados_includeDATA uninstall-radoslibLTLIBRARIES \
 	uninstall-radosstriper_includeDATA uninstall-sbinPROGRAMS \
 	uninstall-sbinSCRIPTS uninstall-su_sbinPROGRAMS \
-	uninstall-su_sbinSCRIPTS
+	uninstall-su_sbinSCRIPTS uninstall-submanDATA
 
 
 # display the output of failed check_SCRIPTS after a failed make check
@@ -32194,9 +32386,13 @@ export PYTHONPATH=$(top_srcdir)/src/pybind
 @SOLARIS_TRUE@	AM_COMMON_CFLAGS += -Wno-unused-local-typedefs
 @CLANG_FALSE@	AM_CXXFLAGS += -Wstrict-null-sentinel
 
+# put virtualenvs in this directory
+# otherwise it may overflow #! 80 kernel limit
+export CEPH_BUILD_VIRTUALENV = /tmp
+
 @NO_GIT_VERSION_TRUE@export NO_VERSION="yes"
 
-export CEPH_DETECT_INIT_VIRTUALENV = ${CEPH_BUILD_VIRTUALENV}ceph-detect-init-virtualenv
+export CEPH_DETECT_INIT_VIRTUALENV = ${CEPH_BUILD_VIRTUALENV}/ceph-detect-init-virtualenv
 
 ceph-detect-init-all: ${CEPH_DETECT_INIT_VIRTUALENV}
 
@@ -32218,7 +32414,7 @@ ceph-detect-init-install-data:
 	fi ; \
 	python setup.py install $$root $$options
 
-export CEPH_DISK_VIRTUALENV = ${CEPH_BUILD_VIRTUALENV}ceph-disk-virtualenv
+export CEPH_DISK_VIRTUALENV = ${CEPH_BUILD_VIRTUALENV}/ceph-disk-virtualenv
 
 ceph-disk-all: ${CEPH_DISK_VIRTUALENV}
 
diff --git a/src/bash_completion/ceph b/src/bash_completion/ceph
index eef885d..beec700 100644
--- a/src/bash_completion/ceph
+++ b/src/bash_completion/ceph
@@ -1,73 +1,50 @@
 #
 # Ceph - scalable distributed file system
 #
-# Copyright (C) 2011 Wido den Hollander <wido at widodh.nl>
-#
 # This is free software; you can redistribute it and/or
 # modify it under the terms of the GNU Lesser General Public
 # License version 2.1, as published by the Free Software
-# Foundation.  See file COPYING.
+# Foundation.
 #
 
 _ceph()
 {
-        local cur prev
-
-        COMPREPLY=()
-        cur="${COMP_WORDS[COMP_CWORD]}"
-        prev="${COMP_WORDS[COMP_CWORD-1]}"
-        prevprev="${COMP_WORDS[COMP_CWORD-2]}"
+    local options_noarg="-h --help -s --status -w --watch --watch-debug --watch-info --watch-sec --watch-warn --watch-error --version -v --verbose --concise"
+    local options_arg="-c --conf -i --in-file -o --out-file --id --user -n --name --cluster --admin-daemon --admin-socket -f --format --connect-timeout"
+    local cnt=${#COMP_WORDS[@]}
+    local cur=${COMP_WORDS[COMP_CWORD]}
+    local prev=${COMP_WORDS[COMP_CWORD-1]}
 
-        if [[ ${cur} == -* ]] ; then
-            COMPREPLY=( $(compgen -W "--conf -c --name --id -m --version -s --status -w --watch -o --out-file -i --in-file" -- ${cur}) )
-            return 0
-        fi
+    if [[ " -c --conf -i --in-file -o --out-file " =~ " ${prev} " ]]
+    then
+	#default autocomplete for options (file autocomplete)
+	compopt -o default
+	COMPREPLY=()
+	return 0
+    fi
+    if [[ "${cur:0:1}" == "-" ]] ;
+    then
+	COMPREPLY=( $(compgen -W "${options_noarg} ${options_arg}" -- $cur) )
+	return 0
+    fi
+    # Indexed, not associative: bash does not guarantee the expansion order
+    # of "${hint_args[@]}" for declare -A, and the word order matters here.
+    local -a hint_args=()
+    local i
+    for (( i=1 ; i<cnt ; i++ )); do
+	# skip this word if it is an option that takes no argument
+	if [[ " ${options_noarg} " =~ " ${COMP_WORDS[i]} " ]]; then
+	    continue
+	fi
+	# skip this word and the next one if it is an option with an argument
+	if [[ " ${options_arg} " =~ " ${COMP_WORDS[i]} " ]]; then
+	    ((i++))
+	    continue
+	fi
+	hint_args[$((i-1))]="${COMP_WORDS[i]}"
+    done
 
-        case "${prev}" in
-            -o | --out-file | -i | --in-file | --conf | -c)
-                COMPREPLY=( $(compgen -f ${cur}) )
-                return 0
-                ;;
-            -m)
-                COMPREPLY=( $(compgen -A hostname ${cur}) )
-                return 0
-                ;;
-            auth)
-                COMPREPLY=( $(compgen -W "list add del print_key print-key export get get-key import get-or-create get-or-create-key" -- ${cur}) )
-                return 0
-                ;;
-            pg)
-                COMPREPLY=( $(compgen -W "stat dump dump_json dump_stuck force_create_pg getmap map send_pg_creates scrub deep-scrub repair" -- ${cur}) )
-                return 0
-                ;;
-            osd)
-                COMPREPLY=( $(compgen -W "stat pool dump getmaxosd tree getmap getcrushmap lspools reweight-by-utilization trash tier" -- ${cur}) )
-                return 0
-                ;;
-            mon)
-                COMPREPLY=( $(compgen -W "stat getmap add remove dump" -- ${cur}) )
-                return 0
-                ;;
-            mds)
-                COMPREPLY=( $(compgen -W "stat stat getmap dump compat" -- ${cur}) )
-                return 0
-                ;;
-            pool)
-                COMPREPLY=( $(compgen -W "create delete rename stats set set-quota get rmsnap mksnap" -- ${cur}) )
-                return 0
-                ;;
-            health)
-                COMPREPLY=( $(compgen -W "detail" -- ${cur}) )
-                return 0
-                ;;
-            tier)
-                COMPREPLY=( $(compgen -W "remove cache-mode" -- ${cur}) )
-                return 0
-                ;;
-            ceph)
-                COMPREPLY=( $(compgen -W "osd mon mds pg auth health df" -- ${cur}) )
-                return 0
-                ;;
-        esac
+    local IFS=$'\n'
+    COMPREPLY=( $(${COMP_WORDS[0]} --completion "${hint_args[@]}" 2>/dev/null) )
 }
 complete -F _ceph ceph
diff --git a/src/ceph-detect-init/Makefile.am b/src/ceph-detect-init/Makefile.am
index 0d199e7..8ddcb1e 100644
--- a/src/ceph-detect-init/Makefile.am
+++ b/src/ceph-detect-init/Makefile.am
@@ -53,7 +53,7 @@ EXTRA_DIST += \
 	ceph-detect-init/tests/test_all.py \
 	ceph-detect-init/tox.ini
 
-export CEPH_DETECT_INIT_VIRTUALENV = ${CEPH_BUILD_VIRTUALENV}ceph-detect-init-virtualenv
+export CEPH_DETECT_INIT_VIRTUALENV = ${CEPH_BUILD_VIRTUALENV}/ceph-detect-init-virtualenv
 
 ceph-detect-init-all: ${CEPH_DETECT_INIT_VIRTUALENV}
 
diff --git a/src/ceph-detect-init/ceph_detect_init/debian/__init__.py b/src/ceph-detect-init/ceph_detect_init/debian/__init__.py
new file mode 100644
index 0000000..73a7851
--- /dev/null
+++ b/src/ceph-detect-init/ceph_detect_init/debian/__init__.py
@@ -0,0 +1,21 @@
+distro = None
+release = None
+codename = None
+
+
+def choose_init():
+    """Select an init system
+
+    Returns the init system name (upstart, sysvinit ...); None if no match.
+    """
+    assert(distro and codename)
+    if distro.lower() in ('ubuntu', 'linuxmint'):
+        if codename >= 'vivid':  # NOTE(review): lexicographic str compare
+            return 'systemd'
+        else:
+            return 'upstart'
+    if distro.lower() == 'debian':
+        if codename in ('squeeze', 'wheezy'):
+            return 'sysvinit'
+        else:
+            return 'systemd'
diff --git a/src/ceph-detect-init/run-tox.sh b/src/ceph-detect-init/run-tox.sh
index 1333264..675b593 100755
--- a/src/ceph-detect-init/run-tox.sh
+++ b/src/ceph-detect-init/run-tox.sh
@@ -17,6 +17,11 @@
 # GNU Library Public License for more details.
 #
 
+if [ x"`uname`"x = xFreeBSDx ]; then
+    echo FreeBSD init system has not been integrated.
+    exit 0
+fi
+
 # run from the ceph-detect-init directory or from its parent
 : ${CEPH_DETECT_INIT_VIRTUALENV:=ceph-detect-init-virtualenv}
 test -d ceph-detect-init && cd ceph-detect-init
diff --git a/src/ceph-disk/Makefile.am b/src/ceph-disk/Makefile.am
index 952a486..9006303 100644
--- a/src/ceph-disk/Makefile.am
+++ b/src/ceph-disk/Makefile.am
@@ -29,7 +29,7 @@ EXTRA_DIST += \
 	ceph-disk/tests/test_main.py \
 	ceph-disk/tox.ini
 
-export CEPH_DISK_VIRTUALENV = ${CEPH_BUILD_VIRTUALENV}ceph-disk-virtualenv
+export CEPH_DISK_VIRTUALENV = ${CEPH_BUILD_VIRTUALENV}/ceph-disk-virtualenv
 
 ceph-disk-all: ${CEPH_DISK_VIRTUALENV}
 
diff --git a/src/ceph.in b/src/ceph.in
index 79c2263..ecc3eca 100755
--- a/src/ceph.in
+++ b/src/ceph.in
@@ -72,41 +72,45 @@ def get_pythonlib_dir():
     f = "lib.{platform}-{version[0]}.{version[1]}"
     name = f.format(platform=sysconfig.get_platform(),
                     version=sys.version_info)
-    return os.path.join('build', name)
+    return name
 
 if MYDIR.endswith('src') and \
    os.path.exists(os.path.join(MYDIR, '.libs')) and \
    os.path.exists(os.path.join(MYDIR, 'pybind')) and \
    os.path.exists(os.path.join(MYDIR, 'build')):
 
-    pythonlib_path = get_pythonlib_dir()
-    respawn_in_path(os.path.join(MYDIR, '.libs'), "pybind", pythonlib_path)
+    python_libpath = os.path.join(MYDIR, 'build', get_pythonlib_dir())
+    respawn_in_path(os.path.join(MYDIR, '.libs'), 'pybind', python_libpath)
     if os.environ.has_key('PATH') and MYDIR not in os.environ['PATH']:
         os.environ['PATH'] += ':' + MYDIR
 
 elif os.path.exists(os.path.join(os.getcwd(), "CMakeCache.txt")) \
-     and os.path.exists(os.path.join(os.getcwd(), "init-ceph")):
+     and os.path.exists(os.path.join(os.getcwd(), "bin/init-ceph")):
     src_path = None
     for l in open("./CMakeCache.txt").readlines():
         if l.startswith("Ceph_SOURCE_DIR:STATIC="):
             src_path = l.split("=")[1].strip()
 
+
     if src_path is None:
         # Huh, maybe we're not really in a cmake environment?
         pass
     else:
         # Developer mode, but in a cmake build dir instead of the src dir
-        lib_path = os.path.join(os.getcwd(), "src")
+        lib_path = os.path.join(os.getcwd(), "lib")
+        bin_path = os.path.join(os.getcwd(), "bin")
         pybind_path = os.path.join(src_path, "src", "pybind")
-        pythonlib_path = os.path.join(src_path, "src", get_pythonlib_dir())
-        respawn_in_path(lib_path, pybind_path, pythonlib_path)
 
-        sys.path.insert(0, os.path.join(MYDIR, pybind_path))
-        sys.path.insert(0, os.path.join(MYDIR, pythonlib_path))
+        import sysconfig
+        f = "lib.{platform}-{version[0]}.{version[1]}"
+        name = f.format(platform=sysconfig.get_platform(),
+                        version=sys.version_info)
+        pythonlib_path = os.path.join(os.getcwd(), "lib/cython_modules", name)
+
+        respawn_in_path(lib_path, pybind_path, pythonlib_path)
 
-    # Add src/ to path for e.g. ceph-conf
-    if os.environ.has_key('PATH') and lib_path not in os.environ['PATH']:
-        os.environ['PATH'] += ':' + lib_path
+        if os.environ.has_key('PATH') and bin_path not in os.environ['PATH']:
+            os.environ['PATH'] += ':' + bin_path
 
 import argparse
 import errno
@@ -474,6 +478,7 @@ def new_style_command(parsed_args, cmdargs, target, sigdict, inbuf, verbose):
     return json_command(cluster_handle, target=target, argdict=valid_dict,
                         inbuf=inbuf)
 
+
 def complete(sigdict, args, target):
     """
     Command completion.  Match as much of [args] as possible,
@@ -491,46 +496,38 @@ def complete(sigdict, args, target):
         args = args[2:]
     # look for best match, accumulate possibles in bestcmds
     # (so we can maybe give a more-useful error message)
-    best_match_cnt = 0
-    bestcmds = []
+
+    match_count = 0
+    comps = []
     for cmdtag, cmd in sigdict.iteritems():
         sig = cmd['sig']
-        matched = matchnum(args, sig, partial=True)
-        if (matched > best_match_cnt):
-            if complete_verbose:
-                print("better match: {0} > {1}: {2}:{3} ".format(matched,
-                        best_match_cnt, cmdtag, concise_sig(sig)),  file=sys.stderr)
-            best_match_cnt = matched
-            bestcmds = [{cmdtag:cmd}]
-        elif matched == best_match_cnt:
-            if complete_verbose:
-                print("equal match: {0} > {1}: {2}:{3} ".format(matched,
-                        best_match_cnt, cmdtag, concise_sig(sig)), file=sys.stderr)
-            bestcmds.append({cmdtag:cmd})
-
-    # look through all matching sigs
-    comps = []
-    for cmddict in bestcmds:
-        for cmd in cmddict.itervalues():
-            sig = cmd['sig']
-            # either:
-            #   we match everything fully, so we want the next desc, or
-            #   we match more partially, so we want the partial match
-            fullindex = matchnum(args, sig, partial=False) - 1
-            partindex = matchnum(args, sig, partial=True) - 1
-            if complete_verbose:
-                print('{}: f {} p {} len {}'.format(sig, fullindex, partindex, len(sig)), file=sys.stderr)
-            if fullindex == partindex and fullindex + 1 < len(sig):
-                d = sig[fullindex + 1]
-            else:
-                d = sig[partindex]
-            comps.append(str(d))
-    if complete_verbose:
-        print('\n'.join(comps), file=sys.stderr)
-    print('\n'.join(comps))
+        j = 0
+        # iterate over all arguments, except last one
+        for arg in args[0:-1]:
+            if j > len(sig)-1:
+                # more arguments than the signature defines
+                break
+            found_match = arg in sig[j].complete(arg)
+            if not found_match and sig[j].req:
+                # no elements that match
+                break
+            if not sig[j].N:
+                j += 1
+        else:
+            # successfully matched all - except last one - arguments
+            if j < len(sig) and len(args) > 0:
+                comps += sig[j].complete(args[-1])
+
+            match_count += 1
+            match_cmd = cmd
 
+    if match_count == 1 and len(comps) == 0:
+        # only one command matched and no hints yet => add help
+        comps = comps + [' ', '#'+match_cmd['help']]
+    print('\n'.join(sorted(set(comps))))
     return 0
 
+
 ###
 # ping a monitor
 ###
@@ -712,9 +709,6 @@ def main():
             file=sys.stderr)
         return 1
 
-    if childargs in [['mon'], ['osd']]:
-        parsed_args.help = True
-
     if parsed_args.help:
         # short default timeout for -h
         if not timeout:
@@ -727,7 +721,9 @@ def main():
         if len(childargs) < 2:
             print('"ping" requires a monitor name as argument: "ping mon.<id>"', file=sys.stderr)
             return 1
-
+    if parsed_args.completion:
+        #for completion let timeout be really small
+        timeout = 3
     try:
         if childargs and childargs[0] == 'ping':
             return ping_monitor(cluster_handle, childargs[1], timeout)
diff --git a/src/client/Client.cc b/src/client/Client.cc
index 7530e3d..177a61e 100644
--- a/src/client/Client.cc
+++ b/src/client/Client.cc
@@ -129,6 +129,8 @@ using namespace std;
 #define O_DIRECT 0x0
 #endif
 
+#define DEBUG_GETATTR_CAPS (CEPH_CAP_XATTR_SHARED)
+
 void client_flush_set_callback(void *p, ObjectCacher::ObjectSet *oset)
 {
   Client *client = static_cast<Client*>(p);
@@ -1252,6 +1254,18 @@ Inode* Client::insert_trace(MetaRequest *request, MetaSession *session)
   Inode *in = 0;
   if (reply->head.is_target) {
     ist.decode(p, features);
+    if (cct->_conf->client_debug_getattr_caps) {  // debug-only sanity check
+      int op = request->get_op();
+      unsigned wanted = 0;  // mask the request sent to the MDS
+      if (op == CEPH_MDS_OP_GETATTR || op == CEPH_MDS_OP_LOOKUP)
+	wanted = request->head.args.getattr.mask;
+      else if (op == CEPH_MDS_OP_OPEN || op == CEPH_MDS_OP_CREATE)
+	wanted = request->head.args.open.mask;
+      // xattrs were requested, so the decoded ist must carry them
+      if ((wanted & CEPH_CAP_XATTR_SHARED) &&
+	  !(ist.xattr_version > 0 && ist.xattrbl.length() > 0))
+	  assert(0 == "MDS reply does not contain xattrs");
+    }
 
     in = add_update_inode(&ist, request->sent_stamp, session);
   }
@@ -2438,7 +2452,7 @@ void Client::handle_fs_map(MFSMap *m)
 {
   delete fsmap;
   fsmap = new FSMap;
-  fsmap->decode(m->get_encoded());
+  *fsmap = m->get_fsmap();
   m->put();
 
   signal_cond_list(waiting_for_fsmap);
@@ -5740,7 +5754,11 @@ int Client::_do_lookup(Inode *dir, const string& name, InodeRef *target,
   path.push_dentry(name);
   req->set_filepath(path);
   req->set_inode(dir);
-  req->head.args.getattr.mask = 0;
+  if (cct->_conf->client_debug_getattr_caps && op == CEPH_MDS_OP_LOOKUP)
+      req->head.args.getattr.mask = DEBUG_GETATTR_CAPS;
+  else
+      req->head.args.getattr.mask = 0;
+
   ldout(cct, 10) << "_do_lookup on " << path << dendl;
 
   int r = make_request(req, uid, gid, target);
@@ -7703,6 +7721,10 @@ int Client::_open(Inode *in, int flags, mode_t mode, Fh **fhp, int uid, int gid)
     req->head.args.open.flags = flags & ~O_CREAT;
     req->head.args.open.mode = mode;
     req->head.args.open.pool = -1;
+    if (cct->_conf->client_debug_getattr_caps)
+      req->head.args.open.mask = DEBUG_GETATTR_CAPS;
+    else
+      req->head.args.open.mask = 0;
     req->head.args.open.old_size = in->size;   // for O_TRUNC
     req->set_inode(in);
     result = make_request(req, uid, gid);
@@ -7966,7 +7988,7 @@ retry:
   }
 
   if (!conf->client_debug_force_sync_read &&
-      (cct->_conf->client_oc && (have & CEPH_CAP_FILE_CACHE))) {
+      (conf->client_oc && (have & CEPH_CAP_FILE_CACHE))) {
 
     if (f->flags & O_RSYNC) {
       _flush_range(in, offset, size);
@@ -8666,7 +8688,7 @@ int Client::fstat(int fd, struct stat *stbuf)
 
 // not written yet, but i want to link!
 
-int Client::chdir(const char *relpath)
+int Client::chdir(const char *relpath, std::string &new_cwd)
 {
   Mutex::Locker lock(client_lock);
   tout(cct) << "chdir" << std::endl;
@@ -8679,6 +8701,8 @@ int Client::chdir(const char *relpath)
   if (cwd != in)
     cwd.swap(in);
   ldout(cct, 3) << "chdir(" << relpath << ")  cwd now " << cwd->ino << dendl;
+
+  getcwd(new_cwd);
   return 0;
 }
 
@@ -8690,7 +8714,15 @@ void Client::getcwd(string& dir)
   Inode *in = cwd.get();
   while (in != root) {
     assert(in->dn_set.size() < 2); // dirs can't be hard-linked
+
+    // The cwd or one of its ancestors has been unlinked
+    if (in->dn_set.empty()) {
+      return;
+    }
+
     Dentry *dn = in->get_first_parent();
+
+
     if (!dn) {
       // look it up
       ldout(cct, 10) << "getcwd looking up parent for " << *in << dendl;
@@ -10392,6 +10424,10 @@ int Client::_create(Inode *dir, const char *name, int flags, mode_t mode,
   req->head.args.open.stripe_unit = stripe_unit;
   req->head.args.open.stripe_count = stripe_count;
   req->head.args.open.object_size = object_size;
+  if (cct->_conf->client_debug_getattr_caps)
+    req->head.args.open.mask = DEBUG_GETATTR_CAPS;
+  else
+    req->head.args.open.mask = 0;
   req->head.args.open.pool = pool_id;
   req->dentry_drop = CEPH_CAP_FILE_SHARED;
   req->dentry_unless = CEPH_CAP_FILE_EXCL;
diff --git a/src/client/Client.h b/src/client/Client.h
index 67b5c5e..d53ca1d 100644
--- a/src/client/Client.h
+++ b/src/client/Client.h
@@ -892,7 +892,7 @@ public:
   int statfs(const char *path, struct statvfs *stbuf);
 
   // crap
-  int chdir(const char *s);
+  int chdir(const char *s, std::string &new_cwd);
   void getcwd(std::string& cwd);
 
   // namespace ops
diff --git a/src/client/SyntheticClient.cc b/src/client/SyntheticClient.cc
index 78fd7ce..b2ef93a 100644
--- a/src/client/SyntheticClient.cc
+++ b/src/client/SyntheticClient.cc
@@ -1206,7 +1206,10 @@ int SyntheticClient::play_trace(Trace& t, string& prefix, bool metadata_only)
       client->fsync(fd, b);
     } else if (strcmp(op, "chdir") == 0) {
       const char *a = t.get_string(buf, p);
-      client->chdir(a);
+      // Client users should remember their path, but since this
+      // is just a synthetic client we ignore it.
+      std::string ignore;
+      client->chdir(a, ignore);
     } else if (strcmp(op, "statfs") == 0) {
       struct statvfs stbuf;
       client->statfs("/", &stbuf);
diff --git a/src/cls/journal/cls_journal.cc b/src/cls/journal/cls_journal.cc
index 472b100..6e7d9d5 100644
--- a/src/cls/journal/cls_journal.cc
+++ b/src/cls/journal/cls_journal.cc
@@ -250,9 +250,11 @@ int journal_create(cls_method_context_t hctx, bufferlist *in, bufferlist *out) {
 
   bufferlist stored_orderbl;
   int r = cls_cxx_map_get_val(hctx, HEADER_KEY_ORDER, &stored_orderbl);
-  if (r != -ENOENT) {
+  if (r >= 0) {
     CLS_ERR("journal already exists");
     return -EEXIST;
+  } else if (r != -ENOENT) {
+    return r;
   }
 
   r = write_key(hctx, HEADER_KEY_ORDER, order);
@@ -547,12 +549,20 @@ int journal_client_register(cls_method_context_t hctx, bufferlist *in,
     return -EINVAL;
   }
 
+  uint8_t order;
+  int r = read_key(hctx, HEADER_KEY_ORDER, &order);
+  if (r < 0) {
+    return r;
+  }
+
   std::string key(key_from_client_id(id));
   bufferlist stored_clientbl;
-  int r = cls_cxx_map_get_val(hctx, key, &stored_clientbl);
-  if (r != -ENOENT) {
+  r = cls_cxx_map_get_val(hctx, key, &stored_clientbl);
+  if (r >= 0) {
     CLS_ERR("duplicate client id: %s", id.c_str());
     return -EEXIST;
+  } else if (r != -ENOENT) {
+    return r;
   }
 
   cls::journal::Client client(id, data);
@@ -860,9 +870,11 @@ int journal_tag_create(cls_method_context_t hctx, bufferlist *in,
   std::string key(key_from_tag_tid(tag_tid));
   bufferlist stored_tag_bl;
   int r = cls_cxx_map_get_val(hctx, key, &stored_tag_bl);
-  if (r != -ENOENT) {
+  if (r >= 0) {
     CLS_ERR("duplicate tag id: %" PRIu64, tag_tid);
     return -EEXIST;
+  } else if (r != -ENOENT) {
+    return r;
   }
 
   // verify tag tid ordering
diff --git a/src/cls/rbd/cls_rbd.cc b/src/cls/rbd/cls_rbd.cc
index 036ad8a..ad7d6b1 100644
--- a/src/cls/rbd/cls_rbd.cc
+++ b/src/cls/rbd/cls_rbd.cc
@@ -121,6 +121,7 @@ cls_method_handle_t h_mirror_peer_remove;
 cls_method_handle_t h_mirror_peer_set_client;
 cls_method_handle_t h_mirror_peer_set_cluster;
 cls_method_handle_t h_mirror_image_list;
+cls_method_handle_t h_mirror_image_get_image_id;
 cls_method_handle_t h_mirror_image_get;
 cls_method_handle_t h_mirror_image_set;
 cls_method_handle_t h_mirror_image_remove;
@@ -2331,7 +2332,8 @@ int object_map_update(cls_method_context_t hctx, bufferlist *in, bufferlist *out
 
   BitVector<2> object_map;
   bufferlist header_bl;
-  r = cls_cxx_read(hctx, 0, object_map.get_header_length(), &header_bl);
+  r = cls_cxx_read2(hctx, 0, object_map.get_header_length(), &header_bl,
+                    CEPH_OSD_OP_FLAG_FADVISE_WILLNEED);
   if (r < 0) {
     CLS_ERR("object map header read failed");
     return r;
@@ -2346,8 +2348,9 @@ int object_map_update(cls_method_context_t hctx, bufferlist *in, bufferlist *out
   }
 
   bufferlist footer_bl;
-  r = cls_cxx_read(hctx, object_map.get_footer_offset(),
-		   size - object_map.get_footer_offset(), &footer_bl);
+  r = cls_cxx_read2(hctx, object_map.get_footer_offset(),
+		    size - object_map.get_footer_offset(), &footer_bl,
+                    CEPH_OSD_OP_FLAG_FADVISE_WILLNEED);
   if (r < 0) {
     CLS_ERR("object map footer read failed");
     return r;
@@ -2371,8 +2374,8 @@ int object_map_update(cls_method_context_t hctx, bufferlist *in, bufferlist *out
 			      &byte_offset, &byte_length);
 
   bufferlist data_bl;
-  r = cls_cxx_read(hctx, object_map.get_header_length() + byte_offset,
-		   byte_length, &data_bl); 
+  r = cls_cxx_read2(hctx, object_map.get_header_length() + byte_offset,
+		    byte_length, &data_bl, CEPH_OSD_OP_FLAG_FADVISE_WILLNEED);
   if (r < 0) {
     CLS_ERR("object map data read failed");
     return r;
@@ -2385,7 +2388,7 @@ int object_map_update(cls_method_context_t hctx, bufferlist *in, bufferlist *out
     CLS_ERR("failed to decode data chunk [%" PRIu64 "]: %s",
 	    byte_offset, err.what());
     return -EINVAL;
-  } 
+  }
 
   bool updated = false;
   for (uint64_t object_no = start_object_no; object_no < end_object_no;
@@ -2406,21 +2409,22 @@ int object_map_update(cls_method_context_t hctx, bufferlist *in, bufferlist *out
 
     bufferlist data_bl;
     object_map.encode_data(data_bl, byte_offset, byte_length);
-    r = cls_cxx_write(hctx, object_map.get_header_length() + byte_offset,
-		      data_bl.length(), &data_bl);
+    r = cls_cxx_write2(hctx, object_map.get_header_length() + byte_offset,
+		       data_bl.length(), &data_bl,
+                       CEPH_OSD_OP_FLAG_FADVISE_WILLNEED);
     if (r < 0) {
-      CLS_ERR("failed to write object map header: %s", cpp_strerror(r).c_str());  
-      return r;         
+      CLS_ERR("failed to write object map header: %s", cpp_strerror(r).c_str());
+      return r;
     }
-   
+
     footer_bl.clear();
     object_map.encode_footer(footer_bl);
-    r = cls_cxx_write(hctx, object_map.get_footer_offset(), footer_bl.length(),
-		      &footer_bl);
+    r = cls_cxx_write2(hctx, object_map.get_footer_offset(), footer_bl.length(),
+		       &footer_bl, CEPH_OSD_OP_FLAG_FADVISE_WILLNEED);
     if (r < 0) {
-      CLS_ERR("failed to write object map footer: %s", cpp_strerror(r).c_str());  
+      CLS_ERR("failed to write object map footer: %s", cpp_strerror(r).c_str());
       return r;
-    } 
+    }
   } else {
     CLS_LOG(20, "object_map_update: no update necessary");
   }
@@ -2960,6 +2964,7 @@ static const std::string UUID("mirror_uuid");
 static const std::string MODE("mirror_mode");
 static const std::string PEER_KEY_PREFIX("mirror_peer_");
 static const std::string IMAGE_KEY_PREFIX("image_");
+static const std::string GLOBAL_KEY_PREFIX("global_");
 
 std::string peer_key(const std::string &uuid) {
   return PEER_KEY_PREFIX + uuid;
@@ -2969,6 +2974,10 @@ std::string image_key(const string &image_id) {
   return IMAGE_KEY_PREFIX + image_id;
 }
 
+std::string global_key(const string &global_id) {
+  return GLOBAL_KEY_PREFIX + global_id;
+}
+
 int uuid_get(cls_method_context_t hctx, std::string *mirror_uuid) {
   bufferlist mirror_uuid_bl;
   int r = cls_cxx_map_get_val(hctx, mirror::UUID, &mirror_uuid_bl);
@@ -3008,6 +3017,10 @@ int read_peers(cls_method_context_t hctx,
 	return -EIO;
       }
     }
+
+    if (!vals.empty()) {
+      last_read = vals.rbegin()->first;
+    }
   }
   return 0;
 }
@@ -3046,28 +3059,6 @@ int write_peer(cls_method_context_t hctx, const std::string &id,
   return 0;
 }
 
-int image_list_ids(cls_method_context_t hctx, vector<string> *image_ids) {
-  string last_read = IMAGE_KEY_PREFIX;
-  int max_read = RBD_MAX_KEYS_READ;
-  int r = max_read;
-  while (r == max_read) {
-    set<string> keys;
-    r = cls_cxx_map_get_keys(hctx, last_read, max_read, &keys);
-    if (r < 0) {
-      CLS_ERR("error reading mirrored images: %s", cpp_strerror(r).c_str());
-      return r;
-    }
-
-    for (auto &image_key : keys) {
-      if (0 != image_key.compare(0, IMAGE_KEY_PREFIX.size(), IMAGE_KEY_PREFIX)) {
-	return 0;
-      }
-      image_ids->push_back(image_key.substr(IMAGE_KEY_PREFIX.size()));
-    }
-  }
-  return 0;
-}
-
 int image_get(cls_method_context_t hctx, const string &image_id,
 	      cls::rbd::MirrorImage *mirror_image) {
   bufferlist bl;
@@ -3096,19 +3087,28 @@ int image_set(cls_method_context_t hctx, const string &image_id,
   bufferlist bl;
   ::encode(mirror_image, bl);
 
-  // don't overwrite the key if it already exists with a different
-  // global_image_id
   cls::rbd::MirrorImage existing_mirror_image;
   int r = image_get(hctx, image_id, &existing_mirror_image);
-  if (r < 0 && r != -ENOENT) {
+  if (r == -ENOENT) {
+    // make sure global id doesn't already exist
+    std::string global_id_key = global_key(mirror_image.global_image_id);
+    std::string image_id;
+    r = read_key(hctx, global_id_key, &image_id);
+    if (r >= 0) {
+      return -EEXIST;
+    } else if (r != -ENOENT) {
+      CLS_ERR("error reading global image id: '%s': '%s'", image_id.c_str(),
+              cpp_strerror(r).c_str());
+      return r;
+    }
+  } else if (r < 0) {
     CLS_ERR("error reading mirrored image '%s': '%s'", image_id.c_str(),
 	    cpp_strerror(r).c_str());
     return r;
-  }
-
-  if (r != -ENOENT &&
-      existing_mirror_image.global_image_id != mirror_image.global_image_id) {
-    return -EEXIST;
+  } else if (existing_mirror_image.global_image_id !=
+                mirror_image.global_image_id) {
+    // cannot change the global id
+    return -EINVAL;
   }
 
   r = cls_cxx_map_set_val(hctx, image_key(image_id), &bl);
@@ -3117,6 +3117,16 @@ int image_set(cls_method_context_t hctx, const string &image_id,
             cpp_strerror(r).c_str());
     return r;
   }
+
+  bufferlist image_id_bl;
+  ::encode(image_id, image_id_bl);
+  r = cls_cxx_map_set_val(hctx, global_key(mirror_image.global_image_id),
+                          &image_id_bl);
+  if (r < 0) {
+    CLS_ERR("error adding global id for image '%s': %s", image_id.c_str(),
+            cpp_strerror(r).c_str());
+    return r;
+  }
   return 0;
 }
 
@@ -3142,6 +3152,13 @@ int image_remove(cls_method_context_t hctx, const string &image_id) {
             cpp_strerror(r).c_str());
     return r;
   }
+
+  r = cls_cxx_map_remove_key(hctx, global_key(mirror_image.global_image_id));
+  if (r < 0 && r != -ENOENT) {
+    CLS_ERR("error removing global id for image '%s': %s", image_id.c_str(),
+           cpp_strerror(r).c_str());
+    return r;
+  }
   return 0;
 }
 
@@ -3486,21 +3503,96 @@ int mirror_peer_set_cluster(cls_method_context_t hctx, bufferlist *in,
 
 /**
  * Input:
- * none
+ * @param start_after which image id to begin listing after
+ *        (use the empty string to start at the beginning)
+ * @param max_return the maximum number of image ids to list
  *
  * Output:
- * @param std::vector<std::string>: collection of image_ids
+ * @param std::map<std::string, std::string>: local id to global id map
  * @returns 0 on success, negative error code on failure
  */
 int mirror_image_list(cls_method_context_t hctx, bufferlist *in,
 		     bufferlist *out) {
-  vector<string> image_ids;
-  int r = mirror::image_list_ids(hctx, &image_ids);
+  std::string start_after;
+  uint64_t max_return;
+  try {
+    bufferlist::iterator iter = in->begin();
+    ::decode(start_after, iter);
+    ::decode(max_return, iter);
+  } catch (const buffer::error &err) {
+    return -EINVAL;
+  }
+
+  int max_read = RBD_MAX_KEYS_READ;
+  int r = max_read;
+  std::map<std::string, std::string> mirror_images;
+  std::string last_read = mirror::image_key(start_after);
+
+  while (r == max_read && mirror_images.size() < max_return) {
+    std::map<std::string, bufferlist> vals;
+    CLS_LOG(20, "last_read = '%s'", last_read.c_str());
+    r = cls_cxx_map_get_vals(hctx, last_read, mirror::IMAGE_KEY_PREFIX,
+			     max_read, &vals);
+    if (r < 0) {
+      CLS_ERR("error reading mirror image directory by name: %s",
+              cpp_strerror(r).c_str());
+      return r;
+    }
+
+    for (auto it = vals.begin(); it != vals.end(); ++it) {
+      const std::string &image_id =
+        it->first.substr(mirror::IMAGE_KEY_PREFIX.size());
+      cls::rbd::MirrorImage mirror_image;
+      bufferlist::iterator iter = it->second.begin();
+      try {
+	::decode(mirror_image, iter);
+      } catch (const buffer::error &err) {
+	CLS_ERR("could not decode mirror image payload of image '%s'",
+                image_id.c_str());
+	return -EIO;
+      }
+
+      mirror_images[image_id] = mirror_image.global_image_id;
+      if (mirror_images.size() >= max_return) {
+	break;
+      }
+    }
+    if (!vals.empty()) {
+      last_read = mirror::image_key(mirror_images.rbegin()->first);
+    }
+  }
+
+  ::encode(mirror_images, *out);
+  return 0;
+}
+
+/**
+ * Input:
+ * @param global_id (std::string)
+ *
+ * Output:
+ * @param std::string - image id
+ * @returns 0 on success, negative error code on failure
+ */
+int mirror_image_get_image_id(cls_method_context_t hctx, bufferlist *in,
+                              bufferlist *out) {
+  std::string global_id;
+  try {
+    bufferlist::iterator it = in->begin();
+    ::decode(global_id, it);
+  } catch (const buffer::error &err) {
+    return -EINVAL;
+  }
+
+  std::string image_id;
+  int r = read_key(hctx, mirror::global_key(global_id), &image_id);
   if (r < 0) {
+    CLS_ERR("error retrieving image id for global id '%s': %s",
+            global_id.c_str(), cpp_strerror(r).c_str());
     return r;
   }
 
-  ::encode(image_ids, *out);
+  ::encode(image_id, *out);
   return 0;
 }
 
@@ -3768,6 +3860,9 @@ void __cls_init()
                           mirror_peer_set_cluster, &h_mirror_peer_set_cluster);
   cls_register_cxx_method(h_class, "mirror_image_list", CLS_METHOD_RD,
                           mirror_image_list, &h_mirror_image_list);
+  cls_register_cxx_method(h_class, "mirror_image_get_image_id", CLS_METHOD_RD,
+                          mirror_image_get_image_id,
+                          &h_mirror_image_get_image_id);
   cls_register_cxx_method(h_class, "mirror_image_get", CLS_METHOD_RD,
                           mirror_image_get, &h_mirror_image_get);
   cls_register_cxx_method(h_class, "mirror_image_set",
diff --git a/src/cls/rbd/cls_rbd_client.cc b/src/cls/rbd/cls_rbd_client.cc
index 30b872b..5fe8452 100644
--- a/src/cls/rbd/cls_rbd_client.cc
+++ b/src/cls/rbd/cls_rbd_client.cc
@@ -1132,8 +1132,12 @@ namespace librbd {
     }
 
     int mirror_image_list(librados::IoCtx *ioctx,
-			  std::vector<std::string> *image_ids) {
+		          const std::string &start, uint64_t max_return,
+			  std::map<std::string, std::string> *mirror_image_ids) {
       bufferlist in_bl;
+      ::encode(start, in_bl);
+      ::encode(max_return, in_bl);
+
       bufferlist out_bl;
       int r = ioctx->exec(RBD_MIRRORING, "rbd", "mirror_image_list", in_bl,
 			  out_bl);
@@ -1141,16 +1145,48 @@ namespace librbd {
         return r;
       }
 
-      image_ids->clear();
       try {
         bufferlist::iterator bl_it = out_bl.begin();
-        ::decode(*image_ids, bl_it);
+        ::decode(*mirror_image_ids, bl_it);
       } catch (const buffer::error &err) {
         return -EBADMSG;
       }
       return 0;
     }
 
+    void mirror_image_get_image_id_start(librados::ObjectReadOperation *op,
+                                         const std::string &global_image_id) {
+      bufferlist in_bl;
+      ::encode(global_image_id, in_bl);
+      op->exec( "rbd", "mirror_image_get_image_id", in_bl);
+    }
+
+    int mirror_image_get_image_id_finish(bufferlist::iterator *it,
+                                         std::string *image_id) {
+      try {
+	::decode(*image_id, *it);
+      } catch (const buffer::error &err) {
+	return -EBADMSG;
+      }
+      return 0;
+    }
+
+    int mirror_image_get_image_id(librados::IoCtx *ioctx,
+                                  const std::string &global_image_id,
+                                  std::string *image_id) {
+      librados::ObjectReadOperation op;
+      mirror_image_get_image_id_start(&op, global_image_id);
+
+      bufferlist out_bl;
+      int r = ioctx->operate(RBD_MIRRORING, &op, &out_bl);
+      if (r < 0) {
+        return r;
+      }
+
+      bufferlist::iterator it = out_bl.begin();
+      return mirror_image_get_image_id_finish(&it, image_id);
+    }
+
     int mirror_image_get(librados::IoCtx *ioctx, const std::string &image_id,
 			 cls::rbd::MirrorImage *mirror_image) {
       bufferlist in_bl;
diff --git a/src/cls/rbd/cls_rbd_client.h b/src/cls/rbd/cls_rbd_client.h
index 3248f78..8bc8ee2 100644
--- a/src/cls/rbd/cls_rbd_client.h
+++ b/src/cls/rbd/cls_rbd_client.h
@@ -228,7 +228,15 @@ namespace librbd {
                                 const std::string &uuid,
                                 const std::string &cluster_name);
     int mirror_image_list(librados::IoCtx *ioctx,
-			  std::vector<std::string> *image_ids);
+		          const std::string &start, uint64_t max_return,
+                          std::map<std::string, std::string> *mirror_image_ids);
+    void mirror_image_get_image_id_start(librados::ObjectReadOperation *op,
+                                         const std::string &global_image_id);
+    int mirror_image_get_image_id_finish(bufferlist::iterator *it,
+                                         std::string *image_id);
+    int mirror_image_get_image_id(librados::IoCtx *ioctx,
+                                  const std::string &global_image_id,
+                                  std::string *image_id);
     int mirror_image_get(librados::IoCtx *ioctx, const std::string &image_id,
 			 cls::rbd::MirrorImage *mirror_image);
     int mirror_image_set(librados::IoCtx *ioctx, const std::string &image_id,
diff --git a/src/cls/rgw/cls_rgw.cc b/src/cls/rgw/cls_rgw.cc
index f59658c..f3432da 100644
--- a/src/cls/rgw/cls_rgw.cc
+++ b/src/cls/rgw/cls_rgw.cc
@@ -1073,6 +1073,9 @@ public:
     initialized = true;
   }
 
+  void set_epoch(uint64_t epoch) {
+    instance_entry.versioned_epoch = epoch;
+  }
 
   int unlink_list_entry() {
     string list_idx;
@@ -1574,12 +1577,27 @@ static int rgw_bucket_unlink_instance(cls_method_context_t hctx, bufferlist *in,
     return ret;
   }
 
-  ret = olh.init(NULL);
+  bool olh_found;
+  ret = olh.init(&olh_found);
   if (ret < 0) {
     CLS_LOG(0, "ERROR: olh.init() returned ret=%d", ret);
     return ret;
   }
 
+  if (!olh_found) {
+    bool instance_only = false;
+    cls_rgw_obj_key key(dest_key.name);
+    ret = convert_plain_entry_to_versioned(hctx, key, true, instance_only);
+    if (ret < 0) {
+      CLS_LOG(0, "ERROR: convert_plain_entry_to_versioned ret=%d", ret);
+      return ret;
+    }
+    olh.update(dest_key, false);
+    olh.set_tag(op.olh_tag);
+
+    obj.set_epoch(1);
+  }
+
   if (!olh.start_modify(op.olh_epoch)) {
     ret = obj.unlink_list_entry();
     if (ret < 0) {
diff --git a/src/cls/rgw/cls_rgw_client.cc b/src/cls/rgw/cls_rgw_client.cc
index 40041aa..1cf1156 100644
--- a/src/cls/rgw/cls_rgw_client.cc
+++ b/src/cls/rgw/cls_rgw_client.cc
@@ -332,13 +332,14 @@ int cls_rgw_bucket_link_olh(librados::IoCtx& io_ctx, const string& oid, const cl
 
 int cls_rgw_bucket_unlink_instance(librados::IoCtx& io_ctx, const string& oid,
                                    const cls_rgw_obj_key& key, const string& op_tag,
-                                   uint64_t olh_epoch, bool log_op)
+                                   const string& olh_tag, uint64_t olh_epoch, bool log_op)
 {
   bufferlist in, out;
   struct rgw_cls_unlink_instance_op call;
   call.key = key;
   call.op_tag = op_tag;
   call.olh_epoch = olh_epoch;
+  call.olh_tag = olh_tag;
   call.log_op = log_op;
   ::encode(call, in);
   int r = io_ctx.exec(oid, "rgw", "bucket_unlink_instance", in, out);
diff --git a/src/cls/rgw/cls_rgw_client.h b/src/cls/rgw/cls_rgw_client.h
index 130a009..1b02a5e 100644
--- a/src/cls/rgw/cls_rgw_client.h
+++ b/src/cls/rgw/cls_rgw_client.h
@@ -333,7 +333,7 @@ int cls_rgw_bucket_link_olh(librados::IoCtx& io_ctx, const string& oid, const cl
                             bool delete_marker, const string& op_tag, struct rgw_bucket_dir_entry_meta *meta,
                             uint64_t olh_epoch, ceph::real_time unmod_since, bool high_precision_time, bool log_op);
 int cls_rgw_bucket_unlink_instance(librados::IoCtx& io_ctx, const string& oid, const cls_rgw_obj_key& key, const string& op_tag,
-                                   uint64_t olh_epoch, bool log_op);
+                                   const string& olh_tag, uint64_t olh_epoch, bool log_op);
 int cls_rgw_get_olh_log(librados::IoCtx& io_ctx, string& oid, librados::ObjectReadOperation& op, const cls_rgw_obj_key& olh, uint64_t ver_marker,
                         const string& olh_tag,
                         map<uint64_t, vector<struct rgw_bucket_olh_log_entry> > *log, bool *is_truncated);
diff --git a/src/cls/rgw/cls_rgw_ops.h b/src/cls/rgw/cls_rgw_ops.h
index 3f2c8d9..e8a7661 100644
--- a/src/cls/rgw/cls_rgw_ops.h
+++ b/src/cls/rgw/cls_rgw_ops.h
@@ -222,26 +222,31 @@ struct rgw_cls_unlink_instance_op {
   uint64_t olh_epoch;
   bool log_op;
   uint16_t bilog_flags;
+  string olh_tag;
 
   rgw_cls_unlink_instance_op() : olh_epoch(0), log_op(false), bilog_flags(0) {}
 
   void encode(bufferlist& bl) const {
-    ENCODE_START(1, 1, bl);
+    ENCODE_START(2, 1, bl);
     ::encode(key, bl);
     ::encode(op_tag, bl);
     ::encode(olh_epoch, bl);
     ::encode(log_op, bl);
     ::encode(bilog_flags, bl);
+    ::encode(olh_tag, bl);
     ENCODE_FINISH(bl);
   }
 
   void decode(bufferlist::iterator& bl) {
-    DECODE_START(1, bl);
+    DECODE_START(2, bl);
     ::decode(key, bl);
     ::decode(op_tag, bl);
     ::decode(olh_epoch, bl);
     ::decode(log_op, bl);
     ::decode(bilog_flags, bl);
+    if (struct_v >= 2) {
+      ::decode(olh_tag, bl);
+    }
     DECODE_FINISH(bl);
   }
 
diff --git a/src/common/Cycles.cc b/src/common/Cycles.cc
index b0b687e..656f08b 100644
--- a/src/common/Cycles.cc
+++ b/src/common/Cycles.cc
@@ -38,14 +38,15 @@
 #include "Cycles.h"
 
 double Cycles::cycles_per_sec = 0;
-static Initialize _(Cycles::init);
 
 /**
  * Perform once-only overall initialization for the Cycles class, such
- * as calibrating the clock frequency.  This method is invoked automatically
- * during initialization, but it may be invoked explicitly by other modules
- * to ensure that initialization occurs before those modules initialize
- * themselves.
+ * as calibrating the clock frequency.  This method must be called
+ * before using the Cycles module.
+ *
+ * It is not initialized by default because the timing loops cause
+ * general process startup times to balloon
+ * (http://tracker.ceph.com/issues/15225).
  */
 void Cycles::init()
 {
diff --git a/src/common/TrackedOp.cc b/src/common/TrackedOp.cc
index 37e918a..2554c09 100644
--- a/src/common/TrackedOp.cc
+++ b/src/common/TrackedOp.cc
@@ -87,14 +87,23 @@ void OpHistory::dump_ops(utime_t now, Formatter *f)
   f->close_section();
 }
 
-void OpTracker::dump_historic_ops(Formatter *f)
+bool OpTracker::dump_historic_ops(Formatter *f)
 {
+  RWLock::RLocker l(lock);
+  if (!tracking_enabled)
+    return false;
+
   utime_t now = ceph_clock_now(cct);
   history.dump_ops(now, f);
+  return true;
 }
 
-void OpTracker::dump_ops_in_flight(Formatter *f, bool print_only_blocked)
+bool OpTracker::dump_ops_in_flight(Formatter *f, bool print_only_blocked)
 {
+  RWLock::RLocker l(lock);
+  if (!tracking_enabled)
+    return false;
+
   f->open_object_section("ops_in_flight"); // overall dump
   uint64_t total_ops_in_flight = 0;
   f->open_array_section("ops"); // list of TrackedOps
@@ -119,12 +128,14 @@ void OpTracker::dump_ops_in_flight(Formatter *f, bool print_only_blocked)
   } else
     f->dump_int("num_ops", total_ops_in_flight);
   f->close_section(); // overall dump
+  return true;
 }
 
-void OpTracker::register_inflight_op(xlist<TrackedOp*>::item *i)
+bool OpTracker::register_inflight_op(xlist<TrackedOp*>::item *i)
 {
-  // caller checks;
-  assert(tracking_enabled);
+  RWLock::RLocker l(lock);
+  if (!tracking_enabled)
+    return false;
 
   uint64_t current_seq = seq.inc();
   uint32_t shard_index = current_seq % num_optracker_shards;
@@ -135,6 +146,7 @@ void OpTracker::register_inflight_op(xlist<TrackedOp*>::item *i)
     sdata->ops_in_flight_sharded.push_back(i);
     sdata->ops_in_flight_sharded.back()->seq = current_seq;
   }
+  return true;
 }
 
 void OpTracker::unregister_inflight_op(TrackedOp *i)
@@ -152,6 +164,7 @@ void OpTracker::unregister_inflight_op(TrackedOp *i)
   }
   i->_unregistered();
 
+  RWLock::RLocker l(lock);
   if (!tracking_enabled)
     delete i;
   else {
@@ -312,6 +325,9 @@ void TrackedOp::mark_event(const string &event)
 
 void TrackedOp::dump(utime_t now, Formatter *f) const
 {
+  // Ignore if still in the constructor
+  if (!is_tracked)
+    return;
   stringstream name;
   _dump_op_descriptor_unlocked(name);
   f->dump_string("description", name.str().c_str()); // this TrackedOp
diff --git a/src/common/TrackedOp.h b/src/common/TrackedOp.h
index c1d8eb6..9d0ff88 100644
--- a/src/common/TrackedOp.h
+++ b/src/common/TrackedOp.h
@@ -22,6 +22,7 @@
 #include "msg/Message.h"
 #include "include/memory.h"
 #include "common/RWLock.h"
+#include <atomic>
 
 class TrackedOp;
 typedef ceph::shared_ptr<TrackedOp> TrackedOpRef;
@@ -74,16 +75,16 @@ class OpTracker {
   float complaint_time;
   int log_threshold;
   void _mark_event(TrackedOp *op, const string &evt, utime_t now);
+  bool tracking_enabled;
+  RWLock       lock;
 
 public:
-  bool tracking_enabled;
   CephContext *cct;
-  RWLock       lock;
   OpTracker(CephContext *cct_, bool tracking, uint32_t num_shards) : seq(0), 
                                      num_optracker_shards(num_shards),
 				     complaint_time(0), log_threshold(0),
-				     tracking_enabled(tracking), cct(cct_),
-				     lock("OpTracker::lock") {
+				     tracking_enabled(tracking),
+				     lock("OpTracker::lock"), cct(cct_) {
 
     for (uint32_t i = 0; i < num_optracker_shards; i++) {
       char lock_name[32] = {0};
@@ -104,9 +105,9 @@ public:
     RWLock::WLocker l(lock);
     tracking_enabled = enable;
   }
-  void dump_ops_in_flight(Formatter *f, bool print_only_blocked=false);
-  void dump_historic_ops(Formatter *f);
-  void register_inflight_op(xlist<TrackedOp*>::item *i);
+  bool dump_ops_in_flight(Formatter *f, bool print_only_blocked=false);
+  bool dump_historic_ops(Formatter *f);
+  bool register_inflight_op(xlist<TrackedOp*>::item *i);
   void unregister_inflight_op(TrackedOp *i);
 
   void get_age_ms_histogram(pow2_hist_t *h);
@@ -139,6 +140,7 @@ public:
   {
     typename T::Ref retval(new T(params, this),
 			   RemoveOnDelete(this));
+    retval->tracking_start();
     return retval;
   }
 };
@@ -158,7 +160,8 @@ protected:
   uint64_t seq; /// a unique value set by the OpTracker
 
   uint32_t warn_interval_multiplier; // limits output of a given op warning
-  bool is_tracked; //whether in tracker
+  // Transitions from false -> true without locks being held
+  atomic<bool> is_tracked; //whether in tracker and out of constructor
   TrackedOp(OpTracker *_tracker, const utime_t& initiated) :
     xitem(this),
     tracker(_tracker),
@@ -167,14 +170,7 @@ protected:
     seq(0),
     warn_interval_multiplier(1),
     is_tracked(false)
-  {
-    RWLock::RLocker l(tracker->lock);
-    if (tracker->tracking_enabled) {
-      tracker->register_inflight_op(&xitem);
-      events.push_back(make_pair(initiated_at, "initiated"));
-      is_tracked = true;
-    }
-  }
+  { }
 
   /// output any type-specific data you want to get when dump() is called
   virtual void _dump(utime_t now, Formatter *f) const {}
@@ -193,6 +189,7 @@ public:
   }
 
   double get_duration() const {
+    Mutex::Locker l(lock);
     if (!events.empty() && events.rbegin()->second.compare("done") == 0)
       return events.rbegin()->first - get_initiated();
     else
@@ -201,9 +198,16 @@ public:
 
   void mark_event(const string &event);
   virtual const char *state_string() const {
+    Mutex::Locker l(lock);
     return events.rbegin()->second.c_str();
   }
   void dump(utime_t now, Formatter *f) const;
+  void tracking_start() {
+    if (tracker->register_inflight_op(&xitem)) {
+      events.push_back(make_pair(initiated_at, "initiated"));
+      is_tracked = true;
+    }
+  }
 };
 
 #endif
diff --git a/src/common/config.h b/src/common/config.h
index 2977f42..d844395 100644
--- a/src/common/config.h
+++ b/src/common/config.h
@@ -185,7 +185,13 @@ private:
 
   bool expand_meta(std::string &val,
 		   std::ostream *oss) const;
-
+public:  // for global_init
+  bool early_expand_meta(std::string &val,
+			 std::ostream *oss) const {
+    Mutex::Locker l(lock);
+    return expand_meta(val, oss);
+  }
+private:
   bool expand_meta(std::string &val,
 		   config_option *opt,
 		   std::list<config_option *> stack,
diff --git a/src/common/config_opts.h b/src/common/config_opts.h
index aea9c16..50356c7 100644
--- a/src/common/config_opts.h
+++ b/src/common/config_opts.h
@@ -31,7 +31,7 @@ OPTION(crushtool, OPT_STR, "crushtool") // crushtool utility path
 OPTION(daemonize, OPT_BOOL, false) // default changed by common_preinit()
 OPTION(setuser, OPT_STR, "")        // uid or user name
 OPTION(setgroup, OPT_STR, "")        // gid or group name
-OPTION(setuser_match_path, OPT_STR, "")  // make setuser/group conditional on this patch matching ownership
+OPTION(setuser_match_path, OPT_STR, "")  // make setuser/group conditional on this path matching ownership
 OPTION(pid_file, OPT_STR, "") // default changed by common_preinit()
 OPTION(chdir, OPT_STR, "/")
 OPTION(max_open_files, OPT_LONGLONG, 0)
@@ -312,6 +312,7 @@ OPTION(mon_osd_min_down_reporters, OPT_INT, 2)   // number of OSDs from differen
 OPTION(mon_osd_reporter_subtree_level , OPT_STR, "host")   // in which level of parent bucket the reporters are counted
 OPTION(mon_osd_force_trim_to, OPT_INT, 0)   // force mon to trim maps to this point, regardless of min_last_epoch_clean (dangerous, use with care)
 OPTION(mon_mds_force_trim_to, OPT_INT, 0)   // force mon to trim mdsmaps to this point (dangerous, use with care)
+OPTION(mon_mds_skip_sanity, OPT_BOOL, false)  // skip safety assertions on FSMap (in case of bugs where we want to continue anyway)
 
 // monitor debug options
 OPTION(mon_debug_deprecated_as_obsolete, OPT_BOOL, false) // consider deprecated commands as obsolete
@@ -385,6 +386,7 @@ OPTION(client_oc_max_dirty, OPT_INT, 1024*1024* 100)    // MB * n  (dirty OR tx.
 OPTION(client_oc_target_dirty, OPT_INT, 1024*1024* 8) // target dirty (keep this smallish)
 OPTION(client_oc_max_dirty_age, OPT_DOUBLE, 5.0)      // max age in cache before writeback
 OPTION(client_oc_max_objects, OPT_INT, 1000)      // max objects in cache
+OPTION(client_debug_getattr_caps, OPT_BOOL, false) // check if MDS reply contains wanted caps
 OPTION(client_debug_force_sync_read, OPT_BOOL, false)     // always read synchronously (go to osds)
 OPTION(client_debug_inject_tick_delay, OPT_INT, 0) // delay the client tick for a number of seconds
 OPTION(client_max_inline_size, OPT_U64, 4096)
@@ -520,6 +522,7 @@ OPTION(mds_skip_ino, OPT_INT, 0)
 OPTION(max_mds, OPT_INT, 1)
 OPTION(mds_standby_for_name, OPT_STR, "")
 OPTION(mds_standby_for_rank, OPT_INT, -1)
+OPTION(mds_standby_for_fscid, OPT_INT, -1)
 OPTION(mds_standby_replay, OPT_BOOL, false)
 OPTION(mds_enable_op_tracker, OPT_BOOL, true) // enable/disable MDS op tracking
 OPTION(mds_op_history_size, OPT_U32, 20)    // Max number of completed ops to track
@@ -803,7 +806,7 @@ OPTION(osd_tracing, OPT_BOOL, false) // true if LTTng-UST tracepoints should be
 
 // determines whether PGLog::check() compares written out log to stored log
 OPTION(osd_debug_pg_log_writeout, OPT_BOOL, false)
-
+OPTION(osd_loop_before_reset_tphandle, OPT_U32, 64) // Max number of loops before we reset thread-pool's handle
 // default timeout while caling WaitInterval on an empty queue
 OPTION(threadpool_default_timeout, OPT_INT, 60)
 // default wait time for an empty queue before pinging the hb timeout
@@ -963,6 +966,7 @@ OPTION(bluestore_debug_small_allocations, OPT_INT, 0)
 OPTION(bluestore_debug_freelist, OPT_BOOL, false)
 OPTION(bluestore_debug_prefill, OPT_FLOAT, 0)
 OPTION(bluestore_debug_prefragment_max, OPT_INT, 1048576)
+OPTION(bluestore_inject_wal_apply_delay, OPT_FLOAT, 0)
 
 OPTION(kstore_max_ops, OPT_U64, 512)
 OPTION(kstore_max_bytes, OPT_U64, 64*1024*1024)
@@ -1058,18 +1062,18 @@ OPTION(filestore_queue_max_bytes, OPT_U64, 100 << 20)
 OPTION(filestore_caller_concurrency, OPT_INT, 10)
 
 /// Expected filestore throughput in B/s
-OPTION(filestore_expected_throughput_bytes, OPT_DOUBLE, 100 << 20)
+OPTION(filestore_expected_throughput_bytes, OPT_DOUBLE, 200 << 20)
 /// Expected filestore throughput in ops/s
-OPTION(filestore_expected_throughput_ops, OPT_DOUBLE, 100)
+OPTION(filestore_expected_throughput_ops, OPT_DOUBLE, 200)
 
-/// Filestore max delay multiple (probably don't need to change)
-OPTION(filestore_queue_max_delay_multiple, OPT_DOUBLE, 10)
-/// Filestore max delay multiple (probably don't need to change)
-OPTION(filestore_queue_high_delay_multiple, OPT_DOUBLE, 2)
+/// Filestore max delay multiple.  Defaults to 0 (disabled)
+OPTION(filestore_queue_max_delay_multiple, OPT_DOUBLE, 0)
+/// Filestore high delay multiple.  Defaults to 0 (disabled)
+OPTION(filestore_queue_high_delay_multiple, OPT_DOUBLE, 0)
 
 /// Use above to inject delays intended to keep the op queue between low and high
-OPTION(filestore_queue_low_threshhold, OPT_DOUBLE, 0.2)
-OPTION(filestore_queue_high_threshhold, OPT_DOUBLE, 0.8)
+OPTION(filestore_queue_low_threshhold, OPT_DOUBLE, 0.3)
+OPTION(filestore_queue_high_threshhold, OPT_DOUBLE, 0.9)
 
 OPTION(filestore_op_threads, OPT_INT, 2)
 OPTION(filestore_op_thread_timeout, OPT_INT, 60)
@@ -1102,13 +1106,13 @@ OPTION(journal_max_write_bytes, OPT_INT, 10 << 20)
 OPTION(journal_max_write_entries, OPT_INT, 100)
 
 /// Target range for journal fullness
-OPTION(journal_throttle_low_threshhold, OPT_DOUBLE, 0.5)
-OPTION(journal_throttle_high_threshhold, OPT_DOUBLE, 0.8)
+OPTION(journal_throttle_low_threshhold, OPT_DOUBLE, 0.6)
+OPTION(journal_throttle_high_threshhold, OPT_DOUBLE, 0.9)
 
-/// Multiple over expected at high_threshhold (probably don't need to change)
-OPTION(journal_throttle_high_multiple, OPT_DOUBLE, 2)
-/// Multiple over expected at max (probably don't need to change)
-OPTION(journal_throttle_max_multiple, OPT_DOUBLE, 10)
+/// Multiple over expected at high_threshhold. Defaults to 0 (disabled).
+OPTION(journal_throttle_high_multiple, OPT_DOUBLE, 0)
+/// Multiple over expected at max.  Defaults to 0 (disabled).
+OPTION(journal_throttle_max_multiple, OPT_DOUBLE, 0)
 
 OPTION(journal_align_min_size, OPT_INT, 64 << 10)  // align data payloads >= this.
 OPTION(journal_replay_from, OPT_INT, 0)
diff --git a/src/crush/CrushWrapper.cc b/src/crush/CrushWrapper.cc
index 9fb4202..5748078 100644
--- a/src/crush/CrushWrapper.cc
+++ b/src/crush/CrushWrapper.cc
@@ -116,7 +116,7 @@ int CrushWrapper::can_rename_item(const string& srcname,
     if (is_valid_crush_name(dstname)) {
       return 0;
     } else {
-      *ss << "srcname = '" << srcname << "' does not match [-_.0-9a-zA-Z]+";
+      *ss << "dstname = '" << dstname << "' does not match [-_.0-9a-zA-Z]+";
       return -EINVAL;
     }
   } else {
diff --git a/src/global/global_init.cc b/src/global/global_init.cc
index bd5f606..5460eef 100644
--- a/src/global/global_init.cc
+++ b/src/global/global_init.cc
@@ -195,8 +195,11 @@ void global_init(std::vector < const char * > *alt_def_args,
     }
     if ((uid || gid) &&
 	g_conf->setuser_match_path.length()) {
+      // induce early expansion of setuser_match_path config option
+      string match_path = g_conf->setuser_match_path;
+      g_conf->early_expand_meta(match_path, &cerr);
       struct stat st;
-      int r = ::stat(g_conf->setuser_match_path.c_str(), &st);
+      int r = ::stat(match_path.c_str(), &st);
       if (r < 0) {
 	r = -errno;
 	cerr << "unable to stat setuser_match_path "
@@ -206,7 +209,7 @@ void global_init(std::vector < const char * > *alt_def_args,
       }
       if ((uid && uid != st.st_uid) ||
 	  (gid && gid != st.st_gid)) {
-	cerr << "WARNING: will not setuid/gid: " << g_conf->setuser_match_path
+	cerr << "WARNING: will not setuid/gid: " << match_path
 	     << " owned by " << st.st_uid << ":" << st.st_gid
 	     << " and not requested " << uid << ":" << gid
 	     << std::endl;
@@ -216,7 +219,7 @@ void global_init(std::vector < const char * > *alt_def_args,
 	gid_string.erase();
       } else {
 	priv_ss << "setuser_match_path "
-		<< g_conf->setuser_match_path << " owned by "
+		<< match_path << " owned by "
 		<< st.st_uid << ":" << st.st_gid << ". ";
       }
     }
diff --git a/src/global/signal_handler.cc b/src/global/signal_handler.cc
index 6254646..e18bca1 100644
--- a/src/global/signal_handler.cc
+++ b/src/global/signal_handler.cc
@@ -91,14 +91,19 @@ static void handle_fatal_signal(int signum)
   // case, SA_RESETHAND specifies that the default signal handler--
   // presumably dump core-- will handle it.
   char buf[1024];
+  char pthread_name[16] = {0}; // limited to 16 bytes, including the terminating null byte
+  int r = pthread_getname_np(pthread_self(), pthread_name, sizeof(pthread_name));
+  (void)r;
 #if defined(__sun)
   char message[SIG2STR_MAX];
   sig2str(signum,message);
   snprintf(buf, sizeof(buf), "*** Caught signal (%s) **\n "
-	    "in thread %llx\n", message, (unsigned long long)pthread_self());
+	    "in thread %llx thread_name:%s\n", message, (unsigned long long)pthread_self(),
+	    pthread_name);
 #else
   snprintf(buf, sizeof(buf), "*** Caught signal (%s) **\n "
-	    "in thread %llx\n", sig_str(signum), (unsigned long long)pthread_self());
+	    "in thread %llx thread_name:%s\n", sig_str(signum), (unsigned long long)pthread_self(),
+	    pthread_name);
 #endif
   dout_emergency(buf);
   pidfile_remove();
diff --git a/src/include/ceph_fs.h b/src/include/ceph_fs.h
index 47e6a0b..593fd5f 100644
--- a/src/include/ceph_fs.h
+++ b/src/include/ceph_fs.h
@@ -414,7 +414,7 @@ union ceph_mds_request_args {
 		__le32 stripe_count;         /* ... */
 		__le32 object_size;
 		__le32 pool;                 /* if >= 0 and CREATEPOOLID feature */
-		__le32 unused;               /* used to be preferred */
+		__le32 mask;                 /* CEPH_CAP_* */
 		__le64 old_size;             /* if O_TRUNC */
 	} __attribute__ ((packed)) open;
 	struct {
diff --git a/src/include/rados/rgw_file.h b/src/include/rados/rgw_file.h
index 5eaf291..4bb8a93 100644
--- a/src/include/rados/rgw_file.h
+++ b/src/include/rados/rgw_file.h
@@ -128,15 +128,23 @@ int rgw_statfs(struct rgw_fs *rgw_fs,
 	       uint32_t flags);
 
 
+/* XXX (get|set)attr mask bits */
+#define RGW_SETATTR_MODE   1
+#define RGW_SETATTR_UID    2
+#define RGW_SETATTR_GID    4
+#define RGW_SETATTR_MTIME  8
+#define RGW_SETATTR_ATIME 16
+#define RGW_SETATTR_SIZE  32
+#define RGW_SETATTR_CTIME 64
+
 /*
   create file
 */
 #define RGW_CREATE_FLAG_NONE     0x0000
 
-int rgw_create(struct rgw_fs *rgw_fs,
-	       struct rgw_file_handle *parent_fh,
-	       const char *name, mode_t mode, struct stat *st,
-	       struct rgw_file_handle **fh, uint32_t flags);
+int rgw_create(struct rgw_fs *rgw_fs, struct rgw_file_handle *parent_fh,
+	      const char *name, struct stat *st, uint32_t mask,
+	      struct rgw_file_handle **fh, uint32_t flags);
 
 /*
   create a new directory
@@ -145,7 +153,7 @@ int rgw_create(struct rgw_fs *rgw_fs,
 
 int rgw_mkdir(struct rgw_fs *rgw_fs,
 	      struct rgw_file_handle *parent_fh,
-	      const char *name, mode_t mode, struct stat *st,
+	      const char *name, struct stat *st, uint32_t mask,
 	      struct rgw_file_handle **fh, uint32_t flags);
 
 /*
@@ -180,15 +188,6 @@ int rgw_readdir(struct rgw_fs *rgw_fs,
 		rgw_readdir_cb rcb, void *cb_arg, bool *eof,
 		uint32_t flags);
 
-/* XXX (get|set)attr mask bits */
-#define RGW_SETATTR_MODE   1
-#define RGW_SETATTR_UID    2
-#define RGW_SETATTR_GID    4
-#define RGW_SETATTR_MTIME  8
-#define RGW_SETATTR_ATIME 16
-#define RGW_SETATTR_SIZE  32
-#define RGW_SETATTR_CTIME 64
-
 /*
    get unix attributes for object
 */
diff --git a/src/journal/JournalMetadata.cc b/src/journal/JournalMetadata.cc
index 830b4ca..ba85f4b 100644
--- a/src/journal/JournalMetadata.cc
+++ b/src/journal/JournalMetadata.cc
@@ -19,6 +19,62 @@ using namespace cls::journal;
 
 namespace {
 
+struct C_GetClient : public Context {  // async fetch of one client record; result is forwarded to on_finish
+  CephContext *cct;
+  librados::IoCtx &ioctx;
+  const std::string &oid;  // stored by reference: the referenced string must outlive this context
+  AsyncOpTracker &async_op_tracker;
+  std::string client_id;  // copied, unlike oid
+  cls::journal::Client *client;  // out-param, filled by get_client_finish()
+  Context *on_finish;
+
+  bufferlist out_bl;  // raw reply of the read op, decoded in handle_get_client()
+
+  C_GetClient(CephContext *cct, librados::IoCtx &ioctx, const std::string &oid,
+              AsyncOpTracker &async_op_tracker, const std::string &client_id,
+              cls::journal::Client *client, Context *on_finish)
+    : cct(cct), ioctx(ioctx), oid(oid), async_op_tracker(async_op_tracker),
+      client_id(client_id), client(client), on_finish(on_finish) {
+    async_op_tracker.start_op();  // balanced by finish_op() in the destructor
+  }
+  virtual ~C_GetClient() {
+    async_op_tracker.finish_op();
+  }
+
+  virtual void send() {  // entry point: kicks off the single read step
+    send_get_client();
+  }
+
+  void send_get_client() {
+    ldout(cct, 20) << "C_GetClient: " << __func__ << dendl;
+
+    librados::ObjectReadOperation op;
+    client::get_client_start(&op, client_id);
+
+    librados::AioCompletion *comp = librados::Rados::aio_create_completion(
+      this, nullptr, &utils::rados_state_callback<
+        C_GetClient, &C_GetClient::handle_get_client>);  // completion is routed to handle_get_client()
+
+    int r = ioctx.aio_operate(oid, comp, &op, &out_bl);
+    assert(r == 0);  // a failed submission is not handled, only asserted
+    comp->release();  // done with our handle on the completion
+  }
+
+  void handle_get_client(int r) {
+    ldout(cct, 20) << "C_GetClient: " << __func__ << ": r=" << r << dendl;
+
+    if (r == 0) {  // decode only when the read itself succeeded
+      bufferlist::iterator it = out_bl.begin();
+      r = client::get_client_finish(&it, client);  // a decode error replaces r
+    }
+    complete(r);  // presumably runs finish(r) and then deletes this (Context semantics) -- confirm
+  }
+
+  virtual void finish(int r) override {
+    on_finish->complete(r);  // hand the final result to the caller's callback
+  }
+};
+
 struct C_AllocateTag : public Context {
   CephContext *cct;
   librados::IoCtx &ioctx;
@@ -142,6 +198,58 @@ struct C_AllocateTag : public Context {
   }
 };
 
+struct C_GetTag : public Context {  // async fetch of one tag by tid; result is forwarded to on_finish
+  CephContext *cct;
+  librados::IoCtx &ioctx;
+  const std::string &oid;  // stored by reference: the referenced string must outlive this context
+  AsyncOpTracker &async_op_tracker;
+  uint64_t tag_tid;  // id of the tag to look up
+  JournalMetadata::Tag *tag;  // out-param, filled by get_tag_finish()
+  Context *on_finish;
+
+  bufferlist out_bl;  // raw reply of the read op, decoded in handle_get_tag()
+
+  C_GetTag(CephContext *cct, librados::IoCtx &ioctx, const std::string &oid,
+           AsyncOpTracker &async_op_tracker, uint64_t tag_tid,
+           JournalMetadata::Tag *tag, Context *on_finish)
+    : cct(cct), ioctx(ioctx), oid(oid), async_op_tracker(async_op_tracker),
+      tag_tid(tag_tid), tag(tag), on_finish(on_finish) {
+    async_op_tracker.start_op();  // balanced by finish_op() in the destructor
+  }
+  virtual ~C_GetTag() {
+    async_op_tracker.finish_op();
+  }
+
+  void send() {  // entry point; NOTE(review): non-virtual, whereas C_GetClient::send() is virtual
+    send_get_tag();
+  }
+
+  void send_get_tag() {
+    librados::ObjectReadOperation op;
+    client::get_tag_start(&op, tag_tid);
+
+    librados::AioCompletion *comp = librados::Rados::aio_create_completion(
+      this, nullptr, &utils::rados_state_callback<
+        C_GetTag, &C_GetTag::handle_get_tag>);  // completion is routed to handle_get_tag()
+
+    int r = ioctx.aio_operate(oid, comp, &op, &out_bl);
+    assert(r == 0);  // a failed submission is not handled, only asserted
+    comp->release();  // done with our handle on the completion
+  }
+
+  void handle_get_tag(int r) {
+    if (r == 0) {  // decode only when the read itself succeeded
+      bufferlist::iterator iter = out_bl.begin();
+      r = client::get_tag_finish(&iter, tag);  // a decode error replaces r
+    }
+    complete(r);  // presumably runs finish(r) and then deletes this (Context semantics) -- confirm
+  }
+
+  virtual void finish(int r) override {
+    on_finish->complete(r);  // hand the final result to the caller's callback
+  }
+};
+
 struct C_GetTags : public Context {
   CephContext *cct;
   librados::IoCtx &ioctx;
@@ -365,6 +473,20 @@ void JournalMetadata::allocate_tag(uint64_t tag_class, const bufferlist &data,
   ctx->send();
 }
 
+void JournalMetadata::get_client(const std::string &client_id,  // async lookup of client_id on m_oid
+                                 cls::journal::Client *client,  // out: written before on_finish fires on success
+                                 Context *on_finish) {  // completed with the lookup/decode result
+  C_GetClient *ctx = new C_GetClient(m_cct, m_ioctx, m_oid, m_async_op_tracker,
+                                     client_id, client, on_finish);  // NOTE(review): no delete here; presumably self-deletes on completion
+  ctx->send();
+}
+
+void JournalMetadata::get_tag(uint64_t tag_tid, Tag *tag, Context *on_finish) {  // async lookup of tag_tid; *tag is the out-param
+  C_GetTag *ctx = new C_GetTag(m_cct, m_ioctx, m_oid, m_async_op_tracker,
+                               tag_tid, tag, on_finish);  // NOTE(review): no delete here; presumably self-deletes on completion
+  ctx->send();
+}
+
 void JournalMetadata::get_tags(const boost::optional<uint64_t> &tag_class,
                                Tags *tags, Context *on_finish) {
   C_GetTags *ctx = new C_GetTags(m_cct, m_ioctx, m_oid, m_client_id,
diff --git a/src/journal/JournalMetadata.h b/src/journal/JournalMetadata.h
index 8bedb98..d95f1ed 100644
--- a/src/journal/JournalMetadata.h
+++ b/src/journal/JournalMetadata.h
@@ -68,9 +68,12 @@ public:
   void register_client(const bufferlist &data, Context *on_finish);
   void update_client(const bufferlist &data, Context *on_finish);
   void unregister_client(Context *on_finish);
+  void get_client(const std::string &client_id, cls::journal::Client *client,
+                  Context *on_finish);
 
   void allocate_tag(uint64_t tag_class, const bufferlist &data,
                     Tag *tag, Context *on_finish);
+  void get_tag(uint64_t tag_tid, Tag *tag, Context *on_finish);
   void get_tags(const boost::optional<uint64_t> &tag_class, Tags *tags,
                 Context *on_finish);
 
diff --git a/src/journal/Journaler.cc b/src/journal/Journaler.cc
index 957243b..0981db8 100644
--- a/src/journal/Journaler.cc
+++ b/src/journal/Journaler.cc
@@ -248,6 +248,12 @@ void Journaler::unregister_client(Context *on_finish) {
   return m_metadata->unregister_client(on_finish);
 }
 
+void Journaler::get_client(const std::string &client_id,  // thin wrapper; see get_cached_client() for the synchronous variant
+                           cls::journal::Client *client,
+                           Context *on_finish) {
+  m_metadata->get_client(client_id, client, on_finish);  // all work is delegated to JournalMetadata
+}
+
 int Journaler::get_cached_client(const std::string &client_id,
                                  cls::journal::Client *client) {
   RegisteredClients clients;
@@ -273,6 +279,10 @@ void Journaler::allocate_tag(uint64_t tag_class, const bufferlist &data,
   m_metadata->allocate_tag(tag_class, data, tag, on_finish);
 }
 
+void Journaler::get_tag(uint64_t tag_tid, Tag *tag, Context *on_finish) {  // thin wrapper around JournalMetadata::get_tag
+  m_metadata->get_tag(tag_tid, tag, on_finish);  // all work is delegated to JournalMetadata
+}
+
 void Journaler::get_tags(uint64_t tag_class, Tags *tags, Context *on_finish) {
   m_metadata->get_tags(tag_class, tags, on_finish);
 }
diff --git a/src/journal/Journaler.h b/src/journal/Journaler.h
index c0bb137..055b8bf 100644
--- a/src/journal/Journaler.h
+++ b/src/journal/Journaler.h
@@ -41,6 +41,7 @@ public:
     Mutex timer_lock;
   };
 
+  typedef cls::journal::Tag Tag;
   typedef std::list<cls::journal::Tag> Tags;
   typedef std::set<cls::journal::Client> RegisteredClients;
 
@@ -76,7 +77,8 @@ public:
   void unregister_client(Context *on_finish);
 
   void update_client(const bufferlist &data, Context *on_finish);
-
+  void get_client(const std::string &client_id, cls::journal::Client *client,
+                  Context *on_finish);
   int get_cached_client(const std::string &client_id,
                         cls::journal::Client *client);
 
@@ -86,6 +88,7 @@ public:
                     Context *on_finish);
   void allocate_tag(uint64_t tag_class, const bufferlist &data,
                     cls::journal::Tag *tag, Context *on_finish);
+  void get_tag(uint64_t tag_tid, Tag *tag, Context *on_finish);
   void get_tags(uint64_t tag_class, Tags *tags, Context *on_finish);
 
   void start_replay(ReplayHandler *replay_handler);
diff --git a/src/libcephfs.cc b/src/libcephfs.cc
index 037a018..b01cf22 100644
--- a/src/libcephfs.cc
+++ b/src/libcephfs.cc
@@ -231,6 +231,11 @@ public:
     return cwd.c_str();
   }
 
+  int chdir(const char *to)  // change directory via the client, passing this mount's cwd member along
+  {
+    return client->chdir(to, cwd);  // presumably the client refreshes cwd with the new path -- confirm in Client::chdir
+  }
+
   CephContext *get_ceph_context() const {
     return cct;
   }
@@ -454,7 +459,7 @@ extern "C" int ceph_chdir (struct ceph_mount_info *cmount, const char *s)
 {
   if (!cmount->is_mounted())
     return -ENOTCONN;
-  return cmount->get_client()->chdir(s);
+  return cmount->chdir(s);
 }
 
 extern "C" int ceph_opendir(struct ceph_mount_info *cmount,
diff --git a/src/librbd/ExclusiveLock.cc b/src/librbd/ExclusiveLock.cc
index bd945e6..82d0042 100644
--- a/src/librbd/ExclusiveLock.cc
+++ b/src/librbd/ExclusiveLock.cc
@@ -373,7 +373,8 @@ void ExclusiveLock<I>::handle_acquire_lock(int r) {
 
     Action action = get_active_action();
     assert(action == ACTION_TRY_LOCK || action == ACTION_REQUEST_LOCK);
-    if (action == ACTION_REQUEST_LOCK && r < 0 && r != -EBLACKLISTED) {
+    if (action == ACTION_REQUEST_LOCK && r < 0 && r != -EBLACKLISTED &&
+        r != -EPERM) {
       m_state = STATE_WAITING_FOR_PEER;
       m_lock.Unlock();
 
diff --git a/src/librbd/ImageCtx.cc b/src/librbd/ImageCtx.cc
index 0f9b5a5..770e871 100644
--- a/src/librbd/ImageCtx.cc
+++ b/src/librbd/ImageCtx.cc
@@ -15,11 +15,13 @@
 #include "librbd/AsyncOperation.h"
 #include "librbd/AsyncRequest.h"
 #include "librbd/ExclusiveLock.h"
+#include "librbd/exclusive_lock/StandardPolicy.h"
 #include "librbd/internal.h"
 #include "librbd/ImageCtx.h"
 #include "librbd/ImageState.h"
 #include "librbd/ImageWatcher.h"
 #include "librbd/Journal.h"
+#include "librbd/journal/StandardPolicy.h"
 #include "librbd/LibrbdAdminSocketHook.h"
 #include "librbd/ObjectMap.h"
 #include "librbd/Operations.h"
@@ -187,6 +189,9 @@ struct C_InvalidateCache : public Context {
     op_work_queue = new ContextWQ("librbd::op_work_queue",
                                   cct->_conf->rbd_op_thread_timeout,
                                   thread_pool_singleton);
+
+    exclusive_lock_policy = new exclusive_lock::StandardPolicy(this);
+    journal_policy = new journal::StandardPolicy(this);
   }
 
   ImageCtx::~ImageCtx() {
@@ -218,6 +223,8 @@ struct C_InvalidateCache : public Context {
     op_work_queue->drain();
     aio_work_queue->drain();
 
+    delete journal_policy;
+    delete exclusive_lock_policy;
     delete op_work_queue;
     delete aio_work_queue;
     delete operations;
@@ -1046,4 +1053,30 @@ struct C_InvalidateCache : public Context {
     state->handle_update_notification();
     image_watcher->notify_header_update(on_finish);
   }
+
+  exclusive_lock::Policy *ImageCtx::get_exclusive_lock_policy() const {  // returns the installed policy; ownership stays with ImageCtx
+    assert(owner_lock.is_locked());  // caller must already hold owner_lock
+    assert(exclusive_lock_policy != nullptr);
+    return exclusive_lock_policy;
+  }
+
+  void ImageCtx::set_exclusive_lock_policy(exclusive_lock::Policy *policy) {  // replaces the policy and takes ownership of it
+    assert(owner_lock.is_wlocked());  // caller must hold owner_lock for write
+    assert(policy != nullptr);
+    delete exclusive_lock_policy;  // NOTE(review): passing the currently installed policy would leave a dangling pointer
+    exclusive_lock_policy = policy;
+  }
+
+  journal::Policy *ImageCtx::get_journal_policy() const {  // returns the installed policy; ownership stays with ImageCtx
+    assert(snap_lock.is_locked());  // caller must already hold snap_lock (not owner_lock)
+    assert(journal_policy != nullptr);
+    return journal_policy;
+  }
+
+  void ImageCtx::set_journal_policy(journal::Policy *policy) {  // replaces the policy and takes ownership of it
+    assert(snap_lock.is_wlocked());  // caller must hold snap_lock for write
+    assert(policy != nullptr);
+    delete journal_policy;  // NOTE(review): passing the currently installed policy would leave a dangling pointer
+    journal_policy = policy;
+  }
 }
diff --git a/src/librbd/ImageCtx.h b/src/librbd/ImageCtx.h
index a302a14..3b58c66 100644
--- a/src/librbd/ImageCtx.h
+++ b/src/librbd/ImageCtx.h
@@ -52,6 +52,9 @@ namespace librbd {
   class ObjectMap;
   template <typename> class Operations;
 
+  namespace exclusive_lock { struct Policy; }
+  namespace journal { struct Policy; }
+
   namespace operation {
   template <typename> class ResizeRequest;
   }
@@ -185,9 +188,22 @@ namespace librbd {
 
     LibrbdAdminSocketHook *asok_hook;
 
+    exclusive_lock::Policy *exclusive_lock_policy = nullptr;
+    journal::Policy *journal_policy = nullptr;
+
     static bool _filter_metadata_confs(const string &prefix, std::map<string, bool> &configs,
                                        map<string, bufferlist> &pairs, map<string, bufferlist> *res);
 
+    // unit test mock helpers
+    static ImageCtx* create(const std::string &image_name,  // factory that forwards its arguments unchanged to the constructor
+                            const std::string &image_id,
+                            const char *snap, IoCtx& p, bool read_only) {
+      return new ImageCtx(image_name, image_id, snap, p, read_only);  // heap-allocated; presumably released via destroy()
+    }
+    void destroy() {  // counterpart of create(): deletes this instance
+      delete this;  // the object must not be touched after this call
+    }
+
     /**
      * Either image_name or image_id must be set.
      * If id is not known, pass the empty std::string,
@@ -281,6 +297,12 @@ namespace librbd {
 
     void notify_update();
     void notify_update(Context *on_finish);
+
+    exclusive_lock::Policy *get_exclusive_lock_policy() const;
+    void set_exclusive_lock_policy(exclusive_lock::Policy *policy);
+
+    journal::Policy *get_journal_policy() const;
+    void set_journal_policy(journal::Policy *policy);
   };
 }
 
diff --git a/src/librbd/ImageWatcher.cc b/src/librbd/ImageWatcher.cc
index 0734490..9579723 100644
--- a/src/librbd/ImageWatcher.cc
+++ b/src/librbd/ImageWatcher.cc
@@ -11,6 +11,7 @@
 #include "librbd/Operations.h"
 #include "librbd/TaskFinisher.h"
 #include "librbd/Utils.h"
+#include "librbd/exclusive_lock/Policy.h"
 #include "librbd/image_watcher/Notifier.h"
 #include "librbd/image_watcher/NotifyLockOwner.h"
 #include "include/encoding.h"
@@ -407,7 +408,7 @@ void ImageWatcher::notify_request_lock() {
   ldout(m_image_ctx.cct, 10) << this << " notify request lock" << dendl;
 
   bufferlist bl;
-  ::encode(NotifyMessage(RequestLockPayload(get_client_id())), bl);
+  ::encode(NotifyMessage(RequestLockPayload(get_client_id(), false)), bl);
   notify_lock_owner(std::move(bl), create_context_callback<
     ImageWatcher, &ImageWatcher::handle_request_lock>(this));
 }
@@ -615,7 +616,7 @@ bool ImageWatcher::handle_payload(const RequestLockPayload &payload,
 
     ldout(m_image_ctx.cct, 10) << this << " queuing release of exclusive lock"
                                << dendl;
-    m_image_ctx.exclusive_lock->release_lock(nullptr);
+    m_image_ctx.get_exclusive_lock_policy()->lock_requested(payload.force);
   }
   return true;
 }
diff --git a/src/librbd/Journal.cc b/src/librbd/Journal.cc
index ec000f9..df30cb3 100644
--- a/src/librbd/Journal.cc
+++ b/src/librbd/Journal.cc
@@ -162,6 +162,7 @@ public:
 
 template <typename I, typename J>
 int open_journaler(I *image_ctx, J *journaler, bool *initialized,
+                   cls::journal::Client *client,
                    journal::ImageClientMeta *client_meta,
                    journal::TagData *tag_data) {
   C_SaferCond init_ctx;
@@ -172,14 +173,13 @@ int open_journaler(I *image_ctx, J *journaler, bool *initialized,
     return r;
   }
 
-  cls::journal::Client client;
-  r = journaler->get_cached_client(Journal<ImageCtx>::IMAGE_CLIENT_ID, &client);
+  r = journaler->get_cached_client(Journal<ImageCtx>::IMAGE_CLIENT_ID, client);
   if (r < 0) {
     return r;
   }
 
   librbd::journal::ClientData client_data;
-  bufferlist::iterator bl_it = client.data.begin();
+  bufferlist::iterator bl_it = client->data.begin();
   try {
     ::decode(client_data, bl_it);
   } catch (const buffer::error &err) {
@@ -207,6 +207,37 @@ int open_journaler(I *image_ctx, J *journaler, bool *initialized,
   return 0;
 }
 
+template <typename J>  // J: journaler type providing allocate_tag(tag_class, bl, tag, ctx)
+int allocate_journaler_tag(CephContext *cct, J *journaler,  // builds TagData, then allocates a tag synchronously
+                           const cls::journal::Client &client,  // source of the predecessor commit position
+                           uint64_t tag_class,
+                           const journal::TagData &prev_tag_data,  // only its mirror_uuid is read
+                           const std::string &mirror_uuid,  // owner recorded in the new tag
+                           cls::journal::Tag *new_tag) {  // out: the allocated tag; returns 0 or a negative error
+  journal::TagData tag_data;
+  if (!client.commit_position.object_positions.empty()) {  // otherwise the predecessor fields keep TagData's defaults
+    auto position = client.commit_position.object_positions.front();  // first recorded position is used as the predecessor
+    tag_data.predecessor_commit_valid = true;
+    tag_data.predecessor_tag_tid = position.tag_tid;
+    tag_data.predecessor_entry_tid = position.entry_tid;
+  }
+  tag_data.predecessor_mirror_uuid = prev_tag_data.mirror_uuid;  // previous owner becomes the predecessor
+  tag_data.mirror_uuid = mirror_uuid;
+
+  bufferlist tag_bl;
+  ::encode(tag_data, tag_bl);
+
+  C_SaferCond allocate_tag_ctx;
+  journaler->allocate_tag(tag_class, tag_bl, new_tag, &allocate_tag_ctx);
+
+  int r = allocate_tag_ctx.wait();  // blocks the calling thread until the allocation completes
+  if (r < 0) {
+    lderr(cct) << "failed to allocate tag: " << cpp_strerror(r) << dendl;  // logged here, so callers need not log again
+    return r;
+  }
+  return 0;
+}
+
 } // anonymous namespace
 
 using util::create_async_context_callback;
@@ -313,7 +344,8 @@ bool Journal<I>::is_journal_supported(I &image_ctx) {
 template <typename I>
 int Journal<I>::create(librados::IoCtx &io_ctx, const std::string &image_id,
 		       uint8_t order, uint8_t splay_width,
-		       const std::string &object_pool) {
+		       const std::string &object_pool, bool non_primary,
+                       const std::string &primary_mirror_uuid) {
   CephContext *cct = reinterpret_cast<CephContext *>(io_ctx.cct());
   ldout(cct, 5) << __func__ << ": image=" << image_id << dendl;
 
@@ -340,19 +372,16 @@ int Journal<I>::create(librados::IoCtx &io_ctx, const std::string &image_id,
     return r;
   }
 
-  // create tag class for this image's journal events
-  bufferlist tag_data;
-  ::encode(journal::TagData(), tag_data);
-
-  C_SaferCond tag_ctx;
+  cls::journal::Client client;
   cls::journal::Tag tag;
-  journaler.allocate_tag(cls::journal::Tag::TAG_CLASS_NEW, tag_data,
-                         &tag, &tag_ctx);
-  r = tag_ctx.wait();
-  if (r < 0) {
-    lderr(cct) << "failed to allocate journal tag: " << cpp_strerror(r)
-               << dendl;
-  }
+  journal::TagData tag_data;
+
+  assert(non_primary ^ primary_mirror_uuid.empty());
+  std::string mirror_uuid = (non_primary ? primary_mirror_uuid :
+                                           LOCAL_MIRROR_UUID);
+  r = allocate_journaler_tag(cct, &journaler, client,
+                             cls::journal::Tag::TAG_CLASS_NEW,
+                             tag_data, mirror_uuid, &tag);
 
   bufferlist client_data;
   ::encode(journal::ClientData{journal::ImageClientMeta{tag.tag_class}},
@@ -425,21 +454,27 @@ int Journal<I>::reset(librados::IoCtx &io_ctx, const std::string &image_id) {
   int64_t pool_id;
   journaler.get_metadata(&order, &splay_width, &pool_id);
 
+  std::string pool_name;
+  if (pool_id != -1) {
+    librados::Rados rados(io_ctx);
+    r = rados.pool_reverse_lookup(pool_id, &pool_name);
+    if (r < 0) {
+      lderr(cct) << "failed to lookup data pool: " << cpp_strerror(r) << dendl;
+      return r;
+    }
+  }
+
   r = journaler.remove(true);
   if (r < 0) {
     lderr(cct) << "failed to reset journal: " << cpp_strerror(r) << dendl;
     return r;
   }
-  r = journaler.create(order, splay_width, pool_id);
+
+  r = create(io_ctx, image_id, order, splay_width, pool_name, false, "");
   if (r < 0) {
     lderr(cct) << "failed to create journal: " << cpp_strerror(r) << dendl;
     return r;
   }
-  r = journaler.register_client(bufferlist());
-  if (r < 0) {
-    lderr(cct) << "failed to register client: " << cpp_strerror(r) << dendl;
-    return r;
-  }
   return 0;
 }
 
@@ -464,10 +499,11 @@ int Journal<I>::get_tag_owner(I *image_ctx, std::string *mirror_uuid) {
                       image_ctx->cct->_conf->rbd_journal_commit_age);
 
   bool initialized;
+  cls::journal::Client client;
   journal::ImageClientMeta client_meta;
   journal::TagData tag_data;
-  int r = open_journaler(image_ctx, &journaler, &initialized, &client_meta,
-                         &tag_data);
+  int r = open_journaler(image_ctx, &journaler, &initialized, &client,
+                         &client_meta, &tag_data);
   if (r >= 0) {
     *mirror_uuid = tag_data.mirror_uuid;
   }
@@ -479,18 +515,19 @@ int Journal<I>::get_tag_owner(I *image_ctx, std::string *mirror_uuid) {
 }
 
 template <typename I>
-int Journal<I>::allocate_tag(I *image_ctx, const std::string &mirror_uuid) {
+int Journal<I>::request_resync(I *image_ctx) {
   CephContext *cct = image_ctx->cct;
-  ldout(cct, 20) << __func__ << ": mirror_uuid=" << mirror_uuid << dendl;
+  ldout(cct, 20) << __func__ << dendl;
 
   Journaler journaler(image_ctx->md_ctx, image_ctx->id, IMAGE_CLIENT_ID,
                       image_ctx->cct->_conf->rbd_journal_commit_age);
 
   bool initialized;
+  cls::journal::Client client;
   journal::ImageClientMeta client_meta;
   journal::TagData tag_data;
-  int r = open_journaler(image_ctx, &journaler, &initialized, &client_meta,
-                         &tag_data);
+  int r = open_journaler(image_ctx, &journaler, &initialized, &client,
+                         &client_meta, &tag_data);
   BOOST_SCOPE_EXIT_ALL(&journaler, &initialized) {
     if (initialized) {
       journaler.shut_down();
@@ -501,29 +538,25 @@ int Journal<I>::allocate_tag(I *image_ctx, const std::string &mirror_uuid) {
     return r;
   }
 
-  // TODO: inject current commit position into tag data
-  tag_data.mirror_uuid = mirror_uuid;
-  tag_data.predecessor_mirror_uuid = mirror_uuid;
+  client_meta.resync_requested = true;
 
-  bufferlist tag_bl;
-  ::encode(tag_data, tag_bl);
+  journal::ClientData client_data(client_meta);
+  bufferlist client_data_bl;
+  ::encode(client_data, client_data_bl);
 
-  C_SaferCond allocate_tag_ctx;
-  cls::journal::Tag tag;
-  journaler.allocate_tag(client_meta.tag_class, tag_bl, &tag,
-                         &allocate_tag_ctx);
+  C_SaferCond update_client_ctx;
+  journaler.update_client(client_data_bl, &update_client_ctx);
 
-  r = allocate_tag_ctx.wait();
+  r = update_client_ctx.wait();
   if (r < 0) {
-    lderr(cct) << "failed to allocate tag: " << cpp_strerror(r) << dendl;
+    lderr(cct) << "failed to update client: " << cpp_strerror(r) << dendl;
     return r;
   }
-
   return 0;
 }
 
 template <typename I>
-int Journal<I>::request_resync(I *image_ctx) {
+int Journal<I>::promote(I *image_ctx) {
   CephContext *cct = image_ctx->cct;
   ldout(cct, 20) << __func__ << dendl;
 
@@ -531,10 +564,11 @@ int Journal<I>::request_resync(I *image_ctx) {
                       image_ctx->cct->_conf->rbd_journal_commit_age);
 
   bool initialized;
+  cls::journal::Client client;
   journal::ImageClientMeta client_meta;
   journal::TagData tag_data;
-  int r = open_journaler(image_ctx, &journaler, &initialized, &client_meta,
-                         &tag_data);
+  int r = open_journaler(image_ctx, &journaler, &initialized, &client,
+                         &client_meta, &tag_data);
   BOOST_SCOPE_EXIT_ALL(&journaler, &initialized) {
     if (initialized) {
       journaler.shut_down();
@@ -545,20 +579,13 @@ int Journal<I>::request_resync(I *image_ctx) {
     return r;
   }
 
-  client_meta.resync_requested = true;
-
-  journal::ClientData client_data(client_meta);
-  bufferlist client_data_bl;
-  ::encode(client_data, client_data_bl);
-
-  C_SaferCond update_client_ctx;
-  journaler.update_client(client_data_bl, &update_client_ctx);
-
-  r = update_client_ctx.wait();
+  cls::journal::Tag new_tag;
+  r = allocate_journaler_tag(cct, &journaler, client, client_meta.tag_class,
+                             tag_data, LOCAL_MIRROR_UUID, &new_tag);
   if (r < 0) {
-    lderr(cct) << "failed to update client: " << cpp_strerror(r) << dendl;
     return r;
   }
+
   return 0;
 }
 
@@ -629,22 +656,124 @@ bool Journal<I>::is_tag_owner() const {
 }
 
 template <typename I>
+journal::TagData Journal<I>::get_tag_data() const {  // returns a copy of the cached tag data
+  return m_tag_data;  // NOTE(review): read without taking m_lock here, while demote() writes m_tag_data under m_lock
+}
+
+template <typename I>
+int Journal<I>::demote() {  // synchronous: allocates an orphan-owned tag, appends a DemoteEvent, flushes the commit position
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << __func__ << dendl;
+
+  Mutex::Locker locker(m_lock);  // NOTE(review): m_lock stays held across every blocking wait() below
+  assert(m_journaler != nullptr && is_tag_owner());  // precondition: journal open and tag currently owned by this image
+
+  cls::journal::Client client;
+  int r = m_journaler->get_cached_client(IMAGE_CLIENT_ID, &client);
+  if (r < 0) {
+    lderr(cct) << "failed to retrieve client: " << cpp_strerror(r) << dendl;
+    return r;
+  }
+
+  cls::journal::Tag new_tag;
+  r = allocate_journaler_tag(cct, m_journaler, client, m_tag_class,
+                             m_tag_data, ORPHAN_MIRROR_UUID, &new_tag);  // new tag is owned by ORPHAN_MIRROR_UUID
+  if (r < 0) {
+    return r;  // the helper has already logged the failure
+  }
+
+  bufferlist::iterator tag_data_bl_it = new_tag.data.begin();
+  r = C_DecodeTag::decode(&tag_data_bl_it, &m_tag_data);  // refresh m_tag_data from the tag just allocated
+  if (r < 0) {
+    lderr(cct) << "failed to decode newly allocated tag" << dendl;
+    return r;
+  }
+
+  journal::EventEntry event_entry{journal::DemoteEvent{}};
+  bufferlist event_entry_bl;
+  ::encode(event_entry, event_entry_bl);
+
+  m_tag_tid = new_tag.tid;  // the demote event below is appended under the new tag
+  Future future = m_journaler->append(m_tag_tid, event_entry_bl);
+  C_SaferCond ctx;
+  future.flush(&ctx);
+
+  r = ctx.wait();  // blocks until the appended event has been flushed
+  if (r < 0) {
+    lderr(cct) << "failed to append demotion journal event: " << cpp_strerror(r)
+               << dendl;
+    return r;
+  }
+
+  m_journaler->committed(future);  // mark the demote event committed, then persist the commit position
+  C_SaferCond flush_ctx;
+  m_journaler->flush_commit_position(&flush_ctx);
+
+  r = flush_ctx.wait();
+  if (r < 0) {
+    lderr(cct) << "failed to flush demotion commit position: "
+               << cpp_strerror(r) << dendl;
+    return r;
+  }
+
+  return 0;
+}
+
+template <typename I>
+void Journal<I>::allocate_local_tag(Context *on_finish) {  // async: allocates a locally-owned tag chained to our own commit position
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << this << " " << __func__ << dendl;
+
+  bool predecessor_commit_valid = false;  // stays false when no commit position is recorded
+  uint64_t predecessor_tag_tid = 0;
+  uint64_t predecessor_entry_tid = 0;
+  {  // scope for m_lock: it must be dropped before allocate_tag() takes it again
+    Mutex::Locker locker(m_lock);
+    assert(m_journaler != nullptr && is_tag_owner());
+
+    cls::journal::Client client;
+    int r = m_journaler->get_cached_client(IMAGE_CLIENT_ID, &client);
+    if (r < 0) {
+      lderr(cct) << "failed to retrieve client: " << cpp_strerror(r) << dendl;
+      m_image_ctx.op_work_queue->queue(on_finish, r);  // fail via the work queue rather than completing inline
+      return;
+    }
+
+    // since we are primary, populate the predecessor with our known commit
+    // position
+    assert(m_tag_data.mirror_uuid == LOCAL_MIRROR_UUID);
+    if (!client.commit_position.object_positions.empty()) {
+      auto position = client.commit_position.object_positions.front();  // first recorded position is used
+      predecessor_commit_valid = true;
+      predecessor_tag_tid = position.tag_tid;
+      predecessor_entry_tid = position.entry_tid;
+    }
+  }
+
+  allocate_tag(LOCAL_MIRROR_UUID, LOCAL_MIRROR_UUID, predecessor_commit_valid,  // owner and predecessor are both the local uuid
+               predecessor_tag_tid, predecessor_entry_tid, on_finish);
+}
+
+template <typename I>
 void Journal<I>::allocate_tag(const std::string &mirror_uuid,
+                              const std::string &predecessor_mirror_uuid,
+                              bool predecessor_commit_valid,
+                              uint64_t predecessor_tag_tid,
+                              uint64_t predecessor_entry_tid,
                               Context *on_finish) {
   CephContext *cct = m_image_ctx.cct;
   ldout(cct, 20) << this << " " << __func__ << ":  mirror_uuid=" << mirror_uuid
                  << dendl;
 
   Mutex::Locker locker(m_lock);
-  assert(m_journaler != nullptr && is_tag_owner());
+  assert(m_journaler != nullptr);
 
-  // NOTE: currently responsibility of caller to provide local mirror
-  // uuid constant or remote peer uuid
   journal::TagData tag_data;
   tag_data.mirror_uuid = mirror_uuid;
-
-  // TODO: inject current commit position into tag data (need updated journaler PR)
-  tag_data.predecessor_mirror_uuid = m_tag_data.mirror_uuid;
+  tag_data.predecessor_mirror_uuid = predecessor_mirror_uuid;
+  tag_data.predecessor_commit_valid = predecessor_commit_valid;
+  tag_data.predecessor_tag_tid = predecessor_tag_tid;
+  tag_data.predecessor_entry_tid = predecessor_entry_tid;
 
   bufferlist tag_bl;
   ::encode(tag_data, tag_bl);
diff --git a/src/librbd/Journal.h b/src/librbd/Journal.h
index 065b202..424ef0e 100644
--- a/src/librbd/Journal.h
+++ b/src/librbd/Journal.h
@@ -14,6 +14,7 @@
 #include "journal/ReplayEntry.h"
 #include "journal/ReplayHandler.h"
 #include "librbd/journal/Types.h"
+#include "librbd/journal/TypeTraits.h"
 #include <algorithm>
 #include <iosfwd>
 #include <list>
@@ -33,19 +34,7 @@ class AioCompletion;
 class AioObjectRequest;
 class ImageCtx;
 
-namespace journal {
-
-template <typename> class Replay;
-
-template <typename ImageCtxT>
-struct TypeTraits {
-  typedef ::journal::Journaler Journaler;
-  typedef ::journal::Future Future;
-  typedef ::journal::ReplayEntry ReplayEntry;
-};
-
-} // namespace journal
-
+namespace journal { template <typename> class Replay; }
 
 template <typename ImageCtxT = ImageCtx>
 class Journal {
@@ -106,14 +95,15 @@ public:
   static bool is_journal_supported(ImageCtxT &image_ctx);
   static int create(librados::IoCtx &io_ctx, const std::string &image_id,
 		    uint8_t order, uint8_t splay_width,
-		    const std::string &object_pool);
+		    const std::string &object_pool, bool non_primary,
+                    const std::string &primary_mirror_uuid);
   static int remove(librados::IoCtx &io_ctx, const std::string &image_id);
   static int reset(librados::IoCtx &io_ctx, const std::string &image_id);
 
   static int is_tag_owner(ImageCtxT *image_ctx, bool *is_tag_owner);
   static int get_tag_owner(ImageCtxT *image_ctx, std::string *mirror_uuid);
-  static int allocate_tag(ImageCtxT *image_ctx, const std::string &mirror_uuid);
   static int request_resync(ImageCtxT *image_ctx);
+  static int promote(ImageCtxT *image_ctx);
 
   bool is_journal_ready() const;
   bool is_journal_replaying() const;
@@ -124,7 +114,14 @@ public:
   void close(Context *on_finish);
 
   bool is_tag_owner() const;
-  void allocate_tag(const std::string &mirror_uuid, Context *on_finish);
+  journal::TagData get_tag_data() const;
+  int demote();
+
+  void allocate_local_tag(Context *on_finish);
+  void allocate_tag(const std::string &mirror_uuid,
+                    const std::string &predecessor_mirror_uuid,
+                    bool predecessor_commit_valid, uint64_t predecessor_tag_tid,
+                    uint64_t predecessor_entry_tid, Context *on_finish);
 
   void flush_commit_position(Context *on_finish);
 
diff --git a/src/librbd/Makefile.am b/src/librbd/Makefile.am
index 2cc5fdf..08c9738 100644
--- a/src/librbd/Makefile.am
+++ b/src/librbd/Makefile.am
@@ -1,5 +1,6 @@
 librbd_types_la_SOURCES = \
 	librbd/journal/Types.cc \
+	librbd/mirroring_watcher/Types.cc \
 	librbd/WatchNotifyTypes.cc
 noinst_LTLIBRARIES += librbd_types.la
 
@@ -25,11 +26,14 @@ librbd_internal_la_SOURCES = \
 	librbd/Journal.cc \
 	librbd/LibrbdAdminSocketHook.cc \
 	librbd/LibrbdWriteback.cc \
+	librbd/MirroringWatcher.cc \
 	librbd/ObjectMap.cc \
+	librbd/ObjectWatcher.cc \
 	librbd/Operations.cc \
 	librbd/Utils.cc \
 	librbd/exclusive_lock/AcquireRequest.cc \
 	librbd/exclusive_lock/ReleaseRequest.cc \
+	librbd/exclusive_lock/StandardPolicy.cc \
 	librbd/image/CloseRequest.cc \
 	librbd/image/OpenRequest.cc \
 	librbd/image/RefreshParentRequest.cc \
@@ -38,6 +42,7 @@ librbd_internal_la_SOURCES = \
 	librbd/image_watcher/Notifier.cc \
 	librbd/image_watcher/NotifyLockOwner.cc \
 	librbd/journal/Replay.cc \
+	librbd/journal/StandardPolicy.cc \
 	librbd/object_map/InvalidateRequest.cc \
 	librbd/object_map/LockRequest.cc \
 	librbd/object_map/Request.cc \
@@ -102,7 +107,9 @@ noinst_HEADERS += \
 	librbd/Journal.h \
 	librbd/LibrbdAdminSocketHook.h \
 	librbd/LibrbdWriteback.h \
+	librbd/MirroringWatcher.h \
 	librbd/ObjectMap.h \
+	librbd/ObjectWatcher.h \
 	librbd/Operations.h \
 	librbd/parent_types.h \
 	librbd/SnapInfo.h \
@@ -110,7 +117,9 @@ noinst_HEADERS += \
 	librbd/Utils.h \
 	librbd/WatchNotifyTypes.h \
 	librbd/exclusive_lock/AcquireRequest.h \
+	librbd/exclusive_lock/Policy.h \
 	librbd/exclusive_lock/ReleaseRequest.h \
+	librbd/exclusive_lock/StandardPolicy.h \
 	librbd/image/CloseRequest.h \
 	librbd/image/OpenRequest.h \
 	librbd/image/RefreshParentRequest.h \
@@ -118,8 +127,12 @@ noinst_HEADERS += \
 	librbd/image/SetSnapRequest.h \
 	librbd/image_watcher/Notifier.h \
 	librbd/image_watcher/NotifyLockOwner.h \
+	librbd/journal/Policy.h \
 	librbd/journal/Replay.h \
+	librbd/journal/StandardPolicy.h \
 	librbd/journal/Types.h \
+	librbd/journal/TypeTraits.h \
+	librbd/mirroring_watcher/Types.h \
 	librbd/object_map/InvalidateRequest.h \
 	librbd/object_map/LockRequest.h \
 	librbd/object_map/Request.h \
diff --git a/src/librbd/MirroringWatcher.cc b/src/librbd/MirroringWatcher.cc
new file mode 100644
index 0000000..c414478
--- /dev/null
+++ b/src/librbd/MirroringWatcher.cc
@@ -0,0 +1,121 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/MirroringWatcher.h"
+#include "include/rbd_types.h"
+#include "common/errno.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::MirroringWatcher: "
+
+namespace librbd {
+
+using namespace mirroring_watcher;
+
+namespace {
+// timeout (milliseconds, per the name) handed to IoCtx::notify2() below
+static const uint64_t NOTIFY_TIMEOUT_MS = 5000;
+
+} // anonymous namespace
+
+// Forwards the IoCtx and work queue to the ObjectWatcher base; nothing else.
+template <typename I>
+MirroringWatcher<I>::MirroringWatcher(librados::IoCtx &io_ctx,
+                                      ContextWQT *work_queue)
+  : ObjectWatcher<I>(io_ctx, work_queue) {}
+
+template <typename I>
+std::string MirroringWatcher<I>::get_oid() const {
+  return RBD_MIRRORING;  // same object the static notify_* helpers target
+}
+
+template <typename I>
+int MirroringWatcher<I>::notify_mode_updated(librados::IoCtx &io_ctx,
+                                             cls::rbd::MirrorMode mirror_mode) {
+  // broadcast the new mode to every watcher of the RBD_MIRRORING object
+  CephContext *cct = reinterpret_cast<CephContext*>(io_ctx.cct());
+  ldout(cct, 20) << dendl;
+
+  bufferlist msg_bl;
+  ::encode(NotifyMessage{ModeUpdatedPayload{mirror_mode}}, msg_bl);
+  int rc = io_ctx.notify2(RBD_MIRRORING, msg_bl, NOTIFY_TIMEOUT_MS, nullptr);
+  if (rc >= 0) {
+    return 0;  // any non-negative result is reported as success
+  }
+  lderr(cct) << ": error encountered sending mode updated notification: "
+             << cpp_strerror(rc) << dendl;
+  return rc;
+}
+
+template <typename I>
+int MirroringWatcher<I>::notify_image_updated(
+    librados::IoCtx &io_ctx, cls::rbd::MirrorImageState mirror_image_state,
+    const std::string &image_id, const std::string &global_image_id) {
+  // broadcast the image's new mirror state to RBD_MIRRORING watchers
+  CephContext *cct = reinterpret_cast<CephContext*>(io_ctx.cct());
+  ldout(cct, 20) << dendl;
+
+  NotifyMessage message{ImageUpdatedPayload{mirror_image_state, image_id,
+                                            global_image_id}};
+  bufferlist msg_bl;
+  ::encode(message, msg_bl);
+  int rc = io_ctx.notify2(RBD_MIRRORING, msg_bl, NOTIFY_TIMEOUT_MS, nullptr);
+  if (rc >= 0) {
+    return 0;  // any non-negative result is reported as success
+  }
+  lderr(cct) << ": error encountered sending image updated notification: "
+             << cpp_strerror(rc) << dendl;
+  return rc;
+}
+
+template <typename I>
+void MirroringWatcher<I>::handle_notify(uint64_t notify_id, uint64_t handle,
+                                        bufferlist &bl) {
+  CephContext *cct = this->m_cct;
+  ldout(cct, 15) << ": notify_id=" << notify_id << ", "
+                 << "handle=" << handle << dendl;
+
+  // completing this context sends the acknowledgement for the notification
+  typedef typename ObjectWatcher<I>::C_NotifyAck NotifyAck;
+  Context *ack_ctx = new NotifyAck(this, notify_id, handle);
+
+  NotifyMessage message;
+  try {
+    bufferlist::iterator it = bl.begin();
+    ::decode(message, it);
+  } catch (const buffer::error &err) {
+    lderr(cct) << ": error decoding image notification: " << err.what()
+               << dendl;
+    ack_ctx->complete(0);  // ack even though the payload was unusable
+    return;
+  }
+  apply_visitor(HandlePayloadVisitor(this, ack_ctx), message.payload);
+}
+
+template <typename I>
+void MirroringWatcher<I>::handle_payload(const ModeUpdatedPayload &payload,
+                                         Context *on_notify_ack) {
+  // log the new mode, then hand it (and the ack) to the subclass hook
+  ldout(this->m_cct, 20) << ": mode updated: " << payload.mirror_mode << dendl;
+  handle_mode_updated(payload.mirror_mode, on_notify_ack);
+}
+
+template <typename I>
+void MirroringWatcher<I>::handle_payload(const ImageUpdatedPayload &payload,
+                                         Context *on_notify_ack) {
+  // forward the decoded image state change (and the ack) to the subclass hook
+  ldout(this->m_cct, 20) << ": image state updated" << dendl;
+  handle_image_updated(payload.mirror_image_state, payload.image_id,
+                       payload.global_image_id, on_notify_ack);
+}
+
+template <typename I>
+void MirroringWatcher<I>::handle_payload(const UnknownPayload &payload,
+                                         Context *on_notify_ack) {
+  on_notify_ack->complete(0);  // nothing to dispatch for this payload: just ack
+}
+
+} // namespace librbd
+
+template class librbd::MirroringWatcher<librbd::ImageCtx>;
diff --git a/src/librbd/MirroringWatcher.h b/src/librbd/MirroringWatcher.h
new file mode 100644
index 0000000..f2ec61a
--- /dev/null
+++ b/src/librbd/MirroringWatcher.h
@@ -0,0 +1,72 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_MIRRORING_WATCHER_H
+#define CEPH_LIBRBD_MIRRORING_WATCHER_H
+
+#include "include/int_types.h"
+#include "include/rados/librados.hpp"
+#include "cls/rbd/cls_rbd_types.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ObjectWatcher.h"
+#include "librbd/mirroring_watcher/Types.h"
+
+namespace librbd {
+
+template <typename ImageCtxT = librbd::ImageCtx>  // watcher for RBD_MIRRORING
+class MirroringWatcher : public ObjectWatcher<ImageCtxT> {
+public:
+  typedef typename std::decay<decltype(*ImageCtxT::op_work_queue)>::type ContextWQT;
+
+  MirroringWatcher(librados::IoCtx &io_ctx, ContextWQT *work_queue);
+  // synchronous senders: return 0 on success or a negative error code
+  static int notify_mode_updated(librados::IoCtx &io_ctx,
+                                 cls::rbd::MirrorMode mirror_mode);
+  static int notify_image_updated(librados::IoCtx &io_ctx,
+                                  cls::rbd::MirrorImageState mirror_image_state,
+                                  const std::string &image_id,
+                                  const std::string &global_image_id);
+  // subclass hooks, one per decoded payload type; on_ack is the C_NotifyAck
+  virtual void handle_mode_updated(cls::rbd::MirrorMode mirror_mode,
+                                   Context *on_ack) = 0;
+  virtual void handle_image_updated(cls::rbd::MirrorImageState state,
+                                    const std::string &image_id,
+                                    const std::string &global_image_id,
+                                    Context *on_ack) = 0;
+
+protected:
+  virtual std::string get_oid() const;
+  // decodes the NotifyMessage and dispatches it through HandlePayloadVisitor
+  virtual void handle_notify(uint64_t notify_id, uint64_t handle,
+                             bufferlist &bl);
+
+private:
+  struct HandlePayloadVisitor : public boost::static_visitor<void> {
+    MirroringWatcher *mirroring_watcher;
+    Context *on_notify_ack;
+
+    HandlePayloadVisitor(MirroringWatcher *mirroring_watcher,
+                         Context *on_notify_ack)
+      : mirroring_watcher(mirroring_watcher), on_notify_ack(on_notify_ack) {
+    }
+    // selects the handle_payload() overload matching the payload's type
+    template <typename Payload>
+    inline void operator()(const Payload &payload) const {
+      mirroring_watcher->handle_payload(payload, on_notify_ack);
+    }
+  };
+  // dispatch targets of HandlePayloadVisitor
+  void handle_payload(const mirroring_watcher::ModeUpdatedPayload &payload,
+                      Context *on_notify_ack);
+  void handle_payload(const mirroring_watcher::ImageUpdatedPayload &payload,
+                      Context *on_notify_ack);
+  void handle_payload(const mirroring_watcher::UnknownPayload &payload,
+                      Context *on_notify_ack);
+
+};
+
+} // namespace librbd
+
+extern template class librbd::MirroringWatcher<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_MIRRORING_WATCHER_H
diff --git a/src/librbd/ObjectWatcher.cc b/src/librbd/ObjectWatcher.cc
new file mode 100644
index 0000000..8bc99f6
--- /dev/null
+++ b/src/librbd/ObjectWatcher.cc
@@ -0,0 +1,348 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/ObjectWatcher.h"
+#include "include/Context.h"
+#include "common/errno.h"
+#include "common/WorkQueue.h"
+#include "librbd/Utils.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::ObjectWatcher: " << get_oid() << ": " \
+                           << __func__
+
+namespace librbd {
+
+using util::create_context_callback;
+using util::create_rados_safe_callback;
+
+namespace {
+
+struct C_UnwatchAndFlush : public Context {  // chains a watch flush after unwatch
+  librados::Rados rados;
+  Context *on_finish;
+  bool flushing = false;  // set once the watch flush has been issued
+  int ret_val = 0;        // first negative result passed to complete()
+
+  C_UnwatchAndFlush(librados::IoCtx &io_ctx, Context *on_finish)
+    : rados(io_ctx), on_finish(on_finish) {
+  }
+  // 1st call: issue aio_watch_flush(); 2nd call: finish with the saved ret_val
+  virtual void complete(int r) override {
+    if (ret_val == 0 && r < 0) {
+      ret_val = r;
+    }
+
+    if (!flushing) {
+      flushing = true;
+      // this object is also the flush callback, so complete() runs once more
+      librados::AioCompletion *aio_comp = create_rados_safe_callback(this);
+      r = rados.aio_watch_flush(aio_comp);
+      assert(r == 0);
+      aio_comp->release();
+    } else {
+      Context::complete(ret_val);
+    }
+  }
+
+  virtual void finish(int r) override {
+    on_finish->complete(r);
+  }
+};
+
+} // anonymous namespace
+
+template <typename I>
+ObjectWatcher<I>::ObjectWatcher(librados::IoCtx &io_ctx, ContextWQT *work_queue)
+  : m_io_ctx(io_ctx), m_cct(reinterpret_cast<CephContext*>(io_ctx.cct())),
+    m_work_queue(work_queue),
+    m_watch_lock(util::unique_lock_name("librbd::ObjectWatcher::m_watch_lock", this)),
+    m_watch_ctx(this) {  // WatchCtx routes librados callbacks back to this object
+}
+
+template <typename I>
+ObjectWatcher<I>::~ObjectWatcher() {
+  RWLock::RLocker watch_locker(m_watch_lock);
+  assert(m_watch_state == WATCH_STATE_UNREGISTERED);  // no watch may be live
+}
+
+template <typename I>
+void ObjectWatcher<I>::register_watch(Context *on_finish) {
+  ldout(m_cct, 5) << dendl;
+  // only legal from the UNREGISTERED state with no registration in flight
+  {
+    RWLock::WLocker locker(m_watch_lock);
+    assert(on_finish != nullptr);
+    assert(m_on_register_watch == nullptr);
+    assert(m_watch_state == WATCH_STATE_UNREGISTERED);
+    m_on_register_watch = on_finish;
+    m_watch_state = WATCH_STATE_REGISTERING;
+  }
+
+  using klass = ObjectWatcher<I>;
+  librados::AioCompletion *watch_comp =
+    create_rados_safe_callback<klass, &klass::handle_register_watch>(this);
+  int r = m_io_ctx.aio_watch(get_oid(), watch_comp, &m_watch_handle,
+                             &m_watch_ctx);
+  assert(r == 0);
+  watch_comp->release();
+}
+
+template <typename I>
+void ObjectWatcher<I>::handle_register_watch(int r) {
+  ldout(m_cct, 20) << ": r=" << r << dendl;
+
+  Context *on_finish = nullptr;
+  {
+    RWLock::WLocker locker(m_watch_lock);
+    assert(m_watch_state == WATCH_STATE_REGISTERING);
+    // take over the caller's callback while the lock is held
+    on_finish = m_on_register_watch;
+    m_on_register_watch = nullptr;
+    if (r >= 0) {
+      m_watch_state = WATCH_STATE_REGISTERED;
+    } else {
+      lderr(m_cct) << ": failed to register watch: " << cpp_strerror(r)
+                   << dendl;
+      m_watch_state = WATCH_STATE_UNREGISTERED;
+      m_watch_handle = 0;
+    }
+  }
+  on_finish->complete(r);  // fired only after the lock has been dropped
+}
+
+template <typename I>
+void ObjectWatcher<I>::unregister_watch(Context *on_finish) {
+  ldout(m_cct, 5) << dendl;
+  // requires a completed registration and no other unregister in flight
+  RWLock::WLocker watch_locker(m_watch_lock);
+  assert(on_finish != nullptr);
+  assert(m_on_unregister_watch == nullptr);
+  assert(m_watch_state != WATCH_STATE_UNREGISTERED &&
+         m_watch_state != WATCH_STATE_REGISTERING);
+  // if a re-registration is in flight, that path consumes the callback later
+  m_on_unregister_watch = on_finish;
+  if (m_watch_state == WATCH_STATE_REGISTERED) {
+    unregister_watch_();
+  }
+}
+
+template <typename I>
+void ObjectWatcher<I>::unregister_watch_() {
+  assert(m_watch_lock.is_wlocked());
+  assert(m_on_unregister_watch != nullptr);
+  assert(m_watch_state == WATCH_STATE_REGISTERED);
+  m_watch_state = WATCH_STATE_UNREGISTERING;
+  // sequence: unwatch -> watch flush -> handle_unregister_watch
+  using klass = ObjectWatcher<I>;
+  C_UnwatchAndFlush *on_unwatched = new C_UnwatchAndFlush(m_io_ctx,
+    create_context_callback<klass, &klass::handle_unregister_watch>(this));
+  librados::AioCompletion *comp = create_rados_safe_callback(on_unwatched);
+  int r = m_io_ctx.aio_unwatch(m_watch_handle, comp);
+  assert(r == 0);
+  comp->release();
+}
+
+template <typename I>
+void ObjectWatcher<I>::handle_unregister_watch(int r) {
+  ldout(m_cct, 20) << ": r=" << r << dendl;
+
+  Context *on_finish = nullptr;
+  {
+    RWLock::WLocker locker(m_watch_lock);
+    assert(m_watch_state == WATCH_STATE_UNREGISTERING);
+    if (r < 0) {
+      lderr(m_cct) << ": error encountered unregister watch: "
+                   << cpp_strerror(r) << dendl;
+    }
+
+    // the watch is treated as gone even when the unwatch reported an error
+    m_watch_handle = 0;
+    m_watch_state = WATCH_STATE_UNREGISTERED;
+    on_finish = m_on_unregister_watch;
+    m_on_unregister_watch = nullptr;
+  }
+  on_finish->complete(r);
+}
+
+template <typename I>
+void ObjectWatcher<I>::pre_unwatch(Context *on_finish) {
+  ldout(m_cct, 20) << dendl;
+  // default hook: nothing to do before the unwatch, so continue immediately
+  on_finish->complete(0);
+}
+
+template <typename I>
+void ObjectWatcher<I>::post_rewatch(Context *on_finish) {
+  ldout(m_cct, 20) << dendl;
+  // default hook: nothing to do after the re-watch, so continue immediately
+  on_finish->complete(0);
+}
+
+template <typename I>
+void ObjectWatcher<I>::acknowledge_notify(uint64_t notify_id, uint64_t handle,
+                                          bufferlist &out) {
+  ldout(m_cct, 15) << ": notify_id=" << notify_id << ", "
+                   << "handle=" << handle << dendl;
+  m_io_ctx.notify_ack(get_oid(), notify_id, handle, out);  // result not checked
+}
+
+template <typename I>
+void ObjectWatcher<I>::handle_error(uint64_t handle, int err) {
+  lderr(m_cct) << ": handle=" << handle << ", " << "err=" << err << dendl;
+
+  // watch errors are ignored unless the watch is fully registered
+  RWLock::WLocker locker(m_watch_lock);
+  if (m_watch_state == WATCH_STATE_REGISTERED) {
+    m_watch_state = WATCH_STATE_REREGISTERING;
+    // recovery starts on the work queue: pre_unwatch -> handle_pre_unwatch
+    Context *start_ctx = new FunctionContext([this](int r) {
+        assert(r == 0);
+        using klass = ObjectWatcher<I>;
+        pre_unwatch(create_context_callback<
+          klass, &klass::handle_pre_unwatch>(this));
+      });
+    m_work_queue->queue(start_ctx, 0);
+  }
+}
+
+template <typename I>
+void ObjectWatcher<I>::handle_pre_unwatch(int r) {
+  ldout(m_cct, 20) << dendl;
+  // pre_unwatch() implementations are required to complete with success
+  assert(r == 0);
+  unwatch();
+}
+
+template <typename I>
+void ObjectWatcher<I>::unwatch() {
+  ldout(m_cct, 20) << dendl;
+  {
+    RWLock::RLocker locker(m_watch_lock);
+    assert(m_watch_state == WATCH_STATE_REREGISTERING);
+  }
+
+  // sequence: unwatch -> watch flush -> handle_unwatch
+  using klass = ObjectWatcher<I>;
+  C_UnwatchAndFlush *on_unwatched = new C_UnwatchAndFlush(m_io_ctx,
+    create_context_callback<klass, &klass::handle_unwatch>(this));
+  librados::AioCompletion *comp = create_rados_safe_callback(on_unwatched);
+  int r = m_io_ctx.aio_unwatch(m_watch_handle, comp);
+  assert(r == 0);
+  comp->release();
+}
+
+template <typename I>
+void ObjectWatcher<I>::handle_unwatch(int r) {
+  ldout(m_cct, 20) << ": r=" << r << dendl;
+
+  if (r < 0) {
+    lderr(m_cct) << ": error encountered during unwatch: " << cpp_strerror(r)
+                 << dendl;
+  }
+
+  // a queued unregister request wins over re-establishing the watch;
+  // otherwise recovery continues regardless of the unwatch result
+  const bool unregistered = pending_unregister_watch(r);
+  if (!unregistered) {
+    rewatch();
+  }
+}
+
+template <typename I>
+void ObjectWatcher<I>::rewatch() {
+  ldout(m_cct, 20) << dendl;
+  {
+    RWLock::RLocker locker(m_watch_lock);
+    assert(m_watch_state == WATCH_STATE_REREGISTERING);
+  }
+
+  // re-establish the watch; aio_watch() is given &m_watch_handle to fill in
+  using klass = ObjectWatcher<I>;
+  librados::AioCompletion *comp =
+    create_rados_safe_callback<klass, &klass::handle_rewatch>(this);
+  int r = m_io_ctx.aio_watch(get_oid(), comp, &m_watch_handle,
+                             &m_watch_ctx);
+  assert(r == 0);
+  comp->release();
+}
+
+template <typename I>
+void ObjectWatcher<I>::handle_rewatch(int r) {
+  ldout(m_cct, 20) << ": r=" << r << dendl;
+
+  if (r >= 0) {
+    // watch restored: run the post_rewatch() hook before resuming
+    using klass = ObjectWatcher<I>;
+    post_rewatch(create_context_callback<
+      klass, &klass::handle_post_watch>(this));
+    return;
+  }
+  lderr(m_cct) << ": error encountered during re-watch: " << cpp_strerror(r)
+               << dendl;
+  m_watch_handle = 0;
+  // retry immediately unless an unregister request is waiting
+  if (!pending_unregister_watch(0)) {
+    rewatch();
+  }
+}
+
+template <typename I>
+void ObjectWatcher<I>::handle_post_watch(int r) {
+  ldout(m_cct, 20) << dendl;
+
+  // post_rewatch() implementations are required to complete with success
+  assert(r == 0);
+  RWLock::WLocker locker(m_watch_lock);
+  m_watch_state = WATCH_STATE_REGISTERED;
+
+  // an unregister requested during recovery is started now, under the lock
+  const bool unregister_pending = (m_on_unregister_watch != nullptr);
+  if (unregister_pending) {
+    unregister_watch_();
+  }
+}
+
+template <typename I>
+bool ObjectWatcher<I>::pending_unregister_watch(int r) {
+  // If an unregister was requested while re-registering, finish it now with
+  // result r and report true; otherwise leave the state alone, report false.
+  Context *on_finish = nullptr;
+  {
+    RWLock::WLocker locker(m_watch_lock);
+    assert(m_watch_state == WATCH_STATE_REREGISTERING);
+    if (m_on_unregister_watch == nullptr) {
+      return false;
+    }
+
+    m_watch_state = WATCH_STATE_UNREGISTERED;
+    on_finish = m_on_unregister_watch;
+    m_on_unregister_watch = nullptr;
+  }
+
+  on_finish->complete(r);  // invoked without holding m_watch_lock
+  return true;
+}
+
+template <typename I>
+ObjectWatcher<I>::C_NotifyAck::C_NotifyAck(ObjectWatcher *object_watcher,
+                                           uint64_t notify_id, uint64_t handle)
+  : object_watcher(object_watcher), notify_id(notify_id), handle(handle) {
+  CephContext *cct = object_watcher->m_cct;  // dout_prefix uses C_NotifyAck::get_oid()
+  ldout(cct, 10) << ": C_NotifyAck start: id=" << notify_id << ", "
+                 << "handle=" << handle << dendl;
+}
+
+template <typename I>
+void ObjectWatcher<I>::C_NotifyAck::finish(int r) {
+  assert(r == 0);  // the ack context must always be completed with 0
+  CephContext *cct = object_watcher->m_cct;
+  ldout(cct, 10) << ": C_NotifyAck finish: id=" << notify_id << ", "
+                 << "handle=" << handle << dendl;
+  object_watcher->acknowledge_notify(notify_id, handle, out);  // sends `out` along
+}
+
+} // namespace librbd
+
+template class librbd::ObjectWatcher<librbd::ImageCtx>;
diff --git a/src/librbd/ObjectWatcher.h b/src/librbd/ObjectWatcher.h
new file mode 100644
index 0000000..5ba5c80
--- /dev/null
+++ b/src/librbd/ObjectWatcher.h
@@ -0,0 +1,155 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_OBJECT_WATCHER_H
+#define CEPH_LIBRBD_OBJECT_WATCHER_H
+
+#include "include/rados/librados.hpp"
+#include "common/RWLock.h"
+#include "librbd/ImageCtx.h"
+#include <string>
+#include <type_traits>
+
+class Context;
+
+namespace librbd {
+
+template <typename ImageCtxT = librbd::ImageCtx>
+class ObjectWatcher {
+public:
+  typedef typename std::decay<decltype(*ImageCtxT::op_work_queue)>::type ContextWQT;
+  // the destructor asserts that the watch is in the UNREGISTERED state
+  ObjectWatcher(librados::IoCtx &io_ctx, ContextWQT *work_queue);
+  virtual ~ObjectWatcher();
+
+  ObjectWatcher(const ObjectWatcher&) = delete;
+  ObjectWatcher& operator= (const ObjectWatcher&) = delete;
+  // asynchronous (un)registration; on_finish must not be null
+  void register_watch(Context *on_finish);
+  virtual void unregister_watch(Context *on_finish);
+
+protected:
+  struct C_NotifyAck : public Context {
+    ObjectWatcher *object_watcher;
+    uint64_t notify_id;
+    uint64_t handle;
+    bufferlist out;
+    // finish() acknowledges the notification through acknowledge_notify()
+    C_NotifyAck(ObjectWatcher *object_watcher, uint64_t notify_id,
+                uint64_t handle);
+    virtual void finish(int r);
+    // lets the dout prefix used inside C_NotifyAck resolve get_oid()
+    std::string get_oid() const {
+      return object_watcher->get_oid();
+    }
+  };
+
+  librados::IoCtx &m_io_ctx;  // reference: the caller's IoCtx must outlive this
+  CephContext *m_cct;
+  // name of the object to watch
+  virtual std::string get_oid() const = 0;
+  // invoked for each notification received on the watched object
+  virtual void handle_notify(uint64_t notify_id, uint64_t handle,
+                             bufferlist &bl) = 0;
+  void acknowledge_notify(uint64_t notify_id, uint64_t handle, bufferlist &out);
+  // hooks around re-registration after a watch error; defaults just complete(0)
+  virtual void pre_unwatch(Context *on_finish);
+  virtual void post_rewatch(Context *on_finish);
+
+private:
+  /**
+   * @verbatim
+   *
+   * <start>
+   *    |
+   *    v
+   * REGISTER_WATCH
+   *    |
+   *    |   /-------------------------------------\
+   *    |   |                                     |
+   *    v   v   (watch error)                     |
+   * REGISTERED * * * * * * * > PRE_UNWATCH       |
+   *    |                         |               |
+   *    |                         v               |
+   *    |                       UNWATCH           |
+   *    |                         |               |
+   *    |                         v               |
+   *    |                       REWATCH           |
+   *    |                         |               |
+   *    |                         v               |
+   *    |                       POST_REWATCH      |
+   *    |                         |               |
+   *    v                         \---------------/
+   * UNREGISTER_WATCH
+   *    |
+   *    v
+   * UNREGISTERED
+   *    |
+   *    v
+   * <finish>
+   *
+   * @endverbatim
+   */
+  // adapts librados watch callbacks onto the owning ObjectWatcher
+  struct WatchCtx : public librados::WatchCtx2 {
+    ObjectWatcher *object_watcher;
+
+    WatchCtx(ObjectWatcher *object_watcher) : object_watcher(object_watcher) {
+    }
+    // notifier_id is dropped; only id, handle and payload are forwarded
+    virtual void handle_notify(uint64_t notify_id,
+                               uint64_t handle,
+                               uint64_t notifier_id,
+                               bufferlist& bl) {
+      object_watcher->handle_notify(notify_id, handle, bl);
+    }
+
+    virtual void handle_error(uint64_t handle, int err) {
+      object_watcher->handle_error(handle, err);
+    }
+  };
+  // REREGISTERING spans the PRE_UNWATCH..POST_REWATCH branch of the diagram
+  enum WatchState {
+    WATCH_STATE_UNREGISTERED,
+    WATCH_STATE_REGISTERING,
+    WATCH_STATE_REGISTERED,
+    WATCH_STATE_UNREGISTERING,
+    WATCH_STATE_REREGISTERING
+  };
+
+  ContextWQT* m_work_queue;
+  // m_watch_lock guards m_watch_state and the two pending callbacks below
+  mutable RWLock m_watch_lock;
+  WatchCtx m_watch_ctx;
+  uint64_t m_watch_handle = 0;
+  WatchState m_watch_state = WATCH_STATE_UNREGISTERED;
+
+  Context *m_on_register_watch = nullptr;
+  Context *m_on_unregister_watch = nullptr;
+
+  void handle_register_watch(int r);
+  // trailing underscore: caller must already hold m_watch_lock for write
+  void unregister_watch_();
+  void handle_unregister_watch(int r);
+
+  void handle_error(uint64_t handle, int err);
+
+  void handle_pre_unwatch(int r);
+
+  void unwatch();
+  void handle_unwatch(int r);
+
+  void rewatch();
+  void handle_rewatch(int r);
+
+  void handle_post_watch(int r);
+  // completes a queued unregister request, if any; returns true when it did
+  bool pending_unregister_watch(int r);
+
+};
+
+} // namespace librbd
+
+extern template class librbd::ObjectWatcher<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_OBJECT_WATCHER_H
diff --git a/src/librbd/WatchNotifyTypes.cc b/src/librbd/WatchNotifyTypes.cc
index a40cdc7..65c334a 100644
--- a/src/librbd/WatchNotifyTypes.cc
+++ b/src/librbd/WatchNotifyTypes.cc
@@ -131,18 +131,23 @@ void ReleasedLockPayload::dump(Formatter *f) const {
 
 void RequestLockPayload::encode(bufferlist &bl) const {
   ::encode(client_id, bl);
+  ::encode(force, bl);
 }
 
 void RequestLockPayload::decode(__u8 version, bufferlist::iterator &iter) {
   if (version >= 2) {
     ::decode(client_id, iter);
   }
+  if (version >= 3) {
+    ::decode(force, iter);
+  }
 }
 
 void RequestLockPayload::dump(Formatter *f) const {
   f->open_object_section("client_id");
   client_id.dump(f);
   f->close_section();
+  f->dump_bool("force", force);
 }
 
 void HeaderUpdatePayload::encode(bufferlist &bl) const {
@@ -270,7 +275,7 @@ bool NotifyMessage::check_for_refresh() const {
 }
 
 void NotifyMessage::encode(bufferlist& bl) const {
-  ENCODE_START(2, 1, bl);
+  ENCODE_START(3, 1, bl);
   boost::apply_visitor(EncodePayloadVisitor(bl), payload);
   ENCODE_FINISH(bl);
 }
@@ -344,7 +349,7 @@ void NotifyMessage::dump(Formatter *f) const {
 void NotifyMessage::generate_test_instances(std::list<NotifyMessage *> &o) {
   o.push_back(new NotifyMessage(AcquiredLockPayload(ClientId(1, 2))));
   o.push_back(new NotifyMessage(ReleasedLockPayload(ClientId(1, 2))));
-  o.push_back(new NotifyMessage(RequestLockPayload(ClientId(1, 2))));
+  o.push_back(new NotifyMessage(RequestLockPayload(ClientId(1, 2), true)));
   o.push_back(new NotifyMessage(HeaderUpdatePayload()));
   o.push_back(new NotifyMessage(AsyncProgressPayload(AsyncRequestId(ClientId(0, 1), 2), 3, 4)));
   o.push_back(new NotifyMessage(AsyncCompletePayload(AsyncRequestId(ClientId(0, 1), 2), 3)));
diff --git a/src/librbd/WatchNotifyTypes.h b/src/librbd/WatchNotifyTypes.h
index a587b23..813ff5f 100644
--- a/src/librbd/WatchNotifyTypes.h
+++ b/src/librbd/WatchNotifyTypes.h
@@ -123,9 +123,12 @@ struct RequestLockPayload {
   static const bool CHECK_FOR_REFRESH = true;
 
   ClientId client_id;
+  bool force = false;
 
   RequestLockPayload() {}
-  RequestLockPayload(const ClientId &client_id_) : client_id(client_id_) {}
+  RequestLockPayload(const ClientId &client_id_, bool force_)
+    : client_id(client_id_), force(force_) {
+  }
 
   void encode(bufferlist &bl) const;
   void decode(__u8 version, bufferlist::iterator &iter);
diff --git a/src/librbd/exclusive_lock/AcquireRequest.cc b/src/librbd/exclusive_lock/AcquireRequest.cc
index d973bf2..6ec148e 100644
--- a/src/librbd/exclusive_lock/AcquireRequest.cc
+++ b/src/librbd/exclusive_lock/AcquireRequest.cc
@@ -14,6 +14,7 @@
 #include "librbd/Journal.h"
 #include "librbd/ObjectMap.h"
 #include "librbd/Utils.h"
+#include "librbd/journal/Policy.h"
 
 #define dout_subsys ceph_subsys_rbd
 #undef dout_prefix
@@ -151,7 +152,7 @@ Context *AcquireRequest<I>::send_open_journal() {
     this);
   m_journal = m_image_ctx.create_journal();
 
-  // journal playback required object map (if enabled) and itself
+  // journal playback requires object map (if enabled) and itself
   apply();
 
   m_journal->open(ctx);
@@ -179,17 +180,11 @@ void AcquireRequest<I>::send_allocate_journal_tag() {
   CephContext *cct = m_image_ctx.cct;
   ldout(cct, 10) << __func__ << dendl;
 
-  if (!m_journal->is_tag_owner()) {
-    lderr(cct) << "local image not promoted" << dendl;
-    m_error_result = -EPERM;
-    send_close_journal();
-    return;
-  }
-
+  RWLock::RLocker snap_locker(m_image_ctx.snap_lock);
   using klass = AcquireRequest<I>;
   Context *ctx = create_context_callback<
     klass, &klass::handle_allocate_journal_tag>(this);
-  m_journal->allocate_tag(Journal<I>::LOCAL_MIRROR_UUID, ctx);
+  m_image_ctx.get_journal_policy()->allocate_tag_on_lock(ctx);
 }
 
 template <typename I>
@@ -198,6 +193,8 @@ Context *AcquireRequest<I>::handle_allocate_journal_tag(int *ret_val) {
   ldout(cct, 10) << __func__ << ": r=" << *ret_val << dendl;
 
   if (*ret_val < 0) {
+    lderr(cct) << "failed to allocate journal tag: " << cpp_strerror(*ret_val)
+               << dendl;
     m_error_result = *ret_val;
     send_close_journal();
     return nullptr;
diff --git a/src/librbd/exclusive_lock/Policy.h b/src/librbd/exclusive_lock/Policy.h
new file mode 100644
index 0000000..2ff8418
--- /dev/null
+++ b/src/librbd/exclusive_lock/Policy.h
@@ -0,0 +1,20 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_EXCLUSIVE_LOCK_POLICY_H
+#define CEPH_LIBRBD_EXCLUSIVE_LOCK_POLICY_H
+
+namespace librbd {
+namespace exclusive_lock {
+
+struct Policy {
+  virtual ~Policy() {
+  }
+  // presumably run when a peer requests the lock (caller not shown here)
+  virtual void lock_requested(bool force) = 0;
+};
+
+} // namespace exclusive_lock
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_EXCLUSIVE_LOCK_POLICY_H
diff --git a/src/librbd/exclusive_lock/StandardPolicy.cc b/src/librbd/exclusive_lock/StandardPolicy.cc
new file mode 100644
index 0000000..22f0434
--- /dev/null
+++ b/src/librbd/exclusive_lock/StandardPolicy.cc
@@ -0,0 +1,21 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/exclusive_lock/StandardPolicy.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ExclusiveLock.h"
+
+namespace librbd {
+namespace exclusive_lock {
+
+void StandardPolicy::lock_requested(bool force) {
+  assert(m_image_ctx->owner_lock.is_locked());
+  assert(m_image_ctx->exclusive_lock != nullptr);
+
+  // always release the lock upon request; the force flag is not consulted
+  m_image_ctx->exclusive_lock->release_lock(nullptr);  // no completion callback
+}
+
+} // namespace exclusive_lock
+} // namespace librbd
+
diff --git a/src/librbd/exclusive_lock/StandardPolicy.h b/src/librbd/exclusive_lock/StandardPolicy.h
new file mode 100644
index 0000000..ddc78cc
--- /dev/null
+++ b/src/librbd/exclusive_lock/StandardPolicy.h
@@ -0,0 +1,30 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_EXCLUSIVE_LOCK_STANDARD_POLICY_H
+#define CEPH_LIBRBD_EXCLUSIVE_LOCK_STANDARD_POLICY_H
+
+#include "librbd/exclusive_lock/Policy.h"
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace exclusive_lock {
+
+class StandardPolicy : public Policy{
+public:
+  StandardPolicy(ImageCtx *image_ctx) : m_image_ctx(image_ctx) {
+  }
+  // releases the image's exclusive lock whenever it is requested
+  virtual void lock_requested(bool force);
+
+private:
+  ImageCtx *m_image_ctx;  // stored as given; never deleted by this class
+
+};
+
+} // namespace exclusive_lock
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_EXCLUSIVE_LOCK_STANDARD_POLICY_H
diff --git a/src/librbd/internal.cc b/src/librbd/internal.cc
index 73ebb0a..6ace24f 100644
--- a/src/librbd/internal.cc
+++ b/src/librbd/internal.cc
@@ -35,6 +35,7 @@
 #include "librbd/internal.h"
 #include "librbd/Journal.h"
 #include "librbd/journal/Types.h"
+#include "librbd/MirroringWatcher.h"
 #include "librbd/ObjectMap.h"
 #include "librbd/Operations.h"
 #include "librbd/parent_types.h"
@@ -193,13 +194,215 @@ int validate_mirroring_enabled(ImageCtx *ictx) {
       << dendl;
     return r;
   } else if (mirror_image_internal.state !=
-               cls::rbd::MirrorImageState::MIRROR_IMAGE_STATE_ENABLED) {
+               cls::rbd::MIRROR_IMAGE_STATE_ENABLED) {
     lderr(cct) << "mirroring is not currently enabled" << dendl;
     return -EINVAL;
   }
   return 0;
 }
 
+int mirror_image_enable(CephContext *cct, librados::IoCtx &io_ctx,
+                        const std::string &id,
+                        const std::string &global_image_id) {
+  // Mark image `id` as mirror-enabled and notify RBD_MIRRORING watchers.
+  // An empty global_image_id means a random UUID is generated here.
+  cls::rbd::MirrorImage mirror_image;
+  int r = cls_client::mirror_image_get(&io_ctx, id, &mirror_image);
+  const bool no_record = (r == -ENOENT);
+  if (r < 0 && !no_record) {
+    lderr(cct) << "cannot enable mirroring: " << cpp_strerror(r) << dendl;
+    return r;
+  }
+
+  if (mirror_image.state == cls::rbd::MIRROR_IMAGE_STATE_ENABLED) {
+    return 0;  // mirroring is already enabled
+  }
+  if (!no_record) {
+    lderr(cct) << "cannot enable mirroring: currently disabling" << dendl;
+    return -EINVAL;
+  }
+
+  mirror_image.state = cls::rbd::MIRROR_IMAGE_STATE_ENABLED;
+  mirror_image.global_image_id = global_image_id;
+  if (global_image_id.empty()) {
+    uuid_d uuid_gen;
+    uuid_gen.generate_random();
+    mirror_image.global_image_id = uuid_gen.to_string();
+  }
+
+  r = cls_client::mirror_image_set(&io_ctx, id, mirror_image);
+  if (r < 0) {
+    lderr(cct) << "cannot enable mirroring: " << cpp_strerror(r) << dendl;
+    return r;
+  }
+
+  r = MirroringWatcher<>::notify_image_updated(
+    io_ctx, cls::rbd::MIRROR_IMAGE_STATE_ENABLED, id,
+    mirror_image.global_image_id);
+  if (r < 0) {
+    lderr(cct) << "failed to send update notification: " << cpp_strerror(r)
+               << dendl;
+    return r;
+  }
+  ldout(cct, 20) << "image mirroring is enabled: global_id="
+                 << mirror_image.global_image_id << dendl;
+  return 0;
+}
+
+int mirror_image_enable_internal(ImageCtx *ictx) {
+  // Preconditions for enabling mirroring on an open image: journaling must
+  // be on and the local cluster must own the last journal tag.
+  CephContext *cct = ictx->cct;
+
+  const bool journaling = (ictx->features & RBD_FEATURE_JOURNALING) != 0;
+  if (!journaling) {
+    lderr(cct) << "cannot enable mirroring: journaling is not enabled"
+      << dendl;
+    return -EINVAL;
+  }
+
+  bool is_primary;
+  int r = Journal<>::is_tag_owner(ictx, &is_primary);
+  if (r < 0) {
+    lderr(cct) << "cannot enable mirroring: failed to check tag ownership: "
+      << cpp_strerror(r) << dendl;
+    return r;
+  }
+
+  if (!is_primary) {
+    lderr(cct) <<
+      "cannot enable mirroring: last journal tag not owned by local cluster"
+      << dendl;
+    return -EINVAL;
+  }
+
+  // an empty global id asks mirror_image_enable() to generate one
+  r = mirror_image_enable(cct, ictx->md_ctx, ictx->id, "");
+  return (r < 0) ? r : 0;
+}
+
+int mirror_image_disable_internal(ImageCtx *ictx, bool force) {
+  // Disable mirroring for an open image; returns 0 or a negative errno.
+  // A non-primary image needs force and only has its mirroring record removed.
+  CephContext *cct = ictx->cct;
+  cls::rbd::MirrorImage mirror_image_internal;
+  std::set<cls::journal::Client> clients;
+  std::string header_oid;  // declared before the goto below, assigned later
+
+  int r = cls_client::mirror_image_get(&ictx->md_ctx, ictx->id,
+      &mirror_image_internal);
+  if (r == -ENOENT) {
+    // mirroring is not enabled for this image
+    ldout(cct, 20) << "ignoring disable command: mirroring is not enabled "
+      "for this image" << dendl;
+    return 0;
+  } else if (r == -EOPNOTSUPP) {
+    ldout(cct, 5) << "mirroring not supported by OSD" << dendl;
+    return r;
+  } else if (r < 0) {
+    lderr(cct) << "cannot disable mirroring: " << cpp_strerror(r) << dendl;
+    return r;
+  }
+
+  bool is_primary;
+  r = Journal<>::is_tag_owner(ictx, &is_primary);
+  if (r < 0) {
+    lderr(cct) << "cannot disable mirroring: failed to check tag ownership: "
+      << cpp_strerror(r) << dendl;
+    return r;
+  }
+
+  if (!is_primary) {
+    if (!force) {
+      lderr(cct) << "Mirrored image is not the primary, add force option to"
+        " disable mirroring" << dendl;
+      return -EINVAL;
+    }
+    goto remove_mirroring_image;
+  }
+
+  mirror_image_internal.state = cls::rbd::MIRROR_IMAGE_STATE_DISABLING;
+  r = cls_client::mirror_image_set(&ictx->md_ctx, ictx->id,
+      mirror_image_internal);
+  if (r < 0) {
+    lderr(cct) << "cannot disable mirroring: " << cpp_strerror(r) << dendl;
+    return r;
+  }
+
+  r = MirroringWatcher<>::notify_image_updated(
+    ictx->md_ctx, cls::rbd::MIRROR_IMAGE_STATE_DISABLING,
+    ictx->id, mirror_image_internal.global_image_id);
+  if (r < 0) {
+    lderr(cct) << "failed to send update notification: " << cpp_strerror(r)
+               << dendl;
+    return r;
+  }
+
+  header_oid = ::journal::Journaler::header_oid(ictx->id);
+  // Re-list the journal clients until a pass finds no mirror peer, so that a
+  // lingering non-peer client cannot keep this loop spinning forever.
+  while (true) {
+    clients.clear();  // in case client_list() appends to the set (unverified)
+    r = cls::journal::client::client_list(ictx->md_ctx, header_oid, &clients);
+    if (r < 0) {
+      lderr(cct) << "cannot disable mirroring: " << cpp_strerror(r) << dendl;
+      return r;
+    }
+    assert(clients.size() >= 1);
+    if (clients.size() == 1) {
+      // only local journal client remains
+      break;
+    }
+
+    bool found_peer = false;
+    for (auto client : clients) {
+      journal::ClientData client_data;
+      bufferlist::iterator bl = client.data.begin();
+      ::decode(client_data, bl);
+      journal::ClientMetaType type = client_data.get_client_meta_type();
+
+      if (type != journal::ClientMetaType::MIRROR_PEER_CLIENT_META_TYPE) {
+        continue;
+      }
+      found_peer = true;
+
+      journal::MirrorPeerClientMeta client_meta =
+        boost::get<journal::MirrorPeerClientMeta>(client_data.client_meta);
+      for (const auto& sync : client_meta.sync_points) {
+        r = ictx->operations->snap_remove(sync.snap_name.c_str());
+        if (r < 0 && r != -ENOENT) {
+          lderr(cct) << "cannot disable mirroring: failed to remove temporary"
+            " snapshot created by remote peer: " << cpp_strerror(r) << dendl;
+          return r;
+        }
+      }
+
+      r = cls::journal::client::client_unregister(ictx->md_ctx, header_oid,
+          client.id);
+      if (r < 0 && r != -ENOENT) {
+        lderr(cct) << "cannot disable mirroring: failed to unregister remote"
+          " journal client: " << cpp_strerror(r) << dendl;
+        return r;
+      }
+    }
+    if (!found_peer) {
+      break;  // nothing left that this function can unregister
+    }
+  }
+
+remove_mirroring_image:
+  r = cls_client::mirror_image_remove(&ictx->md_ctx, ictx->id);
+  if (r < 0) {
+    lderr(cct) << "failed to remove image from mirroring directory: "
+      << cpp_strerror(r) << dendl;
+    return r;
+  }
+
+  ldout(cct, 20) << "removed image state from rbd_mirroring object" << dendl;
+  // TODO: on the primary, send notification to mirroring object about update
+  return 0;
+}
+
 } // anonymous namespace
 
   int detect_format(IoCtx &io_ctx, const string &name,
@@ -580,6 +783,38 @@ int validate_mirroring_enabled(ImageCtx *ictx) {
     return (*opts_)->empty();
   }
 
+  int list_images_v2(IoCtx& io_ctx, map<string, string> &images) {  // merges every RBD_DIRECTORY entry into images; returns 0 or a negative error code
+    CephContext *cct = (CephContext *)io_ctx.cct();
+    ldout(cct, 20) << "list_images_v2 " << &io_ctx << dendl;
+
+    // new format images are accessed by class methods; fetch them page by page
+    int r;  // dir_list return code, later reused to hold the size of the page
+    int max_read = 1024;  // maximum number of entries requested per dir_list call
+    string last_read = "";  // last key of the previous page, handed back to dir_list
+    do {
+      map<string, string> images_page;  // NOTE(review): callers in this patch read key as image name, value as image id
+      r = cls_client::dir_list(&io_ctx, RBD_DIRECTORY,
+			   last_read, max_read, &images_page);
+      if (r < 0 && r != -ENOENT) {  // any failure other than -ENOENT aborts the listing
+        lderr(cct) << "error listing image in directory: "
+                   << cpp_strerror(r) << dendl;
+        return r;
+      } else if (r == -ENOENT) {  // -ENOENT is not an error here: stop and return what was collected
+        break;
+      }
+      for (map<string, string>::const_iterator it = images_page.begin();
+	   it != images_page.end(); ++it) {
+	images.insert(*it);  // map::insert leaves keys already present in images untouched
+      }
+      if (!images_page.empty()) {
+	last_read = images_page.rbegin()->first;  // largest key of this page
+      }
+      r = images_page.size();
+    } while (r == max_read);  // a page shorter than max_read ends the loop
+
+    return 0;
+  }
+
   int list(IoCtx& io_ctx, vector<string>& names)
   {
     CephContext *cct = (CephContext *)io_ctx.cct();
@@ -602,27 +837,15 @@ int validate_mirroring_enabled(ImageCtx *ictx) {
       }
     }
 
-    // new format images are accessed by class methods
-    int max_read = 1024;
-    string last_read = "";
-    do {
-      map<string, string> images;
-      r = cls_client::dir_list(&io_ctx, RBD_DIRECTORY,
-			   last_read, max_read, &images);
-      if (r < 0) {
-        lderr(cct) << "error listing image in directory: " 
-                   << cpp_strerror(r) << dendl;   
-        return r;
-      }
-      for (map<string, string>::const_iterator it = images.begin();
-	   it != images.end(); ++it) {
-	names.push_back(it->first);
-      }
-      if (!images.empty()) {
-	last_read = images.rbegin()->first;
-      }
-      r = images.size();
-    } while (r == max_read);
+    map<string, string> images;
+    r = list_images_v2(io_ctx, images);
+    if (r < 0) {
+      lderr(cct) << "error listing v2 images: " << cpp_strerror(r) << dendl;
+      return r;
+    }
+    for (const auto& img_pair : images) {
+      names.push_back(img_pair.first);
+    }
 
     return 0;
   }
@@ -774,8 +997,9 @@ int validate_mirroring_enabled(ImageCtx *ictx) {
   int create_v2(IoCtx& io_ctx, const char *imgname, uint64_t bid, uint64_t size,
 		int order, uint64_t features, uint64_t stripe_unit,
 		uint64_t stripe_count, uint8_t journal_order,
-		uint8_t journal_splay_width,
-		const std::string &journal_pool)
+                uint8_t journal_splay_width, const std::string &journal_pool,
+                const std::string &non_primary_global_image_id,
+                const std::string &primary_mirror_uuid)
   {
     ostringstream bid_ss;
     uint32_t extra;
@@ -785,6 +1009,7 @@ int validate_mirroring_enabled(ImageCtx *ictx) {
     CephContext *cct = (CephContext *)io_ctx.cct();
 
     file_layout_t layout;
+    bool force_non_primary = !non_primary_global_image_id.empty();
 
     int r = validate_pool(io_ctx, cct);
     if (r < 0) {
@@ -874,44 +1099,47 @@ int validate_mirroring_enabled(ImageCtx *ictx) {
         goto err_remove_object_map;
       }
 
-      r = Journal<>::create(io_ctx, id, journal_order, journal_splay_width,
-			    journal_pool);
+      rbd_mirror_mode_t mirror_mode;
+      r = librbd::mirror_mode_get(io_ctx, &mirror_mode);
       if (r < 0) {
-        lderr(cct) << "error creating journal: " << cpp_strerror(r) << dendl;
+        lderr(cct) << "error in retrieving pool mirroring status: "
+                   << cpp_strerror(r) << dendl;
         goto err_remove_object_map;
       }
 
-      rbd_mirror_mode_t mirror_mode;
-      r = librbd::mirror_mode_get(io_ctx, &mirror_mode);
+      r = Journal<>::create(io_ctx, id, journal_order, journal_splay_width,
+			    journal_pool, force_non_primary,
+                            primary_mirror_uuid);
       if (r < 0) {
-        lderr(cct) << "error in retrieving pool mirroring status: "
-          << cpp_strerror(r) << dendl;
+        lderr(cct) << "error creating journal: " << cpp_strerror(r) << dendl;
         goto err_remove_object_map;
       }
 
-      if (mirror_mode == RBD_MIRROR_MODE_POOL) {
-        ImageCtx *img_ctx = new ImageCtx("", id, nullptr, io_ctx, false);
-        r = img_ctx->state->open();
-        if (r < 0) {
-          lderr(cct) << "error opening image: " << cpp_strerror(r) << dendl;
-          delete img_ctx;
-          goto err_remove_object_map;
-        }
-        r = mirror_image_enable(img_ctx);
+      if (mirror_mode == RBD_MIRROR_MODE_POOL || force_non_primary) {
+        r = mirror_image_enable(cct, io_ctx, id, non_primary_global_image_id);
         if (r < 0) {
           lderr(cct) << "error enabling mirroring: " << cpp_strerror(r)
-            << dendl;
-          img_ctx->state->close();
-          goto err_remove_object_map;
+                     << dendl;
+          goto err_remove_journal;
         }
-        img_ctx->state->close();
       }
-
+    } else if (force_non_primary) {
+      // journaling should have been enabled
+      assert(false);
     }
 
     ldout(cct, 2) << "done." << dendl;
     return 0;
 
+  err_remove_journal:
+    if ((features & RBD_FEATURE_JOURNALING) != 0) {
+      remove_r = Journal<>::remove(io_ctx, id);
+      if (remove_r < 0) {
+        lderr(cct) << "error cleaning up journal after creation failed: "
+                   << cpp_strerror(remove_r) << dendl;
+      }
+    }
+
   err_remove_object_map:
     if ((features & RBD_FEATURE_OBJECT_MAP) != 0) {
       remove_r = ObjectMap::remove(io_ctx, id);
@@ -1079,7 +1307,8 @@ int validate_mirroring_enabled(ImageCtx *ictx) {
       opts.get(RBD_IMAGE_OPTION_JOURNAL_POOL, &journal_pool);
 
       r = create_v2(io_ctx, imgname, bid, size, order, features, stripe_unit,
-		    stripe_count, journal_order, journal_splay_width, journal_pool);
+		    stripe_count, journal_order, journal_splay_width,
+                    journal_pool, "", "");
     }
 
     int r1 = opts.set(RBD_IMAGE_OPTION_ORDER, order);
@@ -1422,7 +1651,7 @@ int validate_mirroring_enabled(ImageCtx *ictx) {
 
           r = Journal<>::create(ictx->md_ctx, ictx->id, ictx->journal_order,
   			        ictx->journal_splay_width,
-  			        ictx->journal_pool);
+  			        ictx->journal_pool, false, "");
           if (r < 0) {
             lderr(cct) << "error creating image journal: " << cpp_strerror(r)
                        << dendl;
@@ -1490,14 +1719,13 @@ int validate_mirroring_enabled(ImageCtx *ictx) {
                 << cpp_strerror(r) << dendl;
             }
 
-            if (mirror_image.state ==
-                cls::rbd::MirrorImageState::MIRROR_IMAGE_STATE_ENABLED) {
+            if (mirror_image.state == cls::rbd::MIRROR_IMAGE_STATE_ENABLED) {
               lderr(cct) << "cannot disable journaling: image mirroring "
                 " enabled and mirror pool mode set to image" << dendl;
               return -EINVAL;
             }
           } else if (mirror_mode == RBD_MIRROR_MODE_POOL) {
-            r = mirror_image_disable(ictx, false);
+            r = mirror_image_disable_internal(ictx, false);
             if (r < 0) {
               lderr(cct) << "error disabling image mirroring: "
                 << cpp_strerror(r) << dendl;
@@ -1546,7 +1774,7 @@ int validate_mirroring_enabled(ImageCtx *ictx) {
           lderr(cct) << "error opening image: " << cpp_strerror(r) << dendl;
           delete img_ctx;
         } else {
-          r = mirror_image_enable(img_ctx);
+          r = mirror_image_enable_internal(img_ctx);
           if (r < 0) {
             lderr(cct) << "error enabling mirroring: " << cpp_strerror(r)
               << dendl;
@@ -1733,6 +1961,17 @@ int validate_mirroring_enabled(ImageCtx *ictx) {
 	return r;
       }
 
+      if (!old_format) {
+        r = mirror_image_disable_internal(ictx, false);
+        if (r < 0 && r != -EOPNOTSUPP) {
+          lderr(cct) << "error disabling image mirroring: " << cpp_strerror(r)
+                     << dendl;
+          ictx->owner_lock.put_read();
+          ictx->state->close();
+          return r;
+        }
+      }
+
       ictx->owner_lock.put_read();
       ictx->state->close();
 
@@ -2413,63 +2652,24 @@ int validate_mirroring_enabled(ImageCtx *ictx) {
     CephContext *cct = ictx->cct;
     ldout(cct, 20) << "mirror_image_enable " << ictx << dendl;
 
-    if ((ictx->features & RBD_FEATURE_JOURNALING) == 0) {
-      lderr(cct) << "cannot enable mirroring: journaling is not enabled"
-        << dendl;
-      return -EINVAL;
-    }
-
-    bool is_primary;
-    int r = Journal<>::is_tag_owner(ictx, &is_primary);
+    cls::rbd::MirrorMode mirror_mode;
+    int r = cls_client::mirror_mode_get(&ictx->md_ctx, &mirror_mode);
     if (r < 0) {
-      lderr(cct) << "cannot enable mirroring: failed to check tag ownership: "
+      lderr(cct) << "cannot enable mirroring: failed to retrieve mirror mode: "
         << cpp_strerror(r) << dendl;
       return r;
     }
 
-    if (!is_primary) {
-      lderr(cct) <<
-        "cannot enable mirroring: last journal tag not owned by local cluster"
-        << dendl;
+    if (mirror_mode != cls::rbd::MIRROR_MODE_IMAGE) {
+      lderr(cct) << "cannot enable mirroring in the current pool mirroring "
+        "mode" << dendl;
       return -EINVAL;
     }
 
-    cls::rbd::MirrorImage mirror_image_internal;
-    r = cls_client::mirror_image_get(&ictx->md_ctx, ictx->id,
-                                 &mirror_image_internal);
-    if (r < 0 && r != -ENOENT) {
-      lderr(cct) << "cannot enable mirroring: " << cpp_strerror(r) << dendl;
-      return r;
-    }
-
-    if (mirror_image_internal.state ==
-        cls::rbd::MirrorImageState::MIRROR_IMAGE_STATE_ENABLED) {
-      // mirroring is already enabled
-      return 0;
-    }
-    else if (r != -ENOENT) {
-      lderr(cct) << "cannot enable mirroring: mirroring image is in "
-        "disabling state" << dendl;
-      return -EINVAL;
-    }
-
-    mirror_image_internal.state =
-      cls::rbd::MirrorImageState::MIRROR_IMAGE_STATE_ENABLED;
-
-    uuid_d uuid_gen;
-    uuid_gen.generate_random();
-    mirror_image_internal.global_image_id = uuid_gen.to_string();
-
-    r = cls_client::mirror_image_set(&ictx->md_ctx, ictx->id,
-                                     mirror_image_internal);
+    r = mirror_image_enable_internal(ictx);
     if (r < 0) {
-      lderr(cct) << "cannot enable mirroring: " << cpp_strerror(r) << dendl;
       return r;
     }
-
-    ldout(cct, 20) << "image mirroring is enabled: global_id=" <<
-      mirror_image_internal.global_image_id << dendl;
-
     return 0;
   }
 
@@ -2477,112 +2677,24 @@ int validate_mirroring_enabled(ImageCtx *ictx) {
     CephContext *cct = ictx->cct;
     ldout(cct, 20) << "mirror_image_disable " << ictx << dendl;
 
-    cls::rbd::MirrorImage mirror_image_internal;
-    std::vector<snap_info_t> snaps;
-    std::set<cls::journal::Client> clients;
-    std::string header_oid;
-
-    bool is_primary;
-    int r = Journal<>::is_tag_owner(ictx, &is_primary);
-    if (r < 0) {
-      lderr(cct) << "cannot disable mirroring: failed to check tag ownership: "
-        << cpp_strerror(r) << dendl;
-      return r;
-    }
-
-    if (!is_primary) {
-      if (!force) {
-        lderr(cct) << "Mirrored image is not the primary, add force option to"
-          " disable mirroring" << dendl;
-        return -EINVAL;
-      }
-      goto remove_mirroring_image;
-    }
-
-    r = cls_client::mirror_image_get(&ictx->md_ctx, ictx->id,
-                                     &mirror_image_internal);
-    if (r < 0 && r != -ENOENT) {
-      lderr(cct) << "cannot disable mirroring: " << cpp_strerror(r) << dendl;
-      return r;
-    }
-    else if (r == -ENOENT) {
-      // mirroring is not enabled for this image
-      ldout(cct, 20) << "ignoring disable command: mirroring is not enabled "
-        "for this image" << dendl;
-      return 0;
-    }
-
-    mirror_image_internal.state =
-      cls::rbd::MirrorImageState::MIRROR_IMAGE_STATE_DISABLING;
-    r = cls_client::mirror_image_set(&ictx->md_ctx, ictx->id,
-                                     mirror_image_internal);
+    cls::rbd::MirrorMode mirror_mode;
+    int r = cls_client::mirror_mode_get(&ictx->md_ctx, &mirror_mode);
     if (r < 0) {
-      lderr(cct) << "cannot disable mirroring: " << cpp_strerror(r) << dendl;
+      lderr(cct) << "cannot disable mirroring: failed to retrieve pool "
+        "mirroring mode: " << cpp_strerror(r) << dendl;
       return r;
     }
 
-    header_oid = ::journal::Journaler::header_oid(ictx->id);
-
-    while(true) {
-      r = cls::journal::client::client_list(ictx->md_ctx, header_oid, &clients);
-      if (r < 0) {
-        lderr(cct) << "cannot disable mirroring: " << cpp_strerror(r) << dendl;
-        return r;
-      }
-
-      assert(clients.size() >= 1);
-
-      if (clients.size() == 1) {
-        // only local journal client remains
-        break;
-      }
-
-      for (auto client : clients) {
-        journal::ClientData client_data;
-        bufferlist::iterator bl = client.data.begin();
-        ::decode(client_data, bl);
-        journal::ClientMetaType type = client_data.get_client_meta_type();
-
-        if (type != journal::ClientMetaType::MIRROR_PEER_CLIENT_META_TYPE) {
-          continue;
-        }
-
-        journal::MirrorPeerClientMeta client_meta =
-          boost::get<journal::MirrorPeerClientMeta>(client_data.client_meta);
-
-        for (const auto& sync : client_meta.sync_points) {
-          r = ictx->operations->snap_remove(sync.snap_name.c_str());
-          if (r < 0 && r != -ENOENT) {
-            lderr(cct) << "cannot disable mirroring: failed to remove temporary"
-              " snapshot created by remote peer: " << cpp_strerror(r) << dendl;
-            return r;
-          }
-        }
-
-        r = cls::journal::client::client_unregister(ictx->md_ctx, header_oid,
-                                                    client.id);
-        if (r < 0 && r != -ENOENT) {
-          lderr(cct) << "cannot disable mirroring: failed to unregister remote"
-            " journal client: " << cpp_strerror(r) << dendl;
-          return r;
-        }
-      }
+    if (mirror_mode != cls::rbd::MIRROR_MODE_IMAGE) {
+      lderr(cct) << "cannot disable mirroring in the current pool mirroring "
+        "mode" << dendl;
+      return -EINVAL;
     }
 
-  remove_mirroring_image:
-    r = cls_client::mirror_image_remove(&ictx->md_ctx, ictx->id);
+    r = mirror_image_disable_internal(ictx, force);
     if (r < 0) {
-      lderr(cct) << "failed to remove image from mirroring directory: "
-        << cpp_strerror(r) << dendl;
       return r;
     }
-
-    ldout(cct, 20) << "removed image state from rbd_mirroring object" << dendl;
-
-    if (is_primary) {
-      // TODO: send notification to mirroring object about update
-    }
-
     return 0;
   }
 
@@ -2613,7 +2725,7 @@ int validate_mirroring_enabled(ImageCtx *ictx) {
     // TODO: need interlock with local rbd-mirror daemon to ensure it has stopped
     //       replay
 
-    r = Journal<>::allocate_tag(ictx, Journal<>::LOCAL_MIRROR_UUID);
+    r = Journal<>::promote(ictx);
     if (r < 0) {
       lderr(cct) << "failed to promote image: " << cpp_strerror(r)
                  << dendl;
@@ -2645,16 +2757,32 @@ int validate_mirroring_enabled(ImageCtx *ictx) {
     }
 
     RWLock::RLocker owner_lock(ictx->owner_lock);
+    if (ictx->exclusive_lock == nullptr) {
+      lderr(cct) << "exclusive lock is not active" << dendl;
+      return -EINVAL;
+    }
+
     C_SaferCond lock_ctx;
     ictx->exclusive_lock->request_lock(&lock_ctx);
     r = lock_ctx.wait();
     if (r < 0) {
       lderr(cct) << "failed to lock image: " << cpp_strerror(r) << dendl;
+      return r;
     } else if (!ictx->exclusive_lock->is_lock_owner()) {
       lderr(cct) << "failed to acquire exclusive lock" << dendl;
+      return -EROFS;
     }
 
-    r = Journal<>::allocate_tag(ictx, Journal<>::ORPHAN_MIRROR_UUID);
+    RWLock::RLocker snap_locker(ictx->snap_lock);
+    if (ictx->journal == nullptr) {
+      lderr(cct) << "journal is not active" << dendl;
+      return -EINVAL;
+    } else if (!ictx->journal->is_tag_owner()) {
+      lderr(cct) << "image is not currently the primary" << dendl;
+      return -EINVAL;
+    }
+
+    r = ictx->journal->demote();
     if (r < 0) {
       lderr(cct) << "failed to demote image: " << cpp_strerror(r)
                  << dendl;
@@ -2759,6 +2887,34 @@ int validate_mirroring_enabled(ImageCtx *ictx) {
     return 0;
   }
 
+  int list_mirror_images(IoCtx& io_ctx,  // collects the keys returned by mirror_image_list; returns 0 or a negative error code
+                         std::set<std::string>& mirror_image_ids) {  // output: one entry per key (used as image ids by the caller in this patch)
+    CephContext *cct = reinterpret_cast<CephContext *>(io_ctx.cct());
+
+    std::string last_read = "";  // last key of the previous page, handed back to mirror_image_list
+    int max_read = 1024;  // maximum number of entries requested per call
+    int r;  // call return code, later reused to hold the size of the page
+    do {
+      std::map<std::string, std::string> mirror_images;
+      r =  cls_client::mirror_image_list(&io_ctx, last_read, max_read,
+                                             &mirror_images);
+      if (r < 0) {  // every negative result, -ENOENT included, is returned to the caller
+        lderr(cct) << "error listing mirrored image directory: "
+             << cpp_strerror(r) << dendl;
+        return r;
+      }
+      for (auto it = mirror_images.begin(); it != mirror_images.end(); ++it) {
+        mirror_image_ids.insert(it->first);  // only the key is kept; the mapped value is dropped
+      }
+      if (!mirror_images.empty()) {
+        last_read = mirror_images.rbegin()->first;  // largest key of this page
+      }
+      r = mirror_images.size();
+    } while (r == max_read);  // a page shorter than max_read ends the loop
+
+    return 0;
+  }
+
   int mirror_mode_set(IoCtx& io_ctx, rbd_mirror_mode_t mirror_mode) {
     CephContext *cct = reinterpret_cast<CephContext *>(io_ctx.cct());
     ldout(cct, 20) << __func__ << dendl;
@@ -2797,11 +2953,177 @@ int validate_mirroring_enabled(ImageCtx *ictx) {
       }
     }
 
+    if (current_mirror_mode != cls::rbd::MIRROR_MODE_IMAGE) {
+      r = cls_client::mirror_mode_set(&io_ctx, cls::rbd::MIRROR_MODE_IMAGE);
+      if (r < 0) {
+        lderr(cct) << "Failed to set mirror mode to image: "
+          << cpp_strerror(r) << dendl;
+        return r;
+      }
+
+      r = MirroringWatcher<>::notify_mode_updated(io_ctx,
+                                                  cls::rbd::MIRROR_MODE_IMAGE);
+      if (r < 0) {
+        lderr(cct) << "failed to send update notification: " << cpp_strerror(r)
+                   << dendl;
+        return r;
+      }
+    }
+
+    if (next_mirror_mode == cls::rbd::MIRROR_MODE_IMAGE) {
+      return 0;
+    }
+
+    struct rollback_state_t {  // destructor-driven helper: optionally undoes the mode/image changes, always closes recorded images
+      IoCtx *io_ctx;
+      bool do_rollback;  // starts true; when false the destructor only closes the images
+      cls::rbd::MirrorMode mirror_mode;  // mode written back by the destructor on rollback
+      bool enable;  // true: images were being enabled, so rollback disables them (and vice versa)
+      std::vector<std::pair<ImageCtx *, bool> > img_ctxs;  // opened images; bool gates the per-image rollback
+
+      rollback_state_t(IoCtx *io_ctx, cls::rbd::MirrorMode mirror_mode, bool enable) :
+        io_ctx(io_ctx),
+        do_rollback(true),
+        mirror_mode(mirror_mode),
+        enable(enable) {
+      }
+      ~rollback_state_t() {
+        CephContext *cct = reinterpret_cast<CephContext *>(io_ctx->cct());
+        if (do_rollback && mirror_mode != cls::rbd::MIRROR_MODE_IMAGE) {  // nothing is written back when the saved mode is MIRROR_MODE_IMAGE
+          int r = cls_client::mirror_mode_set(io_ctx, mirror_mode);
+          if (r < 0) {  // failure is only logged; the destructor carries on
+            lderr(cct) << "Failed to rollback mirror mode: " << cpp_strerror(r)
+              << dendl;
+          }
+
+          r = MirroringWatcher<>::notify_mode_updated(*io_ctx, mirror_mode);  // NOTE(review): sent even if the mirror_mode_set above failed
+          if (r < 0) {
+            lderr(cct) << "failed to send update notification: "
+                       << cpp_strerror(r) << dendl;
+          }
+        }
+        for (const auto& pair : img_ctxs) {
+          if (do_rollback && pair.second) {
+            int r = enable ? mirror_image_disable(pair.first, false) :  // apply the opposite operation, without force
+              mirror_image_enable(pair.first);
+            if (r < 0) {
+              lderr(cct) << "Failed to rollback mirroring state for image id "
+                << pair.first->id << ": " << cpp_strerror(r) << dendl;
+            }
+          }
+
+          int r = pair.first->state->close();  // every recorded image is closed, rollback or not
+          if (r < 0) {
+            lderr(cct) << "error closing image " << pair.first->id << ": "  // NOTE(review): reads pair.first after close(); confirm close() does not free the ImageCtx
+              << cpp_strerror(r) << dendl;
+          }
+        }
+      }
+    } rb_state(&io_ctx, current_mirror_mode,  // instance lives until the enclosing scope exits
+                     next_mirror_mode == cls::rbd::MIRROR_MODE_POOL);  // enable == true only when switching to pool mode
+
+
+    if (next_mirror_mode == cls::rbd::MIRROR_MODE_POOL) {
+
+      map<string, string> images;
+      r = list_images_v2(io_ctx, images);
+      if (r < 0) {
+        lderr(cct) << "Failed listing images: " << cpp_strerror(r) << dendl;
+        return r;
+      }
+
+      for (const auto& img_pair : images) {
+        uint64_t features;
+        r = cls_client::get_features(&io_ctx, util::header_name(img_pair.second),
+                                     CEPH_NOSNAP, &features);
+        if (r < 0) {
+          lderr(cct) << "error getting features for image " << img_pair.first
+            << ": " << cpp_strerror(r) << dendl;
+          return r;
+        }
+
+        if ((features & RBD_FEATURE_JOURNALING) != 0) {
+          ImageCtx *img_ctx = new ImageCtx("", img_pair.second, nullptr,
+                                           io_ctx, false);
+          r = img_ctx->state->open();
+          if (r < 0) {
+            lderr(cct) << "error opening image "<< img_pair.first << ": "
+              << cpp_strerror(r) << dendl;
+            delete img_ctx;
+            return r;
+          }
+
+          r = mirror_image_enable(img_ctx);
+          if (r < 0) {
+            lderr(cct) << "error enabling mirroring for image "
+              << img_pair.first << ": " << cpp_strerror(r) << dendl;
+            rb_state.img_ctxs.push_back(std::make_pair(img_ctx, false));
+            return r;
+          }
+
+          rb_state.img_ctxs.push_back(std::make_pair(img_ctx, true));
+        }
+      }
+    } else if (next_mirror_mode == cls::rbd::MIRROR_MODE_DISABLED) {
+
+      std::set<std::string> image_ids;
+      r = list_mirror_images(io_ctx, image_ids);
+      if (r < 0) {
+        lderr(cct) << "Failed listing images: " << cpp_strerror(r) << dendl;
+        return r;
+      }
+
+      for (const auto& img_id : image_ids) {
+        if (current_mirror_mode == cls::rbd::MIRROR_MODE_IMAGE) {
+          cls::rbd::MirrorImage mirror_image;
+          r = cls_client::mirror_image_get(&io_ctx, img_id, &mirror_image);
+          if (r < 0 && r != -ENOENT) {
+            lderr(cct) << "failed to retrieve mirroring state for image id "
+              << img_id << ": " << cpp_strerror(r) << dendl;
+            return r;
+          }
+          if (mirror_image.state == cls::rbd::MIRROR_IMAGE_STATE_ENABLED) {
+            lderr(cct) << "Failed to disable mirror mode: there are still "
+              "images with mirroring enabled" << dendl;
+            return -EINVAL;
+          }
+        } else {
+          ImageCtx *img_ctx = new ImageCtx("", img_id, nullptr, io_ctx, false);
+          r = img_ctx->state->open();
+          if (r < 0) {
+            lderr(cct) << "error opening image id "<< img_id << ": "
+              << cpp_strerror(r) << dendl;
+            delete img_ctx;
+            return r;
+          }
+
+          r = mirror_image_disable(img_ctx, false);
+          if (r < 0) {
+            lderr(cct) << "error disabling mirroring for image id " << img_id
+              << cpp_strerror(r) << dendl;
+            rb_state.img_ctxs.push_back(std::make_pair(img_ctx, false));
+            return r;
+          }
+
+          rb_state.img_ctxs.push_back(std::make_pair(img_ctx, true));
+        }
+      }
+    }
+
+    rb_state.do_rollback = false;
+
     r = cls_client::mirror_mode_set(&io_ctx, next_mirror_mode);
     if (r < 0) {
       lderr(cct) << "Failed to set mirror mode: " << cpp_strerror(r) << dendl;
       return r;
     }
+
+    r = MirroringWatcher<>::notify_mode_updated(io_ctx, next_mirror_mode);
+    if (r < 0) {
+      lderr(cct) << "failed to send update notification: " << cpp_strerror(r)
+                 << dendl;
+      return r;
+    }
     return 0;
   }
 
diff --git a/src/librbd/internal.h b/src/librbd/internal.h
index b92e572..c2413f4 100644
--- a/src/librbd/internal.h
+++ b/src/librbd/internal.h
@@ -96,6 +96,13 @@ namespace librbd {
 	     uint64_t stripe_unit, uint64_t stripe_count);
   int create(IoCtx& io_ctx, const char *imgname, uint64_t size,
 	     ImageOptions& opts);
+  int create_v2(IoCtx& io_ctx, const char *imgname, uint64_t bid, uint64_t size,  // declaration of the create_v2 defined in internal.cc
+		int order, uint64_t features, uint64_t stripe_unit,
+		uint64_t stripe_count, uint8_t journal_order,
+		uint8_t journal_splay_width,
+		const std::string &journal_pool,
+                const std::string &non_primary_global_image_id,  // non-empty => the definition sets force_non_primary and passes this id to mirror_image_enable
+                const std::string &primary_mirror_uuid);  // forwarded by the definition to Journal<>::create
   int clone(IoCtx& p_ioctx, const char *p_name, const char *p_snap_name,
 	    IoCtx& c_ioctx, const char *c_name,
 	    uint64_t features, int *c_order,
diff --git a/src/librbd/journal/Policy.h b/src/librbd/journal/Policy.h
new file mode 100644
index 0000000..8265622
--- /dev/null
+++ b/src/librbd/journal/Policy.h
@@ -0,0 +1,24 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_JOURNAL_POLICY_H
+#define CEPH_LIBRBD_JOURNAL_POLICY_H
+
+class Context;
+
+namespace librbd {
+
+namespace journal {
+
+struct Policy {  // abstract journal policy interface: both operations below are pure virtual
+  virtual ~Policy() {  // virtual so implementations can be destroyed through a Policy pointer
+  }
+
+  virtual void allocate_tag_on_lock(Context *on_finish) = 0;  // presumably invoked when the exclusive lock is acquired - TODO confirm against callers
+  virtual void cancel_external_replay(Context *on_finish) = 0;  // NOTE(review): calling context not visible in this header - confirm
+};
+
+} // namespace journal
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_JOURNAL_POLICY_H
diff --git a/src/librbd/journal/Replay.cc b/src/librbd/journal/Replay.cc
index 55ba5f3..aca64d9 100644
--- a/src/librbd/journal/Replay.cc
+++ b/src/librbd/journal/Replay.cc
@@ -589,6 +589,15 @@ void Replay<I>::handle_event(const journal::FlattenEvent &event,
 }
 
 template <typename I>
+void Replay<I>::handle_event(const journal::DemoteEvent &event,  // replay handler for Demote events; event itself is not read
+			     Context *on_ready, Context *on_safe) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << this << " " << __func__ << ": Demote event" << dendl;
+  on_ready->complete(0);  // nothing is applied to the image: both callbacks complete immediately with 0
+  on_safe->complete(0);
+}
+
+template <typename I>
 void Replay<I>::handle_event(const journal::UnknownEvent &event,
 			     Context *on_ready, Context *on_safe) {
   CephContext *cct = m_image_ctx.cct;
diff --git a/src/librbd/journal/Replay.h b/src/librbd/journal/Replay.h
index 5be3406..a9d73c0 100644
--- a/src/librbd/journal/Replay.h
+++ b/src/librbd/journal/Replay.h
@@ -152,6 +152,8 @@ private:
                     Context *on_safe);
   void handle_event(const FlattenEvent &event, Context *on_ready,
                     Context *on_safe);
+  void handle_event(const DemoteEvent &event, Context *on_ready,
+                    Context *on_safe);
   void handle_event(const UnknownEvent &event, Context *on_ready,
                     Context *on_safe);
 
diff --git a/src/librbd/journal/StandardPolicy.cc b/src/librbd/journal/StandardPolicy.cc
new file mode 100644
index 0000000..9e71828
--- /dev/null
+++ b/src/librbd/journal/StandardPolicy.cc
@@ -0,0 +1,34 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/journal/StandardPolicy.h"
+#include "common/WorkQueue.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Journal.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::journal::StandardPolicy: "
+
+namespace librbd {
+namespace journal {
+
+void StandardPolicy::allocate_tag_on_lock(Context *on_finish) {  // allocates a local journal tag, or fails with -EPERM when not tag owner
+  assert(m_image_ctx->journal != nullptr);  // precondition: the image has a journal object
+
+  if (!m_image_ctx->journal->is_tag_owner()) {
+    lderr(m_image_ctx->cct) << "local image not promoted" << dendl;
+    m_image_ctx->op_work_queue->queue(on_finish, -EPERM);  // completion is queued on the work queue, not invoked inline
+    return;
+  }
+
+  m_image_ctx->journal->allocate_local_tag(on_finish);  // on_finish is handed over to the journal
+}
+
+void StandardPolicy::cancel_external_replay(Context *on_finish) {  // not supported by this policy; on_finish is never completed
+  // external replay is only handled by rbd-mirror, so reaching this is a bug
+  assert(false);
+}
+
+} // namespace journal
+} // namespace librbd
diff --git a/src/librbd/journal/StandardPolicy.h b/src/librbd/journal/StandardPolicy.h
new file mode 100644
index 0000000..c49ec9c
--- /dev/null
+++ b/src/librbd/journal/StandardPolicy.h
@@ -0,0 +1,30 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_JOURNAL_STANDARD_POLICY_H
+#define CEPH_LIBRBD_JOURNAL_STANDARD_POLICY_H
+
+#include "librbd/journal/Policy.h"
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace journal {
+
+class StandardPolicy : public Policy {  // concrete Policy bound to a single ImageCtx
+public:
+  StandardPolicy(ImageCtx *image_ctx) : m_image_ctx(image_ctx) {  // stores the pointer only
+  }
+
+  virtual void allocate_tag_on_lock(Context *on_finish);  // defined in StandardPolicy.cc
+  virtual void cancel_external_replay(Context *on_finish);  // defined in StandardPolicy.cc
+
+private:
+  ImageCtx *m_image_ctx;  // raw pointer; this class declares no destructor and never deletes it
+};
+
+} // namespace journal
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_JOURNAL_STANDARD_POLICY_H
diff --git a/src/librbd/journal/TypeTraits.h b/src/librbd/journal/TypeTraits.h
new file mode 100644
index 0000000..d6dde69
--- /dev/null
+++ b/src/librbd/journal/TypeTraits.h
@@ -0,0 +1,26 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_JOURNAL_TYPE_TRAITS_H
+#define CEPH_LIBRBD_JOURNAL_TYPE_TRAITS_H
+
+namespace journal {
+class Future;
+class Journaler;
+class ReplayEntry;
+}
+
+namespace librbd {
+namespace journal {
+
+template <typename ImageCtxT>  // ImageCtxT is not used by this primary template
+struct TypeTraits {  // presumably specialized elsewhere to substitute other types - TODO confirm
+  typedef ::journal::Journaler Journaler;  // aliases to the forward-declared ::journal classes
+  typedef ::journal::Future Future;
+  typedef ::journal::ReplayEntry ReplayEntry;
+};
+
+} // namespace journal
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_JOURNAL_TYPE_TRAITS_H
diff --git a/src/librbd/journal/Types.cc b/src/librbd/journal/Types.cc
index 86fc7e0..8f9f942 100644
--- a/src/librbd/journal/Types.cc
+++ b/src/librbd/journal/Types.cc
@@ -200,6 +200,15 @@ void ResizeEvent::dump(Formatter *f) const {
   f->dump_unsigned("size", size);
 }
 
+void DemoteEvent::encode(bufferlist& bl) const {  // intentionally empty: nothing is appended to bl
+}
+
+void DemoteEvent::decode(__u8 version, bufferlist::iterator& it) {  // intentionally empty: nothing is read from it, version is ignored
+}
+
+void DemoteEvent::dump(Formatter *f) const {  // intentionally empty: nothing is written to f
+}
+
 void UnknownEvent::encode(bufferlist& bl) const {
   assert(false);
 }
@@ -267,6 +276,9 @@ void EventEntry::decode(bufferlist::iterator& it) {
   case EVENT_TYPE_FLATTEN:
     event = FlattenEvent();
     break;
+  case EVENT_TYPE_DEMOTE:
+    event = DemoteEvent();
+    break;
   default:
     event = UnknownEvent();
     break;
@@ -318,6 +330,8 @@ void EventEntry::generate_test_instances(std::list<EventEntry *> &o) {
   o.push_back(new EventEntry(ResizeEvent(901, 1234)));
 
   o.push_back(new EventEntry(FlattenEvent(123)));
+
+  o.push_back(new EventEntry(DemoteEvent()));
 }
 
 // Journal Client
@@ -359,6 +373,7 @@ void MirrorPeerSyncPoint::dump(Formatter *f) const {
 
 void MirrorPeerClientMeta::encode(bufferlist& bl) const {
   ::encode(image_id, bl);
+  ::encode(static_cast<uint32_t>(state), bl);
   ::encode(sync_object_count, bl);
   ::encode(static_cast<uint32_t>(sync_points.size()), bl);
   for (auto &sync_point : sync_points) {
@@ -369,6 +384,11 @@ void MirrorPeerClientMeta::encode(bufferlist& bl) const {
 
 void MirrorPeerClientMeta::decode(__u8 version, bufferlist::iterator& it) {
   ::decode(image_id, it);
+
+  uint32_t decode_state;
+  ::decode(decode_state, it);
+  state = static_cast<MirrorPeerState>(decode_state);
+
   ::decode(sync_object_count, it);
 
   uint32_t sync_point_count;
@@ -383,6 +403,7 @@ void MirrorPeerClientMeta::decode(__u8 version, bufferlist::iterator& it) {
 
 void MirrorPeerClientMeta::dump(Formatter *f) const {
   f->dump_string("image_id", image_id);
+  f->dump_stream("state") << state;
   f->dump_unsigned("sync_object_count", sync_object_count);
   f->open_array_section("sync_points");
   for (auto &sync_point : sync_points) {
@@ -546,6 +567,9 @@ std::ostream &operator<<(std::ostream &out, const EventType &type) {
   case EVENT_TYPE_FLATTEN:
     out << "Flatten";
     break;
+  case EVENT_TYPE_DEMOTE:
+    out << "Demote";
+    break;
   default:
     out << "Unknown (" << static_cast<uint32_t>(type) << ")";
     break;
@@ -588,8 +612,24 @@ std::ostream &operator<<(std::ostream &out, const MirrorPeerSyncPoint &sync) {
   return out;
 }
 
+std::ostream &operator<<(std::ostream &out, const MirrorPeerState &state) {
+  // Stream the symbolic name of a mirror peer state; values outside the enum
+  // are rendered as "Unknown (<numeric value>)".
+  switch (state) {
+  case MIRROR_PEER_STATE_SYNCING:
+    return out << "Syncing";
+  case MIRROR_PEER_STATE_REPLAYING:
+    return out << "Replaying";
+  default:
+    break;
+  }
+  out << "Unknown (" << static_cast<uint32_t>(state) << ")";
+  return out;
+}
+
 std::ostream &operator<<(std::ostream &out, const MirrorPeerClientMeta &meta) {
   out << "[image_id=" << meta.image_id << ", "
+      << "state=" << meta.state << ", "
       << "sync_object_count=" << meta.sync_object_count << ", "
       << "sync_points=[";
   std::string delimiter;
diff --git a/src/librbd/journal/Types.h b/src/librbd/journal/Types.h
index 2080399..0f25766 100644
--- a/src/librbd/journal/Types.h
+++ b/src/librbd/journal/Types.h
@@ -34,7 +34,8 @@ enum EventType {
   EVENT_TYPE_SNAP_ROLLBACK  = 9,
   EVENT_TYPE_RENAME         = 10,
   EVENT_TYPE_RESIZE         = 11,
-  EVENT_TYPE_FLATTEN        = 12
+  EVENT_TYPE_FLATTEN        = 12,
+  EVENT_TYPE_DEMOTE         = 13
 };
 
 struct AioDiscardEvent {
@@ -256,6 +257,14 @@ struct FlattenEvent : public OpEventBase {
   using OpEventBase::dump;
 };
 
+struct DemoteEvent { // journal event (EVENT_TYPE_DEMOTE) that carries no data members
+  static const EventType TYPE = static_cast<EventType>(EVENT_TYPE_DEMOTE);
+
+  void encode(bufferlist& bl) const; // writes nothing
+  void decode(__u8 version, bufferlist::iterator& it); // reads nothing
+  void dump(Formatter *f) const; // dumps nothing
+};
+
 struct UnknownEvent {
   static const EventType TYPE = static_cast<EventType>(-1);
 
@@ -277,6 +286,7 @@ typedef boost::variant<AioDiscardEvent,
                        RenameEvent,
                        ResizeEvent,
                        FlattenEvent,
+                       DemoteEvent,
                        UnknownEvent> Event;
 
 struct EventEntry {
@@ -351,6 +361,11 @@ struct MirrorPeerSyncPoint {
   void dump(Formatter *f) const;
 };
 
+enum MirrorPeerState { // encoded as uint32_t in MirrorPeerClientMeta, so values are pinned
+  MIRROR_PEER_STATE_SYNCING   = 0, ///< printed as "Syncing"
+  MIRROR_PEER_STATE_REPLAYING = 1  ///< printed as "Replaying"
+};
+
 struct MirrorPeerClientMeta {
   typedef std::list<MirrorPeerSyncPoint> SyncPoints;
   typedef std::map<uint64_t, uint64_t> SnapSeqs;
@@ -358,6 +373,7 @@ struct MirrorPeerClientMeta {
   static const ClientMetaType TYPE = MIRROR_PEER_CLIENT_META_TYPE;
 
   std::string image_id;
+  MirrorPeerState state = MIRROR_PEER_STATE_SYNCING; ///< replay state
   uint64_t sync_object_count = 0; ///< maximum number of objects ever sync'ed
   SyncPoints sync_points;         ///< max two in-use snapshots for sync
   SnapSeqs snap_seqs;             ///< local to peer snap seq mapping
@@ -372,6 +388,7 @@ struct MirrorPeerClientMeta {
 
   inline bool operator==(const MirrorPeerClientMeta &meta) const {
     return (image_id == meta.image_id &&
+            state == meta.state &&
             sync_object_count == meta.sync_object_count &&
             sync_points == meta.sync_points &&
             snap_seqs == meta.snap_seqs);
@@ -458,6 +475,7 @@ std::ostream &operator<<(std::ostream &out, const EventType &type);
 std::ostream &operator<<(std::ostream &out, const ClientMetaType &type);
 std::ostream &operator<<(std::ostream &out, const ImageClientMeta &meta);
 std::ostream &operator<<(std::ostream &out, const MirrorPeerSyncPoint &sync);
+std::ostream &operator<<(std::ostream &out, const MirrorPeerState &state);
 std::ostream &operator<<(std::ostream &out, const MirrorPeerClientMeta &meta);
 std::ostream &operator<<(std::ostream &out, const TagData &tag_data);
 
diff --git a/src/librbd/mirroring_watcher/Types.cc b/src/librbd/mirroring_watcher/Types.cc
new file mode 100644
index 0000000..f81b4ba
--- /dev/null
+++ b/src/librbd/mirroring_watcher/Types.cc
@@ -0,0 +1,160 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/mirroring_watcher/Types.h"
+#include "include/assert.h"
+#include "include/stringify.h"
+#include "common/Formatter.h"
+
+namespace librbd {
+namespace mirroring_watcher {
+
+namespace {
+
+class EncodePayloadVisitor : public boost::static_visitor<void> { // serializes the active payload variant
+public:
+  explicit EncodePayloadVisitor(bufferlist &bl) : m_bl(bl) {}
+
+  template <typename Payload>
+  inline void operator()(const Payload &payload) const {
+    ::encode(static_cast<uint32_t>(Payload::NOTIFY_OP), m_bl); // op tag first; decode switches on it
+    payload.encode(m_bl); // then the payload's own fields
+  }
+
+private:
+  bufferlist &m_bl; // destination buffer, held by reference
+};
+
+class DecodePayloadVisitor : public boost::static_visitor<void> { // fills the already-selected payload variant
+public:
+  DecodePayloadVisitor(__u8 version, bufferlist::iterator &iter)
+    : m_version(version), m_iter(iter) {}
+
+  template <typename Payload>
+  inline void operator()(Payload &payload) const {
+    payload.decode(m_version, m_iter); // forwards to the payload's own decode()
+  }
+
+private:
+  __u8 m_version; // version value handed to every payload decode()
+  bufferlist::iterator &m_iter; // source iterator, held by reference
+};
+
+class DumpPayloadVisitor : public boost::static_visitor<void> { // dumps the active payload variant
+public:
+  explicit DumpPayloadVisitor(Formatter *formatter) : m_formatter(formatter) {}
+
+  template <typename Payload>
+  inline void operator()(const Payload &payload) const {
+    NotifyOp notify_op = Payload::NOTIFY_OP; // NOTE(review): local copy presumably avoids binding the in-class static by reference; confirm
+    m_formatter->dump_string("notify_op", stringify(notify_op)); // op name via stringify()
+    payload.dump(m_formatter); // then the payload's own fields
+  }
+
+private:
+  ceph::Formatter *m_formatter; // output formatter (pointer stored, not owned here as far as this file shows)
+};
+
+} // anonymous namespace
+
+void ModeUpdatedPayload::encode(bufferlist &bl) const { // single field: mirror_mode
+  ::encode(static_cast<uint32_t>(mirror_mode), bl); // enum is written as uint32_t
+}
+
+void ModeUpdatedPayload::decode(__u8 version, bufferlist::iterator &iter) {
+  uint32_t raw_mode; // wire form of mirror_mode, as written by encode()
+  ::decode(raw_mode, iter);
+  mirror_mode = static_cast<cls::rbd::MirrorMode>(raw_mode);
+}
+
+void ModeUpdatedPayload::dump(Formatter *f) const { // emits the single "mirror_mode" field
+  f->dump_stream("mirror_mode") << mirror_mode;
+}
+
+void ImageUpdatedPayload::encode(bufferlist &bl) const { // field order must match decode()
+  ::encode(static_cast<uint32_t>(mirror_image_state), bl); // enum is written as uint32_t
+  ::encode(image_id, bl);
+  ::encode(global_image_id, bl);
+}
+
+void ImageUpdatedPayload::decode(__u8 version, bufferlist::iterator &iter) {
+  // Fields are read in the order encode() wrote them: state, id, global id.
+  uint32_t raw_state;
+  ::decode(raw_state, iter);
+  mirror_image_state = static_cast<cls::rbd::MirrorImageState>(raw_state);
+  ::decode(image_id, iter);
+  ::decode(global_image_id, iter);
+}
+
+void ImageUpdatedPayload::dump(Formatter *f) const { // emits all three fields by name
+  f->dump_stream("mirror_image_state") << mirror_image_state;
+  f->dump_string("image_id", image_id);
+  f->dump_string("global_image_id", global_image_id);
+}
+
+void UnknownPayload::encode(bufferlist &bl) const { // an unknown payload is never meant to be serialized
+  assert(false);
+}
+
+void UnknownPayload::decode(__u8 version, bufferlist::iterator &iter) { // reads nothing from iter
+}
+
+void UnknownPayload::dump(Formatter *f) const { // no fields to dump
+}
+
+void NotifyMessage::encode(bufferlist& bl) const { // versioned envelope around op tag + payload
+  ENCODE_START(1, 1, bl);
+  boost::apply_visitor(EncodePayloadVisitor(bl), payload); // visitor writes NOTIFY_OP, then the payload
+  ENCODE_FINISH(bl);
+}
+
+void NotifyMessage::decode(bufferlist::iterator& iter) { // rebuilds the payload variant from its wire form
+  DECODE_START(1, iter);
+
+  uint32_t notify_op; // op tag written first by EncodePayloadVisitor
+  ::decode(notify_op, iter);
+
+  // select the correct payload variant based upon the encoded op
+  switch (notify_op) {
+  case NOTIFY_OP_MODE_UPDATED:
+    payload = ModeUpdatedPayload();
+    break;
+  case NOTIFY_OP_IMAGE_UPDATED:
+    payload = ImageUpdatedPayload();
+    break;
+  default:
+    payload = UnknownPayload(); // unrecognized op: UnknownPayload::decode reads nothing
+    break;
+  }
+
+  boost::apply_visitor(DecodePayloadVisitor(struct_v, iter), payload); // qualified, matching encode()
+  DECODE_FINISH(iter);
+}
+
+void NotifyMessage::dump(Formatter *f) const { // emits "notify_op" plus the active payload's fields
+  boost::apply_visitor(DumpPayloadVisitor(f), payload); // qualified, matching encode()
+}
+
+void NotifyMessage::generate_test_instances(std::list<NotifyMessage *> &o) { // appends heap-allocated samples to o
+  o.push_back(new NotifyMessage(ModeUpdatedPayload(cls::rbd::MIRROR_MODE_DISABLED)));
+  o.push_back(new NotifyMessage(ImageUpdatedPayload(cls::rbd::MIRROR_IMAGE_STATE_DISABLING,
+                                                    "image id", "global image id")));
+}
+
+std::ostream &operator<<(std::ostream &out, const NotifyOp &op) {
+  // Stream the symbolic name of a notification op; values outside the enum
+  // are rendered as "Unknown (<numeric value>)".
+  switch (op) {
+  case NOTIFY_OP_MODE_UPDATED:
+    return out << "ModeUpdated";
+  case NOTIFY_OP_IMAGE_UPDATED:
+    return out << "ImageUpdated";
+  default:
+    break;
+  }
+  out << "Unknown (" << static_cast<uint32_t>(op) << ")";
+  return out;
+}
+
+} // namespace mirroring_watcher
+} // namespace librbd
diff --git a/src/librbd/mirroring_watcher/Types.h b/src/librbd/mirroring_watcher/Types.h
new file mode 100644
index 0000000..16bf5b6
--- /dev/null
+++ b/src/librbd/mirroring_watcher/Types.h
@@ -0,0 +1,102 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_MIRRORING_WATCHER_TYPES_H
+#define CEPH_LIBRBD_MIRRORING_WATCHER_TYPES_H
+
+#include "include/int_types.h"
+#include "include/buffer_fwd.h"
+#include "include/encoding.h"
+#include "cls/rbd/cls_rbd_types.h"
+#include <iosfwd>
+#include <list>
+#include <string>
+#include <boost/variant.hpp>
+
+namespace ceph { class Formatter; }
+
+namespace librbd {
+namespace mirroring_watcher {
+
+enum NotifyOp { // written as uint32_t ahead of the payload in NotifyMessage
+  NOTIFY_OP_MODE_UPDATED  = 0, // tag of ModeUpdatedPayload
+  NOTIFY_OP_IMAGE_UPDATED = 1  // tag of ImageUpdatedPayload
+};
+
+struct ModeUpdatedPayload { // notification payload carrying a single mirror mode value
+  static const NotifyOp NOTIFY_OP = NOTIFY_OP_MODE_UPDATED;
+
+  cls::rbd::MirrorMode mirror_mode = cls::rbd::MIRROR_MODE_DISABLED; // default until set or decoded
+
+  ModeUpdatedPayload() {
+  }
+  ModeUpdatedPayload(cls::rbd::MirrorMode mirror_mode)
+    : mirror_mode(mirror_mode) {
+  }
+
+  void encode(bufferlist &bl) const; // writes mirror_mode as uint32_t
+  void decode(__u8 version, bufferlist::iterator &iter); // reads mirror_mode back
+  void dump(Formatter *f) const; // emits "mirror_mode"
+};
+
+struct ImageUpdatedPayload { // notification payload: image state plus its local and global ids
+  static const NotifyOp NOTIFY_OP = NOTIFY_OP_IMAGE_UPDATED;
+
+  cls::rbd::MirrorImageState mirror_image_state =
+    cls::rbd::MIRROR_IMAGE_STATE_ENABLED; // default until set or decoded
+  std::string image_id;
+  std::string global_image_id;
+
+  ImageUpdatedPayload() {
+  }
+  ImageUpdatedPayload(cls::rbd::MirrorImageState mirror_image_state,
+                      const std::string &image_id,
+                      const std::string &global_image_id)
+    : mirror_image_state(mirror_image_state), image_id(image_id),
+      global_image_id(global_image_id) {
+  }
+
+  void encode(bufferlist &bl) const; // writes state (as uint32_t), image_id, global_image_id
+  void decode(__u8 version, bufferlist::iterator &iter); // reads the same three fields in that order
+  void dump(Formatter *f) const; // emits the three fields by name
+};
+
+struct UnknownPayload { // placeholder chosen by NotifyMessage::decode for unrecognized ops
+  static const NotifyOp NOTIFY_OP = static_cast<NotifyOp>(-1); // value outside the declared NotifyOp enumerators
+
+  UnknownPayload() {
+  }
+
+  void encode(bufferlist &bl) const; // asserts: not meant to be serialized
+  void decode(__u8 version, bufferlist::iterator &iter); // reads nothing
+  void dump(Formatter *f) const; // dumps nothing
+};
+
+typedef boost::variant<ModeUpdatedPayload,
+                       ImageUpdatedPayload,
+                       UnknownPayload> Payload;
+
+struct NotifyMessage { // envelope holding exactly one payload variant
+  NotifyMessage(const Payload &payload = UnknownPayload()) : payload(payload) { // defaults to UnknownPayload
+  }
+
+  Payload payload; // active variant determines the encoded op tag
+
+  void encode(bufferlist& bl) const; // versioned: op tag, then payload fields
+  void decode(bufferlist::iterator& it); // picks the variant from the op tag, then decodes it
+  void dump(Formatter *f) const; // emits "notify_op" and the payload fields
+
+  static void generate_test_instances(std::list<NotifyMessage *> &o); // appends sample instances to o
+};
+
+WRITE_CLASS_ENCODER(NotifyMessage);
+
+std::ostream &operator<<(std::ostream &out, const NotifyOp &op);
+
+} // namespace mirroring_watcher
+} // namespace librbd
+
+using librbd::mirroring_watcher::encode;
+using librbd::mirroring_watcher::decode;
+
+#endif // CEPH_LIBRBD_MIRRORING_WATCHER_TYPES_H
diff --git a/src/mds/Beacon.cc b/src/mds/Beacon.cc
index 81de9e4..16f20ba 100644
--- a/src/mds/Beacon.cc
+++ b/src/mds/Beacon.cc
@@ -34,14 +34,14 @@
 
 Beacon::Beacon(CephContext *cct_, MonClient *monc_, std::string name_) :
   Dispatcher(cct_), lock("Beacon"), monc(monc_), timer(g_ceph_context, lock),
-  name(name_), awaiting_seq(-1)
+  name(name_), standby_for_rank(MDSMap::MDS_NO_STANDBY_PREF),
+  standby_for_fscid(FS_CLUSTER_ID_NONE), awaiting_seq(-1) // standby_* defaults; Beacon::init() overwrites them
 {
   want_state = MDSMap::STATE_NULL;
   last_seq = 0;
   sender = NULL;
   was_laggy = false;
 
-  standby_for_rank = MDSMap::MDS_NO_STANDBY_PREF;
   epoch = 0;
 }
 
@@ -52,7 +52,8 @@ Beacon::~Beacon()
 
 
 void Beacon::init(MDSMap const *mdsmap, MDSMap::DaemonState want_state_,
-    mds_rank_t standby_rank_, std::string const & standby_name_)
+    mds_rank_t standby_rank_, std::string const & standby_name_,
+    fs_cluster_id_t standby_fscid_)
 {
   Mutex::Locker l(lock);
   assert(mdsmap != NULL);
@@ -61,6 +62,7 @@ void Beacon::init(MDSMap const *mdsmap, MDSMap::DaemonState want_state_,
   _notify_mdsmap(mdsmap);
   standby_for_rank = standby_rank_;
   standby_for_name = standby_name_;
+  standby_for_fscid = standby_fscid_;
 
   // Spawn threads and start messaging
   timer.init();
@@ -207,6 +209,7 @@ void Beacon::_send()
 
   beacon->set_standby_for_rank(standby_for_rank);
   beacon->set_standby_for_name(standby_for_name);
+  beacon->set_standby_for_fscid(standby_for_fscid);
   beacon->set_health(health);
   beacon->set_compat(compat);
   // piggyback the sys info on beacon msg
diff --git a/src/mds/Beacon.h b/src/mds/Beacon.h
index 29efb4a..e8368cf 100644
--- a/src/mds/Beacon.h
+++ b/src/mds/Beacon.h
@@ -50,6 +50,7 @@ class Beacon : public Dispatcher
   CompatSet compat;
   mds_rank_t standby_for_rank;
   std::string standby_for_name;
+  fs_cluster_id_t standby_for_fscid;
   MDSMap::DaemonState want_state;
 
   // Internal beacon state
@@ -85,7 +86,9 @@ public:
   Beacon(CephContext *cct_, MonClient *monc_, std::string name);
   ~Beacon();
 
-  void init(MDSMap const *mdsmap, MDSMap::DaemonState want_state_, mds_rank_t standby_rank_, std::string const &standby_name_);
+  void init(MDSMap const *mdsmap, MDSMap::DaemonState want_state_,
+      mds_rank_t standby_rank_, std::string const &standby_name_,
+      fs_cluster_id_t standby_fscid_);
   void shutdown();
 
   bool ms_dispatch(Message *m); 
@@ -96,8 +99,6 @@ public:
   void notify_mdsmap(MDSMap const *mdsmap);
   void notify_health(MDSRank const *mds);
 
-  void set_standby_for(mds_rank_t rank_, std::string const &name_);
-
   void handle_mds_beacon(MMDSBeacon *m);
   void send();
 
diff --git a/src/mds/CInode.cc b/src/mds/CInode.cc
index 28a1b1a..423be96 100644
--- a/src/mds/CInode.cc
+++ b/src/mds/CInode.cc
@@ -4173,12 +4173,10 @@ void CInode::scrub_initialize(CDentry *scrub_parent,
 			      MDSInternalContextBase *f)
 {
   dout(20) << __func__ << " with scrub_version " << get_version() << dendl;
-  assert(!scrub_infop || !scrub_infop->scrub_in_progress);
+  assert(!scrub_is_in_progress());
   scrub_info();
   if (!scrub_infop)
     scrub_infop = new scrub_info_t();
-  else
-    assert(!scrub_infop->scrub_in_progress);
 
   if (get_projected_inode()->is_dir()) {
     // fill in dirfrag_stamps with initial state
@@ -4210,7 +4208,7 @@ void CInode::scrub_initialize(CDentry *scrub_parent,
 int CInode::scrub_dirfrag_next(frag_t* out_dirfrag)
 {
   dout(20) << __func__ << dendl;
-  assert(scrub_infop && scrub_infop->scrub_in_progress);
+  assert(scrub_is_in_progress());
 
   if (!is_dir()) {
     return -ENOTDIR;
@@ -4258,7 +4256,7 @@ void CInode::scrub_dirfrags_scrubbing(list<frag_t>* out_dirfrags)
 void CInode::scrub_dirfrag_finished(frag_t dirfrag)
 {
   dout(20) << __func__ << " on frag " << dirfrag << dendl;
-  assert(scrub_infop && scrub_infop->scrub_in_progress);
+  assert(scrub_is_in_progress());
 
   std::map<frag_t, scrub_stamp_info_t>::iterator i =
       scrub_infop->dirfrag_stamps.find(dirfrag);
@@ -4271,7 +4269,7 @@ void CInode::scrub_dirfrag_finished(frag_t dirfrag)
 
 void CInode::scrub_finished(MDSInternalContextBase **c) {
   dout(20) << __func__ << dendl;
-  assert(scrub_info()->scrub_in_progress);
+  assert(scrub_is_in_progress());
   for (std::map<frag_t, scrub_stamp_info_t>::iterator i =
       scrub_infop->dirfrag_stamps.begin();
       i != scrub_infop->dirfrag_stamps.end();
diff --git a/src/mds/CInode.h b/src/mds/CInode.h
index 9260c4b..01f6797 100644
--- a/src/mds/CInode.h
+++ b/src/mds/CInode.h
@@ -288,6 +288,10 @@ public:
       scrub_info_create();
     return scrub_infop;
   }
+
+  bool scrub_is_in_progress() const { // false when no scrub info has been allocated
+    return scrub_infop ? scrub_infop->scrub_in_progress : false;
+  }
   /**
    * Start scrubbing on this inode. That could be very short if it's
    * a file, or take a long time if we're recursively scrubbing a directory.
diff --git a/src/mds/FSMap.cc b/src/mds/FSMap.cc
index 094b78e..29f94dc 100644
--- a/src/mds/FSMap.cc
+++ b/src/mds/FSMap.cc
@@ -75,12 +75,29 @@ void FSMap::generate_test_instances(list<FSMap*>& ls)
 
 void FSMap::print(ostream& out) const
 {
-  // TODO add a non-json print?
-  JSONFormatter f(true);
-  f.open_object_section("fsmap");
-  dump(&f);
-  f.close_section();
-  f.flush(out);
+  out << "e" << epoch << std::endl;  // plain-text header: epoch, flag, compat set
+  out << "enable_multiple: " << enable_multiple << std::endl;
+  out << "compat: " << compat << std::endl;
+  out << " " << std::endl;
+
+  if (filesystems.empty()) {
+    out << "No filesystems configured" << std::endl;
+    return;  // note: standby daemons are not listed in this case
+  }
+
+  for (const auto &fs : filesystems) {
+    fs.second->print(out);
+    out << " " << std::endl << " " << std::endl;  // Space out a bit
+  }
+
+  if (!standby_daemons.empty()) {
+    out << "Standby daemons:" << std::endl << " " << std::endl;
+  }
+
+  for (const auto &p : standby_daemons) {
+    p.second.print_summary(out);  // print_summary() emits no newline itself
+    out << std::endl;
+  }
 }
 
 
@@ -224,7 +241,7 @@ void FSMap::encode(bufferlist& bl, uint64_t features) const
     for (auto i : filesystems) {
       fs_list.push_back(*(i.second));
     }
-    ::encode(fs_list, bl);
+    ::encode(fs_list, bl, features);
     ::encode(mds_roles, bl);
     ::encode(standby_daemons, bl, features);
     ::encode(standby_epochs, bl);
@@ -405,12 +422,12 @@ void FSMap::decode(bufferlist::iterator& p)
 }
 
 
-void Filesystem::encode(bufferlist& bl) const
+void Filesystem::encode(bufferlist& bl, uint64_t features) const
 {
   ENCODE_START(1, 1, bl);
   ::encode(fscid, bl);
   bufferlist mdsmap_bl;
-  mds_map.encode(mdsmap_bl, CEPH_FEATURE_PGID64 | CEPH_FEATURE_MDSENC);
+  mds_map.encode(mdsmap_bl, features);  // nested MDSMap is encoded with the caller's feature bits
   ::encode(mdsmap_bl, bl);
   ENCODE_FINISH(bl);
 }
@@ -449,10 +466,9 @@ int FSMap::parse_filesystem(
 
 void Filesystem::print(std::ostream &out) const
 {
-  // TODO add a non-json print?
-  JSONFormatter f;
-  dump(&f);
-  f.flush(out);
+  out << "Filesystem '" << mds_map.fs_name
+      << "' (" << fscid << ")" << std::endl;  // header line: fs name and fscid
+  mds_map.print(out);  // followed by the MDSMap's own text output
 }
 
 mds_gid_t FSMap::find_standby_for(mds_role_t role, const std::string& name) const
@@ -479,14 +495,18 @@ mds_gid_t FSMap::find_standby_for(mds_role_t role, const std::string& name) cons
       continue;
     }
 
-    if ((info.standby_for_rank == role.rank && info.standby_for_ns == role.fscid)
+    if ((info.standby_for_rank == role.rank && info.standby_for_fscid == role.fscid)
         || (name.length() && info.standby_for_name == name)) {
       // It's a named standby for *me*, use it.
       return gid;
-    } else if (info.standby_for_rank < 0 && info.standby_for_name.length() == 0)
-      // It's not a named standby for anyone, use it if we don't find
-      // a named standby for me later.
-      result = gid;
+    } else if (
+        info.standby_for_rank < 0 && info.standby_for_name.length() == 0 &&
+        (info.standby_for_fscid == FS_CLUSTER_ID_NONE ||
+         info.standby_for_fscid == role.fscid)) {
+        // It's not a named standby for anyone, use it if we don't find
+        // a named standby for me later, unless it targets another FSCID.
+        result = gid;
+      }
   }
 
   return result;
diff --git a/src/mds/FSMap.h b/src/mds/FSMap.h
index e14173f..1f6b069 100644
--- a/src/mds/FSMap.h
+++ b/src/mds/FSMap.h
@@ -57,7 +57,7 @@ class Filesystem
   fs_cluster_id_t fscid;
   MDSMap mds_map;
 
-  void encode(bufferlist& bl) const;
+  void encode(bufferlist& bl, uint64_t features) const;
   void decode(bufferlist::iterator& p);
 
   Filesystem()
@@ -86,7 +86,7 @@ class Filesystem
     return false;
   }
 };
-WRITE_CLASS_ENCODER(Filesystem)
+WRITE_CLASS_ENCODER_FEATURES(Filesystem)
 
 class FSMap {
 protected:
@@ -336,7 +336,7 @@ public:
    * for the one we had previously.  Impose the new one
    * on all filesystems.
    */
-  void update_compat(CompatSet c)
+  void update_compat(const CompatSet &c)
   {
     // We could do something more complicated here to enable
     // different filesystems to be served by different MDS versions,
diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
index 9eec056..d9e5901 100644
--- a/src/mds/MDCache.cc
+++ b/src/mds/MDCache.cc
@@ -10569,6 +10569,11 @@ bool MDCache::can_fragment(CInode *diri, list<CDir*>& dirs)
     return false;
   }
 
+  if (diri->scrub_is_in_progress()) {
+    dout(7) << "can_fragment: scrub in progress" << dendl;
+    return false;
+  }
+
   for (list<CDir*>::iterator p = dirs.begin(); p != dirs.end(); ++p) {
     CDir *dir = *p;
     if (dir->state_test(CDir::STATE_FRAGMENTING)) {
diff --git a/src/mds/MDSDaemon.cc b/src/mds/MDSDaemon.cc
index 293d7c8..daa0cb0 100644
--- a/src/mds/MDSDaemon.cc
+++ b/src/mds/MDSDaemon.cc
@@ -559,7 +559,9 @@ int MDSDaemon::init(MDSMap::DaemonState wanted_state)
   if (wanted_state == MDSMap::STATE_NULL) {
     wanted_state = MDSMap::STATE_BOOT;
   }
-  beacon.init(mdsmap, wanted_state, standby_for_rank, standby_for_name);
+  beacon.init(mdsmap, wanted_state,
+    standby_for_rank, standby_for_name,
+    fs_cluster_id_t(g_conf->mds_standby_for_fscid));
   messenger->set_myname(entity_name_t::MDS(MDS_RANK_NONE));
 
   // schedule tick
diff --git a/src/mds/MDSMap.cc b/src/mds/MDSMap.cc
index e9b4e85..9a7c26f 100644
--- a/src/mds/MDSMap.cc
+++ b/src/mds/MDSMap.cc
@@ -81,15 +81,42 @@ void MDSMap::mds_info_t::dump(Formatter *f) const
     f->dump_stream("laggy_since") << laggy_since;
   
   f->dump_int("standby_for_rank", standby_for_rank);
-  f->dump_int("standby_for_ns", standby_for_ns);
+  f->dump_int("standby_for_fscid", standby_for_fscid);
   f->dump_string("standby_for_name", standby_for_name);
   f->open_array_section("export_targets");
   for (set<mds_rank_t>::iterator p = export_targets.begin();
        p != export_targets.end(); ++p) {
     f->dump_int("mds", *p);
   }
-  f->dump_unsigned("features", mds_features);
   f->close_section();
+  f->dump_unsigned("features", mds_features);
+}
+
+void MDSMap::mds_info_t::print_summary(ostream &out) const
+{
+  // Write a one-line description of this daemon to 'out'.  No newline is
+  // appended here.
+  out << global_id << ":\t" << addr << " '" << name << "'"
+      << " mds." << rank << "." << inc
+      << " " << ceph_mds_state_name(state) << " seq " << state_seq;
+  if (laggy()) {
+    out << " laggy since " << laggy_since;
+  }
+
+  const bool named_standby = !standby_for_name.empty();
+  if (standby_for_rank != -1 || named_standby) {
+    // The rank is printed whenever this section appears, even if only a
+    // standby name is configured.
+    out << " (standby for" << " rank " << standby_for_rank;
+    if (named_standby) {
+      out << " '" << standby_for_name << "'";
+    }
+    out << ")";
+  }
+
+  if (!export_targets.empty()) {
+    out << " export_targets=" << export_targets;
+  }
+}
 
 void MDSMap::mds_info_t::generate_test_instances(list<mds_info_t*>& ls)
@@ -210,28 +237,8 @@ void MDSMap::print(ostream& out) const
 
   for (const auto &p : foo) {
     const mds_info_t& info = mds_info.at(p.second);
-    
-    out << p.second << ":\t"
-	<< info.addr
-	<< " '" << info.name << "'"
-	<< " mds." << info.rank
-	<< "." << info.inc
-	<< " " << ceph_mds_state_name(info.state)
-	<< " seq " << info.state_seq;
-    if (info.laggy())
-      out << " laggy since " << info.laggy_since;
-    if (info.standby_for_rank != -1 ||
-	!info.standby_for_name.empty()) {
-      out << " (standby for";
-      //if (info.standby_for_rank >= 0)
-	out << " rank " << info.standby_for_rank;
-      if (!info.standby_for_name.empty())
-	out << " '" << info.standby_for_name << "'";
-      out << ")";
-    }
-    if (!info.export_targets.empty())
-      out << " export_targets=" << info.export_targets;
-    out << "\n";    
+    info.print_summary(out);
+    out << "\n";
   }
 }
 
@@ -412,7 +419,7 @@ void MDSMap::mds_info_t::encode_versioned(bufferlist& bl, uint64_t features) con
   ::encode(standby_for_name, bl);
   ::encode(export_targets, bl);
   ::encode(mds_features, bl);
-  ::encode(standby_for_ns, bl);
+  ::encode(standby_for_fscid, bl);
   ENCODE_FINISH(bl);
 }
 
@@ -451,7 +458,7 @@ void MDSMap::mds_info_t::decode(bufferlist::iterator& bl)
   if (struct_v >= 5)
     ::decode(mds_features, bl);
   if (struct_v >= 6) {
-    ::decode(standby_for_ns, bl);
+    ::decode(standby_for_fscid, bl);
   }
   DECODE_FINISH(bl);
 }
@@ -667,14 +674,14 @@ MDSMap::availability_t MDSMap::is_cluster_available() const
     return STUCK_UNAVAILABLE;
   }
 
-  for (const auto rank : in) {                                                  
-  if (up.count(rank) && mds_info.at(up.at(rank)).laggy()) {
-    // This might only be transient, but because we can't see
-    // standbys, we have no way of knowing whether there is a
-    // standby available to replace the laggy guy.
-    return STUCK_UNAVAILABLE;                                                 
-  }                                                                           
-}   
+  for (const auto rank : in) {
+    if (up.count(rank) && mds_info.at(up.at(rank)).laggy()) {
+      // This might only be transient, but because we can't see
+      // standbys, we have no way of knowing whether there is a
+      // standby available to replace the laggy guy.
+      return STUCK_UNAVAILABLE;
+    }
+  }
 
   if (get_num_mds(CEPH_MDS_STATE_ACTIVE) > 0) {
     // Nobody looks stuck, so indicate to client they should go ahead
diff --git a/src/mds/MDSMap.h b/src/mds/MDSMap.h
index 314fe4e..2842f93 100644
--- a/src/mds/MDSMap.h
+++ b/src/mds/MDSMap.h
@@ -140,13 +140,13 @@ public:
     utime_t laggy_since;
     mds_rank_t standby_for_rank;
     std::string standby_for_name;
-    fs_cluster_id_t standby_for_ns;
+    fs_cluster_id_t standby_for_fscid;
     std::set<mds_rank_t> export_targets;
     uint64_t mds_features;
 
     mds_info_t() : global_id(MDS_GID_NONE), rank(MDS_RANK_NONE), inc(0), state(STATE_STANDBY), state_seq(0),
 		   standby_for_rank(MDS_NO_STANDBY_PREF),
-                   standby_for_ns(FS_CLUSTER_ID_NONE)
+                   standby_for_fscid(FS_CLUSTER_ID_NONE)
     { }
 
     bool laggy() const { return !(laggy_since == utime_t()); }
@@ -160,6 +160,7 @@ public:
     }
     void decode(bufferlist::iterator& p);
     void dump(Formatter *f) const;
+    void print_summary(ostream &out) const;
     static void generate_test_instances(list<mds_info_t*>& ls);
   private:
     void encode_versioned(bufferlist& bl, uint64_t features) const;
diff --git a/src/mds/MDSRank.cc b/src/mds/MDSRank.cc
index 1647c70..71e4925 100644
--- a/src/mds/MDSRank.cc
+++ b/src/mds/MDSRank.cc
@@ -1666,27 +1666,19 @@ bool MDSRankDispatcher::handle_asok_command(
 {
   if (command == "dump_ops_in_flight" ||
              command == "ops") {
-    RWLock::RLocker l(op_tracker.lock);
-    if (!op_tracker.tracking_enabled) {
+    if (!op_tracker.dump_ops_in_flight(f)) {
       ss << "op_tracker tracking is not enabled now, so no ops are tracked currently, even those get stuck. \
 	  please enable \"osd_enable_op_tracker\", and the tracker will start to track new ops received afterwards.";
-    } else {
-      op_tracker.dump_ops_in_flight(f);
     }
   } else if (command == "dump_blocked_ops") {
-    if (!op_tracker.tracking_enabled) {
+    if (!op_tracker.dump_ops_in_flight(f, true)) {
       ss << "op_tracker tracking is not enabled now, so no ops are tracked currently, even those get stuck. \
 	Please enable \"osd_enable_op_tracker\", and the tracker will start to track new ops received afterwards.";
-    } else {
-      op_tracker.dump_ops_in_flight(f, true);
     }
   } else if (command == "dump_historic_ops") {
-    RWLock::RLocker l(op_tracker.lock);
-    if (!op_tracker.tracking_enabled) {
+    if (!op_tracker.dump_historic_ops(f)) {
       ss << "op_tracker tracking is not enabled now, so no ops are tracked currently, even those get stuck. \
 	  please enable \"osd_enable_op_tracker\", and the tracker will start to track new ops received afterwards.";
-    } else {
-      op_tracker.dump_historic_ops(f);
     }
   } else if (command == "osdmap barrier") {
     int64_t target_epoch = 0;
diff --git a/src/mds/ScrubStack.cc b/src/mds/ScrubStack.cc
index 1eaddf6..7da1b4b 100644
--- a/src/mds/ScrubStack.cc
+++ b/src/mds/ScrubStack.cc
@@ -146,11 +146,15 @@ void ScrubStack::scrub_dir_inode(CInode *in,
 	++i) {
       // turn frags into CDir *
       CDir *dir = in->get_dirfrag(*i);
-      scrubbing_cdirs.push_back(dir);
-      dout(25) << __func__ << " got CDir " << *dir << " presently scrubbing" << dendl;
+      if (dir) {
+	scrubbing_cdirs.push_back(dir);
+	dout(25) << __func__ << " got CDir " << *dir << " presently scrubbing" << dendl;
+      } else {
+	in->scrub_dirfrag_finished(*i);
+	dout(25) << __func__ << " missing dirfrag " << *i << " skip scrubbing" << dendl;
+      }
     }
 
-
     dout(20) << __func__ << " consuming from " << scrubbing_cdirs.size()
 	     << " scrubbing cdirs" << dendl;
 
diff --git a/src/mds/Server.cc b/src/mds/Server.cc
index ee2af10..750fb41 100644
--- a/src/mds/Server.cc
+++ b/src/mds/Server.cc
@@ -2769,10 +2769,30 @@ void Server::handle_client_lookup_ino(MDRequestRef& mdr,
 
   CDentry *dn = in->get_projected_parent_dn();
   CInode *diri = dn ? dn->get_dir()->inode : NULL;
+
+  set<SimpleLock*> rdlocks;
   if (dn && (want_parent || want_dentry)) {
     mdr->pin(dn);
-    set<SimpleLock*> rdlocks, wrlocks, xlocks;
     rdlocks.insert(&dn->lock);
+  }
+
+  unsigned mask = req->head.args.getattr.mask;
+  if (mask) {
+    Capability *cap = in->get_client_cap(mdr->get_client());
+    int issued = 0;
+    if (cap && (mdr->snapid == CEPH_NOSNAP || mdr->snapid <= cap->client_follows))
+      issued = cap->issued();
+    // permission bits, ACL/security xattrs
+    if ((mask & CEPH_CAP_AUTH_SHARED) && (issued & CEPH_CAP_AUTH_EXCL) == 0)
+      rdlocks.insert(&in->authlock);
+    if ((mask & CEPH_CAP_XATTR_SHARED) && (issued & CEPH_CAP_XATTR_EXCL) == 0)
+      rdlocks.insert(&in->xattrlock);
+
+    mdr->getattr_caps = mask;
+  }
+
+  if (!rdlocks.empty()) {
+    set<SimpleLock*> wrlocks, xlocks;
     if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
       return;
 
@@ -2909,6 +2929,21 @@ void Server::handle_client_open(MDRequestRef& mdr)
     return;
   }
 
+  unsigned mask = req->head.args.open.mask;
+  if (mask) {
+    Capability *cap = cur->get_client_cap(mdr->get_client());
+    int issued = 0;
+    if (cap && (mdr->snapid == CEPH_NOSNAP || mdr->snapid <= cap->client_follows))
+      issued = cap->issued();
+    // permission bits, ACL/security xattrs
+    if ((mask & CEPH_CAP_AUTH_SHARED) && (issued & CEPH_CAP_AUTH_EXCL) == 0)
+      rdlocks.insert(&cur->authlock);
+    if ((mask & CEPH_CAP_XATTR_SHARED) && (issued & CEPH_CAP_XATTR_EXCL) == 0)
+      rdlocks.insert(&cur->xattrlock);
+
+    mdr->getattr_caps = mask;
+  }
+
   // O_TRUNC
   if ((flags & O_TRUNC) && !mdr->has_completed) {
     assert(cur->is_auth());
@@ -2945,7 +2980,7 @@ void Server::handle_client_open(MDRequestRef& mdr)
   if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
     return;
 
-  int mask = MAY_READ;
+  mask = MAY_READ;
   if (cmode & CEPH_FILE_MODE_WR)
     mask |= MAY_WRITE;
   if (!check_access(mdr, cur, mask))
diff --git a/src/messages/MFSMap.h b/src/messages/MFSMap.h
index dd886b5..b0b28b1 100644
--- a/src/messages/MFSMap.h
+++ b/src/messages/MFSMap.h
@@ -26,16 +26,18 @@ class MFSMap : public Message {
   bufferlist encoded;
 
   version_t get_epoch() const { return epoch; }
-  bufferlist& get_encoded() { return encoded; }
+  const FSMap & get_fsmap() {return fsmap;}
 
   MFSMap() : 
     Message(CEPH_MSG_FS_MAP), epoch(0) {}
-  MFSMap(const uuid_d &f, FSMap *fsmap) :
-    Message(CEPH_MSG_FS_MAP), epoch(fsmap->get_epoch())
+  MFSMap(const uuid_d &f, const FSMap &fsmap_) :
+    Message(CEPH_MSG_FS_MAP), epoch(fsmap_.get_epoch())
   {
-    fsmap->encode(encoded, -1);
+    fsmap = fsmap_;
   }
 private:
+  FSMap fsmap;
+
   ~MFSMap() {}
 
 public:
@@ -48,11 +50,11 @@ public:
   void decode_payload() {
     bufferlist::iterator p = payload.begin();
     ::decode(epoch, p);
-    ::decode(encoded, p);
+    ::decode(fsmap, p);
   }
   void encode_payload(uint64_t features) {
     ::encode(epoch, payload);
-    ::encode(encoded, payload);
+    ::encode(fsmap, payload, features);
   }
 };
 
diff --git a/src/messages/MMDSBeacon.h b/src/messages/MMDSBeacon.h
index d932f8a..a155075 100644
--- a/src/messages/MMDSBeacon.h
+++ b/src/messages/MMDSBeacon.h
@@ -121,7 +121,7 @@ WRITE_CLASS_ENCODER(MDSHealth)
 
 class MMDSBeacon : public PaxosServiceMessage {
 
-  static const int HEAD_VERSION = 5;
+  static const int HEAD_VERSION = 6;
   static const int COMPAT_VERSION = 2;
 
   uuid_d fsid;
@@ -130,8 +130,10 @@ class MMDSBeacon : public PaxosServiceMessage {
 
   MDSMap::DaemonState state;
   version_t seq;
-  mds_rank_t standby_for_rank;
-  string standby_for_name;
+
+  mds_rank_t      standby_for_rank;
+  string          standby_for_name;
+  fs_cluster_id_t standby_for_fscid;
 
   CompatSet compat;
 
@@ -146,7 +148,7 @@ class MMDSBeacon : public PaxosServiceMessage {
   MMDSBeacon(const uuid_d &f, mds_gid_t g, string& n, epoch_t les, MDSMap::DaemonState st, version_t se, uint64_t feat) :
     PaxosServiceMessage(MSG_MDS_BEACON, les, HEAD_VERSION, COMPAT_VERSION),
     fsid(f), global_id(g), name(n), state(st), seq(se),
-    standby_for_rank(MDS_RANK_NONE),
+    standby_for_rank(MDS_RANK_NONE), standby_for_fscid(FS_CLUSTER_ID_NONE),
     mds_features(feat) {
   }
 private:
@@ -162,6 +164,7 @@ public:
   const char *get_type_name() const { return "mdsbeacon"; }
   mds_rank_t get_standby_for_rank() { return standby_for_rank; }
   const string& get_standby_for_name() { return standby_for_name; }
+  const fs_cluster_id_t& get_standby_for_fscid() { return standby_for_fscid; }
   uint64_t get_mds_features() const { return mds_features; }
 
   CompatSet const& get_compat() const { return compat; }
@@ -173,6 +176,7 @@ public:
   void set_standby_for_rank(mds_rank_t r) { standby_for_rank = r; }
   void set_standby_for_name(string& n) { standby_for_name = n; }
   void set_standby_for_name(const char* c) { standby_for_name.assign(c); }
+  void set_standby_for_fscid(fs_cluster_id_t f) { standby_for_fscid = f; }
 
   const map<string, string>& get_sys_info() const { return sys_info; }
   void set_sys_info(const map<string, string>& i) { sys_info = i; }
@@ -197,6 +201,7 @@ public:
       ::encode(sys_info, payload);
     }
     ::encode(mds_features, payload);
+    ::encode(standby_for_fscid, payload);
   }
   void decode_payload() {
     bufferlist::iterator p = payload.begin();
@@ -220,6 +225,9 @@ public:
     if (header.version >= 5) {
       ::decode(mds_features, p);
     }
+    if (header.version >= 6) {
+      ::decode(standby_for_fscid, p);
+    }
   }
 };
 
diff --git a/src/messages/MOSDOp.h b/src/messages/MOSDOp.h
index fc087fc..d04955c 100755
--- a/src/messages/MOSDOp.h
+++ b/src/messages/MOSDOp.h
@@ -19,6 +19,7 @@
 #include "msg/Message.h"
 #include "osd/osd_types.h"
 #include "include/ceph_features.h"
+#include <atomic>
 
 /*
  * OSD op
@@ -48,8 +49,10 @@ private:
   pg_t pgid;
   bufferlist::iterator p;
   // Decoding flags. Decoding is only needed for messages catched by pipe reader.
-  bool partial_decode_needed;
-  bool final_decode_needed;
+  // Transition from true -> false without locks being held
+  // Can never see final_decode_needed == false and partial_decode_needed == true
+  atomic<bool> partial_decode_needed;
+  atomic<bool> final_decode_needed;
   //
 public:
   vector<OSDOp> ops;
@@ -315,7 +318,13 @@ struct ceph_osd_request_head {
 
       ::encode(retry_attempt, payload);
       ::encode(features, payload);
-      ::encode(reqid, payload);
+      if (reqid.name != entity_name_t() || reqid.tid != 0) {
+	::encode(reqid, payload);
+      } else {
+	// don't include client_inc in the reqid for the legacy v6
+	// encoding or else we'll confuse older peers.
+	::encode(osd_reqid_t(), payload);
+      }
     } else {
       // new, reordered, v7 message encoding
       header.version = HEAD_VERSION;
diff --git a/src/mon/MDSMonitor.cc b/src/mon/MDSMonitor.cc
index 92322e0..c5ed1a2 100644
--- a/src/mon/MDSMonitor.cc
+++ b/src/mon/MDSMonitor.cc
@@ -139,18 +139,20 @@ void MDSMonitor::update_from_paxos(bool *need_bootstrap)
   assert(version >= fsmap.epoch);
 
   // read and decode
-  mdsmap_bl.clear();
-  int err = get_version(version, mdsmap_bl);
+  fsmap_bl.clear();
+  int err = get_version(version, fsmap_bl);
   assert(err == 0);
 
-  assert(mdsmap_bl.length() > 0);
+  assert(fsmap_bl.length() > 0);
   dout(10) << __func__ << " got " << version << dendl;
-  fsmap.decode(mdsmap_bl);
+  fsmap.decode(fsmap_bl);
 
   // new map
   dout(4) << "new map" << dendl;
   print_map(fsmap, 0);
-  fsmap.sanity();
+  if (!g_conf->mon_mds_skip_sanity) {
+    fsmap.sanity();
+  }
 
   check_subs();
   update_logger();
@@ -176,7 +178,9 @@ void MDSMonitor::encode_pending(MonitorDBStore::TransactionRef t)
 
   // print map iff 'debug mon = 30' or higher
   print_map(pending_fsmap, 30);
-  pending_fsmap.sanity();
+  if (!g_conf->mon_mds_skip_sanity) {
+    pending_fsmap.sanity();
+  }
 
   // Set 'modified' on maps modified this epoch
   for (auto &i : fsmap.filesystems) {
@@ -187,11 +191,11 @@ void MDSMonitor::encode_pending(MonitorDBStore::TransactionRef t)
 
   // apply to paxos
   assert(get_last_committed() + 1 == pending_fsmap.epoch);
-  bufferlist mdsmap_bl;
-  pending_fsmap.encode(mdsmap_bl, mon->get_quorum_features());
+  bufferlist fsmap_bl;
+  pending_fsmap.encode(fsmap_bl, mon->get_quorum_features());
 
   /* put everything in the transaction */
-  put_version(t, pending_fsmap.epoch, mdsmap_bl);
+  put_version(t, pending_fsmap.epoch, fsmap_bl);
   put_last_committed(t, pending_fsmap.epoch);
 
   // Encode MDSHealth data
@@ -520,6 +524,7 @@ bool MDSMonitor::prepare_beacon(MonOpRequestRef op)
       new_info.state_seq = seq;
       new_info.standby_for_rank = m->get_standby_for_rank();
       new_info.standby_for_name = m->get_standby_for_name();
+      new_info.standby_for_fscid = m->get_standby_for_fscid();
       pending_fsmap.insert(new_info);
     }
 
@@ -536,7 +541,7 @@ bool MDSMonitor::prepare_beacon(MonOpRequestRef op)
         pending_fsmap.modify_daemon(gid, [fscid, leaderinfo, followable](
               MDSMap::mds_info_t *info) {
             info->standby_for_rank = leaderinfo->rank;
-            info->standby_for_ns = fscid;
+            info->standby_for_fscid = fscid;
         });
       }
     }
@@ -607,7 +612,7 @@ bool MDSMonitor::prepare_beacon(MonOpRequestRef op)
           pending_fsmap.modify_daemon(info.global_id,
               [target_info, target_ns, seq](MDSMap::mds_info_t *info) {
             info->standby_for_rank = target_info->rank;
-            info->standby_for_ns = target_ns;
+            info->standby_for_fscid = target_ns;
             info->state = MDSMap::STATE_STANDBY_REPLAY;
             info->state_seq = seq;
           });
@@ -616,14 +621,11 @@ bool MDSMonitor::prepare_beacon(MonOpRequestRef op)
           return false;
         }
       } else if (m->get_standby_for_rank() >= 0) {
-        // TODO get this from MDS message
-        // >>
-        fs_cluster_id_t target_ns = FS_CLUSTER_ID_NONE;
-        // <<
+        fs_cluster_id_t target_ns = m->get_standby_for_fscid();
 
         mds_role_t target_role = {
           target_ns == FS_CLUSTER_ID_NONE ?
-            pending_fsmap.legacy_client_fscid : info.standby_for_ns,
+            pending_fsmap.legacy_client_fscid : info.standby_for_fscid,
           m->get_standby_for_rank()};
 
         if (target_role.fscid != FS_CLUSTER_ID_NONE) {
@@ -632,7 +634,7 @@ bool MDSMonitor::prepare_beacon(MonOpRequestRef op)
             pending_fsmap.modify_daemon(info.global_id,
                 [target_role, seq](MDSMap::mds_info_t *info) {
               info->standby_for_rank = target_role.rank;
-              info->standby_for_ns = target_role.fscid;
+              info->standby_for_fscid = target_role.fscid;
               info->state = MDSMap::STATE_STANDBY_REPLAY;
               info->state_seq = seq;
             });
@@ -2110,7 +2112,6 @@ int MDSMonitor::filesystem_command(
 {
   dout(4) << __func__ << " prefix='" << prefix << "'" << dendl;
   op->mark_mdsmon_event(__func__);
-  MMonCommand *m = static_cast<MMonCommand*>(op->get_req());
   int r = 0;
   string whostr;
   cmd_getval(g_ceph_context, cmdmap, "who", whostr);
@@ -2403,7 +2404,7 @@ void MDSMonitor::check_sub(Subscription *sub)
 
   if (sub->type == "fsmap") {
     if (sub->next <= fsmap.get_epoch()) {
-      sub->session->con->send_message(new MFSMap(mon->monmap->fsid, &fsmap));
+      sub->session->con->send_message(new MFSMap(mon->monmap->fsid, fsmap));
       if (sub->onetime) {
         mon->session_map.remove_sub(sub);
       } else {
@@ -2762,8 +2763,8 @@ bool MDSMonitor::maybe_promote_standby(std::shared_ptr<Filesystem> fs)
         // The mds_info_t may or may not tell us exactly which filesystem
         // the standby_for_rank refers to: lookup via legacy_client_fscid
         mds_role_t target_role = {
-          info.standby_for_ns == FS_CLUSTER_ID_NONE ?
-            pending_fsmap.legacy_client_fscid : info.standby_for_ns,
+          info.standby_for_fscid == FS_CLUSTER_ID_NONE ?
+            pending_fsmap.legacy_client_fscid : info.standby_for_fscid,
           info.standby_for_rank};
 
         // If we managed to resolve a full target role
diff --git a/src/mon/MDSMonitor.h b/src/mon/MDSMonitor.h
index 2f4b193..419deaa 100644
--- a/src/mon/MDSMonitor.h
+++ b/src/mon/MDSMonitor.h
@@ -42,8 +42,8 @@ class FileSystemCommandHandler;
 class MDSMonitor : public PaxosService {
  public:
   // mds maps
-  FSMap fsmap;          // current
-  bufferlist mdsmap_bl;   // encoded
+  FSMap fsmap;           // current
+  bufferlist fsmap_bl;   // encoded
 
   FSMap pending_fsmap;  // current + pending updates
 
diff --git a/src/mon/MonClient.cc b/src/mon/MonClient.cc
index dc1ec6c..f906406 100644
--- a/src/mon/MonClient.cc
+++ b/src/mon/MonClient.cc
@@ -426,6 +426,11 @@ void MonClient::shutdown()
     waiting_for_session.pop_front();
   }
 
+  if (cur_con)
+    cur_con->mark_down();
+  cur_con.reset(NULL);
+  cur_mon.clear();
+
   monc_lock.Unlock();
 
   if (initialized) {
@@ -434,11 +439,6 @@ void MonClient::shutdown()
   monc_lock.Lock();
   timer.shutdown();
 
-  if (cur_con)
-    cur_con->mark_down();
-  cur_con.reset(NULL);
-  cur_mon.clear();
-
   monc_lock.Unlock();
 }
 
@@ -534,6 +534,7 @@ void MonClient::handle_auth(MAuthReply *m)
   if (ret == 0) {
     if (state != MC_STATE_HAVE_SESSION) {
       state = MC_STATE_HAVE_SESSION;
+      last_rotating_renew_sent = utime_t();
       while (!waiting_for_session.empty()) {
 	_send_mon_message(waiting_for_session.front());
 	waiting_for_session.pop_front();
@@ -831,8 +832,11 @@ int MonClient::_check_auth_rotating()
     return 0;
   }
 
-  utime_t cutoff = ceph_clock_now(cct);
+  utime_t now = ceph_clock_now(cct);
+  utime_t cutoff = now;
   cutoff -= MIN(30.0, cct->_conf->auth_service_ticket_ttl / 4.0);
+  utime_t issued_at_lower_bound = now;
+  issued_at_lower_bound -= cct->_conf->auth_service_ticket_ttl;
   if (!rotating_secrets->need_new_secrets(cutoff)) {
     ldout(cct, 10) << "_check_auth_rotating have uptodate secrets (they expire after " << cutoff << ")" << dendl;
     rotating_secrets->dump_rotating();
@@ -840,9 +844,22 @@ int MonClient::_check_auth_rotating()
   }
 
   ldout(cct, 10) << "_check_auth_rotating renewing rotating keys (they expired before " << cutoff << ")" << dendl;
+  if (!rotating_secrets->need_new_secrets() &&
+      rotating_secrets->need_new_secrets(issued_at_lower_bound)) {
+    // the key has expired before it has been issued?
+    lderr(cct) << __func__ << " possible clock skew, rotating keys expired way too early"
+               << " (before " << issued_at_lower_bound << ")" << dendl;
+  }
+  if ((now > last_rotating_renew_sent) &&
+      double(now - last_rotating_renew_sent) < 1) {
+    ldout(cct, 10) << __func__ << " called too often (last: "
+                   << last_rotating_renew_sent << "), skipping refresh" << dendl;
+    return 0;
+  }
   MAuth *m = new MAuth;
   m->protocol = auth->get_protocol();
   if (auth->build_rotating_request(m->auth_payload)) {
+    last_rotating_renew_sent = now;
     _send_mon_message(m);
   } else {
     m->put();
@@ -853,7 +870,8 @@ int MonClient::_check_auth_rotating()
 int MonClient::wait_auth_rotating(double timeout)
 {
   Mutex::Locker l(monc_lock);
-  utime_t until = ceph_clock_now(cct);
+  utime_t now = ceph_clock_now(cct);
+  utime_t until = now;
   until += timeout;
 
   if (auth->get_protocol() == CEPH_AUTH_NONE)
@@ -863,14 +881,14 @@ int MonClient::wait_auth_rotating(double timeout)
     return 0;
 
   while (auth_principal_needs_rotating_keys(entity_name) &&
-	 rotating_secrets->need_new_secrets()) {
-    utime_t now = ceph_clock_now(cct);
+	 rotating_secrets->need_new_secrets(now)) {
     if (now >= until) {
       ldout(cct, 0) << "wait_auth_rotating timed out after " << timeout << dendl;
       return -ETIMEDOUT;
     }
     ldout(cct, 10) << "wait_auth_rotating waiting (until " << until << ")" << dendl;
     auth_cond.WaitUntil(monc_lock, until);
+    now = ceph_clock_now(cct);
   }
   ldout(cct, 10) << "wait_auth_rotating done" << dendl;
   return 0;
diff --git a/src/mon/MonClient.h b/src/mon/MonClient.h
index d98a15b..2efa1ff 100644
--- a/src/mon/MonClient.h
+++ b/src/mon/MonClient.h
@@ -179,6 +179,7 @@ private:
   int authenticate_err;
 
   list<Message*> waiting_for_session;
+  utime_t last_rotating_renew_sent;
   Context *session_established_context;
   bool had_a_connection;
   double reopen_interval_multiplier;
diff --git a/src/mon/MonCommands.h b/src/mon/MonCommands.h
index a003e3a..c7e923f 100644
--- a/src/mon/MonCommands.h
+++ b/src/mon/MonCommands.h
@@ -714,11 +714,11 @@ COMMAND("osd pool rename " \
 	"rename <srcpool> to <destpool>", "osd", "rw", "cli,rest")
 COMMAND("osd pool get " \
 	"name=pool,type=CephPoolname " \
-	"name=var,type=CephChoices,strings=size|min_size|crash_replay_interval|pg_num|pgp_num|crush_ruleset|hashpspool|nodelete|nopgchange|nosizechange|write_fadvise_dontneed|noscrub|nodeep-scrub|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|auid|target_max_objects|target_max_bytes|cache_target_dirty_ratio|cache_target_dirty_high_ratio|cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|erasure_code_profile|min_read_recency_for_promote|all|min_write_recency_for_promote|fast_ [...]
+	"name=var,type=CephChoices,strings=size|min_size|crash_replay_interval|pg_num|pgp_num|crush_ruleset|hashpspool|nodelete|nopgchange|nosizechange|write_fadvise_dontneed|noscrub|nodeep-scrub|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|auid|target_max_objects|target_max_bytes|cache_target_dirty_ratio|cache_target_dirty_high_ratio|cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|erasure_code_profile|min_read_recency_for_promote|all|min_write_recency_for_promote|fast_ [...]
 	"get pool parameter <var>", "osd", "r", "cli,rest")
 COMMAND("osd pool set " \
 	"name=pool,type=CephPoolname " \
-	"name=var,type=CephChoices,strings=size|min_size|crash_replay_interval|pg_num|pgp_num|crush_ruleset|hashpspool|nodelete|nopgchange|nosizechange|write_fadvise_dontneed|noscrub|nodeep-scrub|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|use_gmt_hitset|debug_fake_ec_pool|target_max_bytes|target_max_objects|cache_target_dirty_ratio|cache_target_dirty_high_ratio|cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|auid|min_read_recency_for_promote|min_write_recency_for_prom [...]
+	"name=var,type=CephChoices,strings=size|min_size|crash_replay_interval|pg_num|pgp_num|crush_ruleset|hashpspool|nodelete|nopgchange|nosizechange|write_fadvise_dontneed|noscrub|nodeep-scrub|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|use_gmt_hitset|debug_fake_ec_pool|target_max_bytes|target_max_objects|cache_target_dirty_ratio|cache_target_dirty_high_ratio|cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|auid|min_read_recency_for_promote|min_write_recency_for_prom [...]
 	"name=val,type=CephString " \
 	"name=force,type=CephChoices,strings=--yes-i-really-mean-it,req=false", \
 	"set pool parameter <var> to <val>", "osd", "rw", "cli,rest")
@@ -795,7 +795,8 @@ COMMAND("osd tier rm " \
 	"osd", "rw", "cli,rest")
 COMMAND("osd tier cache-mode " \
 	"name=pool,type=CephPoolname " \
-	"name=mode,type=CephChoices,strings=none|writeback|forward|readonly|readforward|readproxy", \
+	"name=mode,type=CephChoices,strings=none|writeback|forward|readonly|readforward|proxy|readproxy " \
+	"name=sure,type=CephChoices,strings=--yes-i-really-mean-it,req=false", \
 	"specify the caching mode for cache tier <pool>", "osd", "rw", "cli,rest")
 COMMAND("osd tier set-overlay " \
 	"name=pool,type=CephPoolname " \
diff --git a/src/mon/Monitor.cc b/src/mon/Monitor.cc
index 760d80a..9f9bf59 100644
--- a/src/mon/Monitor.cc
+++ b/src/mon/Monitor.cc
@@ -333,7 +333,7 @@ void Monitor::do_admin_command(string command, cmdmap_t& cmdmap, string format,
     elector.stop_participating();
     ss << "stopped responding to quorum, initiated new election";
   } else if (command == "ops") {
-    op_tracker.dump_ops_in_flight(f.get());
+    (void)op_tracker.dump_ops_in_flight(f.get());
     if (f) {
       f->flush(ss);
     }
@@ -2474,7 +2474,7 @@ void Monitor::get_cluster_status(stringstream &ss, Formatter *f)
     ss << "            election epoch " << get_epoch()
        << ", quorum " << get_quorum() << " " << get_quorum_names() << "\n";
     if (mdsmon()->fsmap.any_filesystems()) {
-      ss << "     mdsmap " << mdsmon()->fsmap << "\n";
+      ss << "      fsmap " << mdsmon()->fsmap << "\n";
     }
 
     osdmon()->osdmap.print_summary(NULL, ss);
diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc
index 4051132..8114154 100644
--- a/src/mon/OSDMonitor.cc
+++ b/src/mon/OSDMonitor.cc
@@ -3022,7 +3022,7 @@ namespace {
     MIN_WRITE_RECENCY_FOR_PROMOTE, FAST_READ,
     HIT_SET_GRADE_DECAY_RATE, HIT_SET_SEARCH_LAST_N,
     SCRUB_MIN_INTERVAL, SCRUB_MAX_INTERVAL, DEEP_SCRUB_INTERVAL,
-    RECOVERY_PRIORITY, RECOVERY_OP_PRIORITY};
+    RECOVERY_PRIORITY, RECOVERY_OP_PRIORITY, SCRUB_PRIORITY};
 
   std::set<osd_pool_get_choices>
     subtract_second_from_first(const std::set<osd_pool_get_choices>& first,
@@ -3512,7 +3512,8 @@ bool OSDMonitor::preprocess_command(MonOpRequestRef op)
       ("scrub_max_interval", SCRUB_MAX_INTERVAL)
       ("deep_scrub_interval", DEEP_SCRUB_INTERVAL)
       ("recovery_priority", RECOVERY_PRIORITY)
-      ("recovery_op_priority", RECOVERY_OP_PRIORITY);
+      ("recovery_op_priority", RECOVERY_OP_PRIORITY)
+      ("scrub_priority", SCRUB_PRIORITY);
 
     typedef std::set<osd_pool_get_choices> choices_set_t;
 
@@ -3696,6 +3697,7 @@ bool OSDMonitor::preprocess_command(MonOpRequestRef op)
 	  case DEEP_SCRUB_INTERVAL:
           case RECOVERY_PRIORITY:
           case RECOVERY_OP_PRIORITY:
+          case SCRUB_PRIORITY:
 	    for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
 	      if (i->second == *it)
 		break;
@@ -3829,6 +3831,7 @@ bool OSDMonitor::preprocess_command(MonOpRequestRef op)
 	  case DEEP_SCRUB_INTERVAL:
           case RECOVERY_PRIORITY:
           case RECOVERY_OP_PRIORITY:
+          case SCRUB_PRIORITY:
 	    for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
 	      if (i->second == *it)
 		break;
@@ -7296,6 +7299,19 @@ done:
       goto reply;
     }
 
+    string sure;
+    cmd_getval(g_ceph_context, cmdmap, "sure", sure);
+    if ((mode != pg_pool_t::CACHEMODE_WRITEBACK &&
+	 mode != pg_pool_t::CACHEMODE_NONE &&
+	 mode != pg_pool_t::CACHEMODE_PROXY &&
+	 mode != pg_pool_t::CACHEMODE_READPROXY) &&
+	sure != "--yes-i-really-mean-it") {
+      ss << "'" << modestr << "' is not a well-supported cache mode and may "
+	 << "corrupt your data.  pass --yes-i-really-mean-it to force.";
+      err = -EPERM;
+      goto reply;
+    }
+
     // pool already has this cache-mode set and there are no pending changes
     if (p->cache_mode == mode &&
 	(pending_inc.new_pools.count(pool_id) == 0 ||
@@ -7313,15 +7329,17 @@ done:
      *  writeback:  Cache writes, promote reads from base pool
      *  readonly:   Forward writes to base pool
      *  readforward: Writes are in writeback mode, Reads are in forward mode
+     *  proxy:       Proxy all reads and writes to base pool
      *  readproxy:   Writes are in writeback mode, Reads are in proxy mode
      *
      * Hence, these are the allowed transitions:
      *
      *  none -> any
-     *  forward -> readforward || readproxy || writeback || any IF num_objects_dirty == 0
-     *  readforward -> forward || readproxy || writeback || any IF num_objects_dirty == 0
-     *  readproxy -> forward || readforward || writeback || any IF num_objects_dirty == 0
-     *  writeback -> readforward || readproxy || forward
+     *  forward -> proxy || readforward || readproxy || writeback || any IF num_objects_dirty == 0
+     *  proxy -> forward || readforward || readproxy || writeback || any IF num_objects_dirty == 0
+     *  readforward -> forward || proxy || readproxy || writeback || any IF num_objects_dirty == 0
+     *  readproxy -> forward || proxy || readforward || writeback || any IF num_objects_dirty == 0
+     *  writeback -> readforward || readproxy || forward || proxy
      *  readonly -> any
      */
 
@@ -7331,6 +7349,7 @@ done:
 
     if (p->cache_mode == pg_pool_t::CACHEMODE_WRITEBACK &&
         (mode != pg_pool_t::CACHEMODE_FORWARD &&
+	  mode != pg_pool_t::CACHEMODE_PROXY &&
 	  mode != pg_pool_t::CACHEMODE_READFORWARD &&
 	  mode != pg_pool_t::CACHEMODE_READPROXY)) {
       ss << "unable to set cache-mode '" << pg_pool_t::get_cache_mode_name(mode)
@@ -7338,6 +7357,8 @@ done:
          << "' pool; only '"
          << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_FORWARD)
 	 << "','"
+         << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_PROXY)
+	 << "','"
          << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_READFORWARD)
 	 << "','"
          << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_READPROXY)
@@ -7348,16 +7369,25 @@ done:
     if ((p->cache_mode == pg_pool_t::CACHEMODE_READFORWARD &&
         (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
 	  mode != pg_pool_t::CACHEMODE_FORWARD &&
+	  mode != pg_pool_t::CACHEMODE_PROXY &&
 	  mode != pg_pool_t::CACHEMODE_READPROXY)) ||
 
         (p->cache_mode == pg_pool_t::CACHEMODE_READPROXY &&
         (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
 	  mode != pg_pool_t::CACHEMODE_FORWARD &&
-	  mode != pg_pool_t::CACHEMODE_READFORWARD)) ||
+	  mode != pg_pool_t::CACHEMODE_READFORWARD &&
+	  mode != pg_pool_t::CACHEMODE_PROXY)) ||
+
+        (p->cache_mode == pg_pool_t::CACHEMODE_PROXY &&
+        (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
+	  mode != pg_pool_t::CACHEMODE_FORWARD &&
+	  mode != pg_pool_t::CACHEMODE_READFORWARD &&
+	  mode != pg_pool_t::CACHEMODE_READPROXY)) ||
 
         (p->cache_mode == pg_pool_t::CACHEMODE_FORWARD &&
         (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
 	  mode != pg_pool_t::CACHEMODE_READFORWARD &&
+	  mode != pg_pool_t::CACHEMODE_PROXY &&
 	  mode != pg_pool_t::CACHEMODE_READPROXY))) {
 
       const pool_stat_t& tier_stats =
diff --git a/src/mon/PGMap.cc b/src/mon/PGMap.cc
index 8116201..fbef9ba 100644
--- a/src/mon/PGMap.cc
+++ b/src/mon/PGMap.cc
@@ -468,7 +468,8 @@ void PGMap::stat_pg_add(const pg_t &pgid, const pg_stat_t &s,
   num_pg++;
   num_pg_by_state[s.state]++;
 
-  if (s.state & PG_STATE_CREATING) {
+  if ((s.state & PG_STATE_CREATING) &&
+      s.parent_split_bits == 0) {
     creating_pgs.insert(pgid);
     if (s.acting_primary >= 0) {
       creating_pgs_by_osd_epoch[s.acting_primary][s.mapping_epoch].insert(pgid);
@@ -505,7 +506,8 @@ void PGMap::stat_pg_sub(const pg_t &pgid, const pg_stat_t &s,
   if (end == 0)
     num_pg_by_state.erase(s.state);
 
-  if (s.state & PG_STATE_CREATING) {
+  if ((s.state & PG_STATE_CREATING) &&
+      s.parent_split_bits == 0) {
     creating_pgs.erase(pgid);
     if (s.acting_primary >= 0) {
       map<epoch_t,set<pg_t> >& r = creating_pgs_by_osd_epoch[s.acting_primary];
diff --git a/src/mon/PGMonitor.cc b/src/mon/PGMonitor.cc
index eadba02..8b4c44b 100644
--- a/src/mon/PGMonitor.cc
+++ b/src/mon/PGMonitor.cc
@@ -125,18 +125,6 @@ void PGMonitor::tick()
 
   handle_osd_timeouts();
 
-  if (mon->is_leader()) {
-    bool propose = false;
-
-    if ((need_check_down_pgs || !need_check_down_pg_osds.empty()) &&
-        check_down_pgs())
-      propose = true;
-
-    if (propose) {
-      propose_pending();
-    }
-  }
-
   if (!pg_map.pg_sum_deltas.empty()) {
     utime_t age = ceph_clock_now(g_ceph_context) - pg_map.stamp;
     if (age > 2 * g_conf->mon_delta_reset_interval) {
@@ -1171,27 +1159,35 @@ bool PGMonitor::map_pg_creates()
         up_primary != s->up_primary ||
         acting !=  s->acting ||
         acting_primary != s->acting_primary) {
-      dout(20) << __func__ << "  " << pgid << " "
-               << " acting_primary: " << s->acting_primary
-               << " -> " << acting_primary
-               << " acting: " << s->acting << " -> " << acting
-               << " up_primary: " << s->up_primary << " -> " << up_primary
-               << " up: " << s->up << " -> " << up
-               << dendl;
-
       pg_stat_t *ns = &pending_inc.pg_stat_updates[pgid];
-      *ns = *s;
-
-      // note epoch if the target of the create message changed
-      if (acting_primary != ns->acting_primary)
-	ns->mapping_epoch = osdmap->get_epoch();
-
-      ns->up = up;
-      ns->up_primary = up_primary;
-      ns->acting = acting;
-      ns->acting_primary = acting_primary;
-
-      ++changed;
+      if (osdmap->get_epoch() > ns->reported_epoch) {
+	dout(20) << __func__ << "  " << pgid << " "
+		 << " acting_primary: " << s->acting_primary
+		 << " -> " << acting_primary
+		 << " acting: " << s->acting << " -> " << acting
+		 << " up_primary: " << s->up_primary << " -> " << up_primary
+		 << " up: " << s->up << " -> " << up
+		 << dendl;
+
+	// only initialize if it wasn't already a pending update
+	if (ns->reported_epoch == 0)
+	  *ns = *s;
+
+	// note epoch if the target of the create message changed
+	if (acting_primary != ns->acting_primary)
+	  ns->mapping_epoch = osdmap->get_epoch();
+
+	ns->up = up;
+	ns->up_primary = up_primary;
+	ns->acting = acting;
+	ns->acting_primary = acting_primary;
+
+	++changed;
+      } else {
+	dout(20) << __func__ << "  " << pgid << " has pending update from newer"
+		 << " epoch " << ns->reported_epoch
+		 << dendl;
+      }
     }
   }
   if (changed) {
@@ -1256,11 +1252,12 @@ epoch_t PGMonitor::send_pg_creates(int osd, Connection *con, epoch_t next)
              << dendl;
     last = q->first;
     for (set<pg_t>::iterator r = q->second.begin(); r != q->second.end(); ++r) {
+      pg_stat_t &st = pg_map.pg_stat[*r];
       if (!m)
 	m = new MOSDPGCreate(pg_map.last_osdmap_epoch);
-      m->mkpg[*r] = pg_create_t(pg_map.pg_stat[*r].created,
-                                pg_map.pg_stat[*r].parent,
-                                pg_map.pg_stat[*r].parent_split_bits);
+      m->mkpg[*r] = pg_create_t(st.created,
+                                st.parent,
+                                st.parent_split_bits);
       // Need the create time from the monitor using its clock to set
       // last_scrub_stamp upon pg creation.
       m->ctimes[*r] = pg_map.pg_stat[*r].last_scrub_stamp;
@@ -1285,7 +1282,7 @@ epoch_t PGMonitor::send_pg_creates(int osd, Connection *con, epoch_t next)
 }
 
 void PGMonitor::_try_mark_pg_stale(
-  OSDMap *osdmap,
+  const OSDMap *osdmap,
   pg_t pgid,
   const pg_stat_t& cur_stat)
 {
@@ -1310,9 +1307,19 @@ void PGMonitor::_try_mark_pg_stale(
 
 bool PGMonitor::check_down_pgs()
 {
-  dout(10) << "check_down_pgs" << dendl;
+  dout(10) << "check_down_pgs last_osdmap_epoch "
+	   << pg_map.last_osdmap_epoch << dendl;
+  if (pg_map.last_osdmap_epoch == 0)
+    return false;
+
+  // use the OSDMap that matches the one pg_map has consumed.
+  std::unique_ptr<OSDMap> osdmap;
+  bufferlist bl;
+  int err = mon->osdmon()->get_version_full(pg_map.last_osdmap_epoch, bl);
+  assert(err == 0);
+  osdmap.reset(new OSDMap);
+  osdmap->decode(bl);
 
-  OSDMap *osdmap = &mon->osdmon()->osdmap;
   bool ret = false;
 
   // if a large number of osds changed state, just iterate over the whole
@@ -1326,7 +1333,7 @@ bool PGMonitor::check_down_pgs()
       if ((p.second.state & PG_STATE_STALE) == 0 &&
           p.second.acting_primary != -1 &&
           osdmap->is_down(p.second.acting_primary)) {
-	_try_mark_pg_stale(osdmap, p.first, p.second);
+	_try_mark_pg_stale(osdmap.get(), p.first, p.second);
 	ret = true;
       }
     }
@@ -1335,9 +1342,9 @@ bool PGMonitor::check_down_pgs()
       if (osdmap->is_down(osd)) {
 	for (auto pgid : pg_map.pg_by_osd[osd]) {
 	  const pg_stat_t &stat = pg_map.pg_stat[pgid];
-	  if ((stat.state & PG_STATE_STALE) == 0 &&
-	      stat.acting_primary != -1) {
-	    _try_mark_pg_stale(osdmap, pgid, stat);
+	  assert(stat.acting_primary == osd);
+	  if ((stat.state & PG_STATE_STALE) == 0) {
+	    _try_mark_pg_stale(osdmap.get(), pgid, stat);
 	    ret = true;
 	  }
 	}
@@ -1362,7 +1369,7 @@ inline string percentify(const float& a) {
 //void PGMonitor::dump_object_stat_sum(stringstream& ss, Formatter *f,
 void PGMonitor::dump_object_stat_sum(TextTable &tbl, Formatter *f,
                                      object_stat_sum_t &sum, uint64_t avail,
-                                     float raw_used_rate, bool verbose) const
+                                     float raw_used_rate, bool verbose, const pg_pool_t *pool) const
 {
   float curr_object_copies_rate = 0.0;
   if (sum.num_object_copies > 0)
@@ -1374,6 +1381,8 @@ void PGMonitor::dump_object_stat_sum(TextTable &tbl, Formatter *f,
     f->dump_unsigned("max_avail", avail);
     f->dump_int("objects", sum.num_objects);
     if (verbose) {
+      f->dump_int("quota_objects", pool->quota_max_objects);
+      f->dump_int("quota_bytes", pool->quota_max_bytes);
       f->dump_int("dirty", sum.num_objects_dirty);
       f->dump_int("rd", sum.num_rd);
       f->dump_int("rd_bytes", sum.num_rd_kb * 1024ull);
@@ -1441,8 +1450,12 @@ void PGMonitor::dump_pool_stats(stringstream &ss, Formatter *f, bool verbose)
   } else {
     tbl.define_column("NAME", TextTable::LEFT, TextTable::LEFT);
     tbl.define_column("ID", TextTable::LEFT, TextTable::LEFT);
-    if (verbose)
+    if (verbose) {
       tbl.define_column("CATEGORY", TextTable::LEFT, TextTable::LEFT);
+      tbl.define_column("QUOTA OBJECTS", TextTable::LEFT, TextTable::LEFT);
+      tbl.define_column("QUOTA BYTES", TextTable::LEFT, TextTable::LEFT);
+    }
+
     tbl.define_column("USED", TextTable::LEFT, TextTable::RIGHT);
     tbl.define_column("%USED", TextTable::LEFT, TextTable::RIGHT);
     tbl.define_column("MAX AVAIL", TextTable::LEFT, TextTable::RIGHT);
@@ -1512,10 +1525,22 @@ void PGMonitor::dump_pool_stats(stringstream &ss, Formatter *f, bool verbose)
     } else {
       tbl << pool_name
           << pool_id;
-      if (verbose)
+      if (verbose) {
 	tbl << "-";
+
+        if (pool->quota_max_objects == 0)
+          tbl << "N/A";
+        else
+          tbl << si_t(pool->quota_max_objects);
+
+        if (pool->quota_max_bytes == 0)
+          tbl << "N/A";
+        else
+          tbl << si_t(pool->quota_max_bytes);
+      }
+
     }
-    dump_object_stat_sum(tbl, f, stat.stats.sum, avail, raw_used_rate, verbose);
+    dump_object_stat_sum(tbl, f, stat.stats.sum, avail, raw_used_rate, verbose, pool);
     if (f)
       f->close_section();  // stats
     else
diff --git a/src/mon/PGMonitor.h b/src/mon/PGMonitor.h
index d3351f3..e4081f1 100644
--- a/src/mon/PGMonitor.h
+++ b/src/mon/PGMonitor.h
@@ -141,7 +141,8 @@ private:
    * @return true if we updated pending_inc (and should propose)
    */
   bool check_down_pgs();
-  void _try_mark_pg_stale(OSDMap *osdmap, pg_t pgid, const pg_stat_t& cur_stat);
+  void _try_mark_pg_stale(const OSDMap *osdmap, pg_t pgid,
+			  const pg_stat_t& cur_stat);
 
 
   /**
@@ -157,7 +158,7 @@ private:
 			    object_stat_sum_t &sum,
 			    uint64_t avail,
 			    float raw_used_rate,
-			    bool verbose) const;
+			    bool verbose, const pg_pool_t *pool) const;
 
   int64_t get_rule_avail(OSDMap& osdmap, int ruleno) const;
 
diff --git a/src/mon/Paxos.cc b/src/mon/Paxos.cc
index 52b0bda..5835579 100644
--- a/src/mon/Paxos.cc
+++ b/src/mon/Paxos.cc
@@ -425,26 +425,9 @@ bool Paxos::store_state(MMonPaxos *m)
     changed = true;
   }
 
-  remove_legacy_versions();
-
   return changed;
 }
 
-void Paxos::remove_legacy_versions()
-{
-  if (get_store()->exists(get_name(), "conversion_first")) {
-    MonitorDBStore::TransactionRef t(new MonitorDBStore::Transaction);
-    version_t v = get_store()->get(get_name(), "conversion_first");
-    dout(10) << __func__ << " removing pre-conversion paxos states from " << v
-	     << " until " << first_committed << dendl;
-    for (; v < first_committed; ++v) {
-      t->erase(get_name(), v);
-    }
-    t->erase(get_name(), "conversion_first");
-    get_store()->apply_transaction(t);
-  }
-}
-
 void Paxos::_sanity_check_store()
 {
   version_t lc = get_store()->get(get_name(), "last_committed");
@@ -901,8 +884,6 @@ void Paxos::commit_finish()
   // get ready for a new round.
   new_value.clear();
 
-  remove_legacy_versions();
-
   // WRITING -> REFRESH
   // among other things, this lets do_refresh() -> mon->bootstrap() know
   // it doesn't need to flush the store queue
diff --git a/src/mon/Paxos.h b/src/mon/Paxos.h
index 052cd76..3f2a633 100644
--- a/src/mon/Paxos.h
+++ b/src/mon/Paxos.h
@@ -1188,11 +1188,6 @@ public:
   void _sanity_check_store();
 
   /**
-   * remove legacy paxos versions from before conversion
-   */
-  void remove_legacy_versions();
-
-  /**
    * Helper function to decode a bufferlist into a transaction and append it
    * to another transaction.
    *
diff --git a/src/mon/PaxosService.cc b/src/mon/PaxosService.cc
index ab169b8..cde3faa 100644
--- a/src/mon/PaxosService.cc
+++ b/src/mon/PaxosService.cc
@@ -144,26 +144,6 @@ void PaxosService::post_refresh()
   }
 }
 
-void PaxosService::remove_legacy_versions()
-{
-  dout(10) << __func__ << dendl;
-  if (!mon->store->exists(get_service_name(), "conversion_first"))
-    return;
-
-  version_t cf = mon->store->get(get_service_name(), "conversion_first");
-  version_t fc = get_first_committed();
-
-  dout(10) << __func__ << " conversion_first " << cf
-	   << " first committed " << fc << dendl;
-
-  MonitorDBStore::TransactionRef t(new MonitorDBStore::Transaction);
-  if (cf < fc) {
-    trim(t, cf, fc);
-  }
-  t->erase(get_service_name(), "conversion_first");
-  mon->store->apply_transaction(t);
-}
-
 bool PaxosService::should_propose(double& delay)
 {
   // simple default policy: quick startup, then some damping.
@@ -278,8 +258,6 @@ void PaxosService::_active()
   }
   dout(10) << "_active" << dendl;
 
-  remove_legacy_versions();
-
   // create pending state?
   if (mon->is_leader() && is_active()) {
     dout(7) << "_active creating new pending" << dendl;
diff --git a/src/mon/PaxosService.h b/src/mon/PaxosService.h
index 77c6225..2d2cbe7 100644
--- a/src/mon/PaxosService.h
+++ b/src/mon/PaxosService.h
@@ -257,11 +257,6 @@ private:
    *	   active
    */
   void _active();
-  /**
-   * Scrub our versions after we convert the store from the old layout to
-   * the new k/v store.
-   */
-  void remove_legacy_versions();
 
 public:
   /**
@@ -343,8 +338,6 @@ public:
   /**
    * Query the Paxos system for the latest state and apply it if it's newer
    * than the current Monitor state.
-   *
-   * @returns 'true' on success; 'false' otherwise.
    */
   virtual void update_from_paxos(bool *need_bootstrap) = 0;
 
diff --git a/src/msg/Messenger.cc b/src/msg/Messenger.cc
index 0fec5f3..98cc86f 100644
--- a/src/msg/Messenger.cc
+++ b/src/msg/Messenger.cc
@@ -31,8 +31,7 @@ Messenger *Messenger::create(CephContext *cct, const string &type,
   }
   if (r == 0 || type == "simple")
     return new SimpleMessenger(cct, name, lname, nonce, features);
-  else if ((r == 1 || type == "async") &&
-	   cct->check_experimental_feature_enabled("ms-type-async"))
+  else if (r == 1 || type == "async")
     return new AsyncMessenger(cct, name, lname, nonce, features);
 #ifdef HAVE_XIO
   else if ((type == "xio") &&
diff --git a/src/msg/async/AsyncConnection.cc b/src/msg/async/AsyncConnection.cc
index 561d9f5..611e498 100644
--- a/src/msg/async/AsyncConnection.cc
+++ b/src/msg/async/AsyncConnection.cc
@@ -669,9 +669,9 @@ void AsyncConnection::process()
                                        << policy.throttler_messages->get_current() << "/"
                                        << policy.throttler_messages->get_max() << dendl;
             if (!policy.throttler_messages->get_or_fail()) {
-              ldout(async_msgr->cct, 1) << __func__ << " wants 1 message from policy throttle "
-                                        << policy.throttler_messages->get_current() << "/"
-                                        << policy.throttler_messages->get_max() << " failed, just wait." << dendl;
+              ldout(async_msgr->cct, 10) << __func__ << " wants 1 message from policy throttle "
+					 << policy.throttler_messages->get_current() << "/"
+					 << policy.throttler_messages->get_max() << " failed, just wait." << dendl;
               // following thread pool deal with th full message queue isn't a
               // short time, so we can wait a ms.
               if (register_time_events.empty())
diff --git a/src/msg/xio/XioMessenger.cc b/src/msg/xio/XioMessenger.cc
index b320867..e3afde0 100644
--- a/src/msg/xio/XioMessenger.cc
+++ b/src/msg/xio/XioMessenger.cc
@@ -248,11 +248,122 @@ static string xio_uri_from_entity(const string &type,
   return xio_uri;
 } /* xio_uri_from_entity */
 
+void XioInit::package_init(CephContext *cct) {
+   if (! initialized.read()) {
+
+     mtx.Lock();
+     if (! initialized.read()) {
+
+       xio_init();
+
+       // claim a reference to the first context we see
+       xio_log::context = cct->get();
+
+       int xopt;
+       xopt = xio_log::get_level();
+       xio_set_opt(NULL, XIO_OPTLEVEL_ACCELIO, XIO_OPTNAME_LOG_LEVEL,
+ 		  &xopt, sizeof(xopt));
+       xio_set_opt(NULL, XIO_OPTLEVEL_ACCELIO, XIO_OPTNAME_LOG_FN,
+ 		  (const void*)xio_log::log_dout, sizeof(xio_log_fn));
+
+       xopt = 1;
+       xio_set_opt(NULL, XIO_OPTLEVEL_ACCELIO, XIO_OPTNAME_DISABLE_HUGETBL,
+ 		  &xopt, sizeof(xopt));
+
+       if (g_code_env == CODE_ENVIRONMENT_DAEMON) {
+         xopt = 1;
+         xio_set_opt(NULL, XIO_OPTLEVEL_RDMA, XIO_OPTNAME_ENABLE_FORK_INIT,
+ 		    &xopt, sizeof(xopt));
+       }
+
+       xopt = XIO_MSGR_IOVLEN;
+       xio_set_opt(NULL, XIO_OPTLEVEL_ACCELIO, XIO_OPTNAME_MAX_IN_IOVLEN,
+ 		  &xopt, sizeof(xopt));
+       xio_set_opt(NULL, XIO_OPTLEVEL_ACCELIO, XIO_OPTNAME_MAX_OUT_IOVLEN,
+ 		  &xopt, sizeof(xopt));
+
+       /* enable flow-control */
+       xopt = 1;
+       xio_set_opt(NULL, XIO_OPTLEVEL_ACCELIO, XIO_OPTNAME_ENABLE_FLOW_CONTROL,
+                  &xopt, sizeof(xopt));
+
+       /* and set threshold for buffer callouts */
+       xopt = max(cct->_conf->xio_max_send_inline, 512);
+       xio_set_opt(NULL, XIO_OPTLEVEL_ACCELIO, XIO_OPTNAME_MAX_INLINE_XIO_DATA,
+                  &xopt, sizeof(xopt));
+       xopt = 216;
+       xio_set_opt(NULL, XIO_OPTLEVEL_ACCELIO, XIO_OPTNAME_MAX_INLINE_XIO_HEADER,
+                  &xopt, sizeof(xopt));
+
+       size_t queue_depth = cct->_conf->xio_queue_depth;
+       struct xio_mempool_config mempool_config = {
+         6,
+         {
+           {1024,  0,  queue_depth,  262144},
+           {4096,  0,  queue_depth,  262144},
+           {16384, 0,  queue_depth,  262144},
+           {65536, 0,  128,  65536},
+           {262144, 0,  32,  16384},
+           {1048576, 0, 8,  8192}
+         }
+       };
+       xio_set_opt(NULL,
+                   XIO_OPTLEVEL_ACCELIO, XIO_OPTNAME_CONFIG_MEMPOOL,
+                   &mempool_config, sizeof(mempool_config));
+
+       /* and unregisterd one */
+ #define XMSG_MEMPOOL_QUANTUM 4096
+
+       xio_msgr_noreg_mpool =
+ 	xio_mempool_create(-1 /* nodeid */,
+ 			   XIO_MEMPOOL_FLAG_REGULAR_PAGES_ALLOC);
+
+       (void) xio_mempool_add_slab(xio_msgr_noreg_mpool, 64,
+ 				       cct->_conf->xio_mp_min,
+ 				       cct->_conf->xio_mp_max_64,
+ 				       XMSG_MEMPOOL_QUANTUM, 0);
+       (void) xio_mempool_add_slab(xio_msgr_noreg_mpool, 256,
+ 				       cct->_conf->xio_mp_min,
+ 				       cct->_conf->xio_mp_max_256,
+ 				       XMSG_MEMPOOL_QUANTUM, 0);
+       (void) xio_mempool_add_slab(xio_msgr_noreg_mpool, 1024,
+ 				       cct->_conf->xio_mp_min,
+ 				       cct->_conf->xio_mp_max_1k,
+ 				       XMSG_MEMPOOL_QUANTUM, 0);
+       (void) xio_mempool_add_slab(xio_msgr_noreg_mpool, getpagesize(),
+ 				       cct->_conf->xio_mp_min,
+ 				       cct->_conf->xio_mp_max_page,
+ 				       XMSG_MEMPOOL_QUANTUM, 0);
+
+       /* initialize ops singleton */
+       xio_msgr_ops.on_session_event = on_session_event;
+       xio_msgr_ops.on_new_session = on_new_session;
+       xio_msgr_ops.on_session_established = NULL;
+       xio_msgr_ops.on_msg = on_msg;
+       xio_msgr_ops.on_ow_msg_send_complete = on_ow_msg_send_complete;
+       xio_msgr_ops.on_msg_error = on_msg_error;
+       xio_msgr_ops.on_cancel = on_cancel;
+       xio_msgr_ops.on_cancel_request = on_cancel_request;
+
+       /* mark initialized */
+       initialized.set(1);
+     }
+     mtx.Unlock();
+   }
+ }
+
 /* XioMessenger */
+#undef dout_prefix
+#define dout_prefix _prefix(_dout, this)
+static ostream& _prefix(std::ostream *_dout, XioMessenger *msgr) {
+  return *_dout << "-- " << msgr->get_myaddr() << " ";
+}
+
 XioMessenger::XioMessenger(CephContext *cct, entity_name_t name,
 			   string mname, uint64_t _nonce, uint64_t features,
 			   DispatchStrategy *ds)
   : SimplePolicyMessenger(cct, name, mname, _nonce),
+    XioInit(cct),
     nsessions(0),
     shutdown_called(false),
     portals(this, cct->_conf->xio_portal_threads),
@@ -272,109 +383,6 @@ XioMessenger::XioMessenger(CephContext *cct, entity_name_t name,
   XioPool::trace_mempool = (cct->_conf->xio_trace_mempool);
   XioPool::trace_msgcnt = (cct->_conf->xio_trace_msgcnt);
 
-  /* package init */
-  if (! initialized.read()) {
-
-    mtx.Lock();
-    if (! initialized.read()) {
-
-      xio_init();
-
-      // claim a reference to the first context we see
-      xio_log::context = cct->get();
-
-      int xopt;
-      xopt = xio_log::get_level();
-      xio_set_opt(NULL, XIO_OPTLEVEL_ACCELIO, XIO_OPTNAME_LOG_LEVEL,
-		  &xopt, sizeof(xopt));
-      xio_set_opt(NULL, XIO_OPTLEVEL_ACCELIO, XIO_OPTNAME_LOG_FN,
-		  (const void*)xio_log::log_dout, sizeof(xio_log_fn));
-
-      xopt = 1;
-      xio_set_opt(NULL, XIO_OPTLEVEL_ACCELIO, XIO_OPTNAME_DISABLE_HUGETBL,
-		  &xopt, sizeof(xopt));
-
-      if (g_code_env == CODE_ENVIRONMENT_DAEMON) {
-        xopt = 1;
-        xio_set_opt(NULL, XIO_OPTLEVEL_RDMA, XIO_OPTNAME_ENABLE_FORK_INIT,
-		    &xopt, sizeof(xopt));
-      }
-
-      xopt = XIO_MSGR_IOVLEN;
-      xio_set_opt(NULL, XIO_OPTLEVEL_ACCELIO, XIO_OPTNAME_MAX_IN_IOVLEN,
-		  &xopt, sizeof(xopt));
-      xio_set_opt(NULL, XIO_OPTLEVEL_ACCELIO, XIO_OPTNAME_MAX_OUT_IOVLEN,
-		  &xopt, sizeof(xopt));
-
-      /* enable flow-control */
-      xopt = 1;
-      xio_set_opt(NULL, XIO_OPTLEVEL_ACCELIO, XIO_OPTNAME_ENABLE_FLOW_CONTROL,
-                 &xopt, sizeof(xopt));
-
-      /* and set threshold for buffer callouts */
-      xopt = max(cct->_conf->xio_max_send_inline, 512);
-      xio_set_opt(NULL, XIO_OPTLEVEL_ACCELIO, XIO_OPTNAME_MAX_INLINE_XIO_DATA,
-                 &xopt, sizeof(xopt));
-      xopt = 216;
-      xio_set_opt(NULL, XIO_OPTLEVEL_ACCELIO, XIO_OPTNAME_MAX_INLINE_XIO_HEADER,
-                 &xopt, sizeof(xopt));
-
-      size_t queue_depth = cct->_conf->xio_queue_depth;
-      struct xio_mempool_config mempool_config = {
-        6,
-        {
-          {1024,  0,  queue_depth,  262144},
-          {4096,  0,  queue_depth,  262144},
-          {16384, 0,  queue_depth,  262144},
-          {65536, 0,  128,  65536},
-          {262144, 0,  32,  16384},
-          {1048576, 0, 8,  8192}
-        }
-      };
-      xio_set_opt(NULL,
-                  XIO_OPTLEVEL_ACCELIO, XIO_OPTNAME_CONFIG_MEMPOOL,
-                  &mempool_config, sizeof(mempool_config));
-
-      /* and unregisterd one */
-#define XMSG_MEMPOOL_QUANTUM 4096
-
-      xio_msgr_noreg_mpool =
-	xio_mempool_create(-1 /* nodeid */,
-			   XIO_MEMPOOL_FLAG_REGULAR_PAGES_ALLOC);
-
-      (void) xio_mempool_add_slab(xio_msgr_noreg_mpool, 64,
-				       cct->_conf->xio_mp_min,
-				       cct->_conf->xio_mp_max_64,
-				       XMSG_MEMPOOL_QUANTUM, 0);
-      (void) xio_mempool_add_slab(xio_msgr_noreg_mpool, 256,
-				       cct->_conf->xio_mp_min,
-				       cct->_conf->xio_mp_max_256,
-				       XMSG_MEMPOOL_QUANTUM, 0);
-      (void) xio_mempool_add_slab(xio_msgr_noreg_mpool, 1024,
-				       cct->_conf->xio_mp_min,
-				       cct->_conf->xio_mp_max_1k,
-				       XMSG_MEMPOOL_QUANTUM, 0);
-      (void) xio_mempool_add_slab(xio_msgr_noreg_mpool, getpagesize(),
-				       cct->_conf->xio_mp_min,
-				       cct->_conf->xio_mp_max_page,
-				       XMSG_MEMPOOL_QUANTUM, 0);
-
-      /* initialize ops singleton */
-      xio_msgr_ops.on_session_event = on_session_event;
-      xio_msgr_ops.on_new_session = on_new_session;
-      xio_msgr_ops.on_session_established = NULL;
-      xio_msgr_ops.on_msg = on_msg;
-      xio_msgr_ops.on_ow_msg_send_complete = on_ow_msg_send_complete;
-      xio_msgr_ops.on_msg_error = on_msg_error;
-      xio_msgr_ops.on_cancel = on_cancel;
-      xio_msgr_ops.on_cancel_request = on_cancel_request;
-
-      /* mark initialized */
-      initialized.set(1);
-    }
-    mtx.Unlock();
-  }
-
   dispatch_strategy->set_messenger(this);
 
   /* update class instance count */
@@ -961,15 +969,11 @@ ConnectionRef XioMessenger::get_connection(const entity_inst_t& dest)
       << xio_uri << dendl;
 
     /* XXX client session creation parameters */
-    struct xio_session_params params = {
-      .type		= XIO_SESSION_CLIENT,
-      .initial_sn	= 0,
-      .ses_ops		= &xio_msgr_ops,
-      .user_context	= this,
-      .private_data	= NULL,
-      .private_data_len = 0,
-      .uri		= (char *)xio_uri.c_str()
-    };
+    struct xio_session_params params = {};
+    params.type         = XIO_SESSION_CLIENT;
+    params.ses_ops      = &xio_msgr_ops;
+    params.user_context = this;
+    params.uri          = xio_uri.c_str();
 
     XioConnection *xcon = new XioConnection(this, XioConnection::ACTIVE,
 					    dest);
@@ -982,16 +986,10 @@ ConnectionRef XioMessenger::get_connection(const entity_inst_t& dest)
 
     /* this should cause callbacks with user context of conn, but
      * we can always set it explicitly */
-    struct xio_connection_params xcp = {
-      .session           = xcon->session,
-      .ctx               = this->portals.get_portal0()->ctx,
-      .conn_idx          = 0, /* XXX auto_count */
-      .enable_tos        = 0,
-      .tos               = 0,
-      .pad               = 0,
-      .out_addr          = NULL,
-      .conn_user_context = xcon
-    };
+    struct xio_connection_params xcp = {};
+    xcp.session           = xcon->session;
+    xcp.ctx               = this->portals.get_portal0()->ctx;
+    xcp.conn_user_context = xcon;
 
     xcon->conn = xio_connect(&xcp);
     if (!xcon->conn) {
diff --git a/src/msg/xio/XioMessenger.h b/src/msg/xio/XioMessenger.h
index 81ecef1..448b815 100644
--- a/src/msg/xio/XioMessenger.h
+++ b/src/msg/xio/XioMessenger.h
@@ -28,7 +28,17 @@ extern "C" {
 #include "common/Mutex.h"
 #include "include/Spinlock.h"
 
-class XioMessenger : public SimplePolicyMessenger
+class XioInit {
+  /* safe to be called multiple times */
+  void package_init(CephContext *cct);
+
+protected:
+  XioInit(CephContext *cct) {
+    this->package_init(cct);
+  }
+};
+
+class XioMessenger : public SimplePolicyMessenger, XioInit
 {
 private:
   static atomic_t nInstances;
diff --git a/src/objclass/class_api.cc b/src/objclass/class_api.cc
index 1dd05c2..d0d0827 100644
--- a/src/objclass/class_api.cc
+++ b/src/objclass/class_api.cc
@@ -269,12 +269,19 @@ int cls_cxx_stat2(cls_method_context_t hctx, uint64_t *size, ceph::real_time *mt
 
 int cls_cxx_read(cls_method_context_t hctx, int ofs, int len, bufferlist *outbl)
 {
+  return cls_cxx_read2(hctx, ofs, len, outbl, 0);
+}
+
+int cls_cxx_read2(cls_method_context_t hctx, int ofs, int len,
+                  bufferlist *outbl, uint32_t op_flags)
+{
   ReplicatedPG::OpContext **pctx = (ReplicatedPG::OpContext **)hctx;
   vector<OSDOp> ops(1);
   int ret;
   ops[0].op.op = CEPH_OSD_OP_SYNC_READ;
   ops[0].op.extent.offset = ofs;
   ops[0].op.extent.length = len;
+  ops[0].op.flags = op_flags;
   ret = (*pctx)->pg->do_osd_ops(*pctx, ops);
   if (ret < 0)
     return ret;
@@ -284,11 +291,18 @@ int cls_cxx_read(cls_method_context_t hctx, int ofs, int len, bufferlist *outbl)
 
 int cls_cxx_write(cls_method_context_t hctx, int ofs, int len, bufferlist *inbl)
 {
+  return cls_cxx_write2(hctx, ofs, len, inbl, 0);
+}
+
+int cls_cxx_write2(cls_method_context_t hctx, int ofs, int len,
+                   bufferlist *inbl, uint32_t op_flags)
+{
   ReplicatedPG::OpContext **pctx = (ReplicatedPG::OpContext **)hctx;
   vector<OSDOp> ops(1);
   ops[0].op.op = CEPH_OSD_OP_WRITE;
   ops[0].op.extent.offset = ofs;
   ops[0].op.extent.length = len;
+  ops[0].op.flags = op_flags;
   ops[0].indata = *inbl;
   return (*pctx)->pg->do_osd_ops(*pctx, ops);
 }
diff --git a/src/objclass/objclass.h b/src/objclass/objclass.h
index 08a9d23..8bc3a0a 100644
--- a/src/objclass/objclass.h
+++ b/src/objclass/objclass.h
@@ -145,7 +145,11 @@ extern int cls_cxx_remove(cls_method_context_t hctx);
 extern int cls_cxx_stat(cls_method_context_t hctx, uint64_t *size, time_t *mtime);
 extern int cls_cxx_stat2(cls_method_context_t hctx, uint64_t *size, ceph::real_time *mtime);
 extern int cls_cxx_read(cls_method_context_t hctx, int ofs, int len, bufferlist *bl);
+extern int cls_cxx_read2(cls_method_context_t hctx, int ofs, int len,
+                         bufferlist *bl, uint32_t op_flags);
 extern int cls_cxx_write(cls_method_context_t hctx, int ofs, int len, bufferlist *bl);
+extern int cls_cxx_write2(cls_method_context_t hctx, int ofs, int len,
+                          bufferlist *bl, uint32_t op_flags);
 extern int cls_cxx_write_full(cls_method_context_t hctx, bufferlist *bl);
 extern int cls_cxx_getxattr(cls_method_context_t hctx, const char *name,
                             bufferlist *outbl);
diff --git a/src/os/ObjectStore.h b/src/os/ObjectStore.h
index fba6b76..c561d31 100644
--- a/src/os/ObjectStore.h
+++ b/src/os/ObjectStore.h
@@ -438,7 +438,7 @@ public:
       __le32 largest_data_off_in_tbl;
       __le32 fadvise_flags;
 
-      TransactionData() :
+      TransactionData() noexcept :
         ops(0),
         largest_data_len(0),
         largest_data_off(0),
@@ -446,7 +446,7 @@ public:
 	fadvise_flags(0) { }
 
       // override default move operations to reset default values
-      TransactionData(TransactionData&& other) :
+      TransactionData(TransactionData&& other) noexcept :
         ops(other.ops),
         largest_data_len(other.largest_data_len),
         largest_data_off(other.largest_data_off),
@@ -458,7 +458,7 @@ public:
         other.largest_data_off_in_tbl = 0;
         other.fadvise_flags = 0;
       }
-      TransactionData& operator=(TransactionData&& other) {
+      TransactionData& operator=(TransactionData&& other) noexcept {
         ops = other.ops;
         largest_data_len = other.largest_data_len;
         largest_data_off = other.largest_data_off;
@@ -518,7 +518,7 @@ public:
     }
 
     // override default move operations to reset default values
-    Transaction(Transaction&& other) :
+    Transaction(Transaction&& other) noexcept :
       data(std::move(other.data)),
       osr(other.osr),
       use_tbl(other.use_tbl),
@@ -539,7 +539,7 @@ public:
       other.object_id = 0;
     }
 
-    Transaction& operator=(Transaction&& other) {
+    Transaction& operator=(Transaction&& other) noexcept {
       data = std::move(other.data);
       osr = other.osr;
       use_tbl = other.use_tbl;
@@ -630,7 +630,7 @@ public:
       return use_tbl;
     }
 
-    void swap(Transaction& other) {
+    void swap(Transaction& other) noexcept {
       std::swap(data, other.data);
       std::swap(on_applied, other.on_applied);
       std::swap(on_commit, other.on_commit);
@@ -718,9 +718,17 @@ public:
         op->dest_oid = om[op->dest_oid];
         break;
 
+      case OP_TRY_RENAME:
+        assert(op->cid < cm.size());
+        assert(op->oid < om.size());
+        assert(op->dest_oid < om.size());
+        op->cid = cm[op->cid];
+        op->oid = om[op->oid];
+        op->dest_oid = om[op->dest_oid];
+
       case OP_SPLIT_COLLECTION2:
         assert(op->cid < cm.size());
-        op->dest_cid = cm[op->dest_oid];
+	assert(op->dest_cid < cm.size());
         op->cid = cm[op->cid];
         op->dest_cid = cm[op->dest_cid];
         break;
diff --git a/src/os/Transaction.cc b/src/os/Transaction.cc
index a72ab7c..6028104 100644
--- a/src/os/Transaction.cc
+++ b/src/os/Transaction.cc
@@ -922,6 +922,17 @@ void ObjectStore::Transaction::dump(ceph::Formatter *f)
       }
       break;
 
+    case Transaction::OP_TRY_RENAME:
+      {
+        coll_t cid = i.get_cid(op->cid);
+        ghobject_t old_oid = i.get_oid(op->oid);
+        ghobject_t new_oid = i.get_oid(op->dest_oid);
+	f->dump_string("op_name", "op_coll_move_rename");
+	f->dump_stream("collection") << cid;
+	f->dump_stream("old_oid") << old_oid;
+	f->dump_stream("new_oid") << new_oid;
+      }
+	
     case Transaction::OP_SETALLOCHINT:
       {
         coll_t cid = i.get_cid(op->cid);
diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc
index 52f9055..6f5710c 100644
--- a/src/os/bluestore/BlueStore.cc
+++ b/src/os/bluestore/BlueStore.cc
@@ -32,30 +32,6 @@
 
 #define dout_subsys ceph_subsys_bluestore
 
-/*
-
-  TODO:
-
-  * superblock, features
-  * bdev: smarter zeroing
-  * zero overlay in onode?
-  * discard
-  * aio read?
-  * read uses local ioc
-  * refcounted extents (for efficient clone)
-  * overlay does inefficient zeroing on unwritten extent
-
- */
-
-/*
- * Some invariants:
- *
- * - If the end of the object is a partial block, and is not an overlay,
- *   the remainder of that block will always be zeroed.  (It has to be written
- *   anyway, so we may as well have written zeros.)
- *
- */
-
 const string PREFIX_SUPER = "S";   // field -> value
 const string PREFIX_COLL = "C";    // collection name -> cnode_t
 const string PREFIX_OBJ = "O";     // object name -> onode_t
@@ -289,6 +265,18 @@ static void get_enode_key(shard_id_t shard, int64_t pool, uint32_t hash,
   _key_encode_u32(hobject_t::_reverse_bits(hash), key);
 }
 
+static int get_key_enode(const string& key, shard_id_t *shard,
+			 int64_t *pool, uint32_t *hash)
+{
+  const char *p = key.c_str();
+  if (key.length() < 2 + 8 + 4)
+    return -2;
+  p = _key_decode_shard(p, shard);
+  p = _key_decode_u64(p, (uint64_t*)pool);
+  p = _key_decode_u32(p, hash);
+  return 0;
+}
+
 static int get_key_object(const string& key, ghobject_t *oid);
 
 static void get_object_key(const ghobject_t& oid, string *key)
@@ -495,6 +483,7 @@ void BlueStore::OnodeHashLRU::add(const ghobject_t& oid, OnodeRef o)
   assert(onode_map.count(oid) == 0);
   onode_map[oid] = o;
   lru.push_front(*o);
+  _trim(max_size);
 }
 
 BlueStore::OnodeRef BlueStore::OnodeHashLRU::lookup(const ghobject_t& oid)
@@ -519,8 +508,9 @@ void BlueStore::OnodeHashLRU::clear()
   onode_map.clear();
 }
 
-void BlueStore::OnodeHashLRU::rename(const ghobject_t& old_oid,
-				    const ghobject_t& new_oid)
+void BlueStore::OnodeHashLRU::rename(OnodeRef& oldo,
+				     const ghobject_t& old_oid,
+				     const ghobject_t& new_oid)
 {
   std::lock_guard<std::mutex> l(lock);
   dout(30) << __func__ << " " << old_oid << " -> " << new_oid << dendl;
@@ -539,7 +529,8 @@ void BlueStore::OnodeHashLRU::rename(const ghobject_t& old_oid,
   OnodeRef o = po->second;
 
   // install a non-existent onode at old location
-  po->second.reset(new Onode(old_oid, o->key));
+  oldo.reset(new Onode(old_oid, o->key));
+  po->second = oldo;
   lru.push_back(*po->second);
 
   // add at new position and fix oid, key
@@ -582,8 +573,15 @@ bool BlueStore::OnodeHashLRU::get_next(
 int BlueStore::OnodeHashLRU::trim(int max)
 {
   std::lock_guard<std::mutex> l(lock);
-  dout(20) << __func__ << " max " << max
-	   << " size " << onode_map.size() << dendl;
+  if (max < 0) {
+    max = max_size;
+  }
+  return _trim(max);
+}
+
+int BlueStore::OnodeHashLRU::_trim(int max)
+{
+  dout(20) << __func__ << " max " << max << " size " << onode_map.size() << dendl;
   int trimmed = 0;
   int num = onode_map.size() - max;
   if (onode_map.size() == 0 || num <= 0)
@@ -628,8 +626,8 @@ BlueStore::Collection::Collection(BlueStore *ns, coll_t c)
     cid(c),
     lock("BlueStore::Collection::lock", true, false),
     exists(true),
-    onode_map(),
-    enode_set(g_conf->bluestore_onode_map_size)
+    enode_set(g_conf->bluestore_onode_map_size),
+    onode_map(g_conf->bluestore_onode_map_size)
 {
 }
 
@@ -1463,6 +1461,14 @@ int BlueStore::_balance_bluefs_freespace(vector<bluestore_extent_t> *extents,
       gift = g;
     reclaim = 0;
   }
+  if (gift) {
+    float new_bluefs_ratio = (float)(bluefs_free + gift) / (float)total_free;
+    if (new_bluefs_ratio >= g_conf->bluestore_bluefs_max_ratio) {
+      dout(10) << __func__ << " gift would push us past the max_ratio,"
+	       << " doing nothing" << dendl;
+      gift = 0;
+    }
+  }
 
   if (gift) {
     // round up to alloc size
@@ -1947,7 +1953,8 @@ int BlueStore::umount()
 
 int BlueStore::_verify_enode_shared(
   EnodeRef enode,
-  vector<bluestore_extent_t>& v)
+  vector<bluestore_extent_t>& v,
+  interval_set<uint64_t> &used_blocks)
 {
   int errors = 0;
   interval_set<uint64_t> span;
@@ -1973,6 +1980,14 @@ int BlueStore::_verify_enode_shared(
 	 << " != expected " << ref_map << dendl;
     ++errors;
   }
+  interval_set<uint64_t> i;
+  i.intersection_of(span, used_blocks);
+  if (!i.empty()) {
+    derr << " hash " << enode->hash << " extent(s) " << i
+	 << " already allocated" << dendl;
+    ++errors;
+  }
+  used_blocks.insert(span);
   return errors;
 }
 
@@ -1984,6 +1999,7 @@ int BlueStore::fsck()
   set<uint64_t> used_omap_head;
   interval_set<uint64_t> used_blocks;
   KeyValueDB::Iterator it;
+  EnodeRef enode;
   vector<bluestore_extent_t> hash_shared;
 
   int r = _open_path();
@@ -2041,7 +2057,6 @@ int BlueStore::fsck()
     CollectionRef c = _get_collection(p->first);
     RWLock::RLocker l(c->lock);
     ghobject_t pos;
-    EnodeRef enode;
     while (true) {
       vector<ghobject_t> ols;
       int r = collection_list(p->first, pos, ghobject_t::get_max(), true,
@@ -2062,7 +2077,7 @@ int BlueStore::fsck()
 	}
 	if (!enode || enode->hash != o->oid.hobj.get_hash()) {
 	  if (enode)
-	    errors += _verify_enode_shared(enode, hash_shared);
+	    errors += _verify_enode_shared(enode, hash_shared, used_blocks);
 	  enode = c->get_enode(o->oid.hobj.get_hash());
 	  hash_shared.clear();
 	}
@@ -2077,19 +2092,21 @@ int BlueStore::fsck()
 	}
 	// blocks
 	for (auto& b : o->onode.block_map) {
-	  if (b.second.has_flag(bluestore_extent_t::FLAG_SHARED))
+	  if (b.second.has_flag(bluestore_extent_t::FLAG_SHARED)) {
 	    hash_shared.push_back(b.second);
-	  if (used_blocks.intersects(b.second.offset, b.second.length)) {
-	    derr << " " << oid << " extent " << b.first << ": " << b.second
-		 << " already allocated" << dendl;
-	    ++errors;
-	    continue;
-	  }
-	  used_blocks.insert(b.second.offset, b.second.length);
-	  if (b.second.end() > bdev->get_size()) {
-	    derr << " " << oid << " extent " << b.first << ": " << b.second
-		 << " past end of block device" << dendl;
-	    ++errors;
+	  } else {
+	    if (used_blocks.intersects(b.second.offset, b.second.length)) {
+	      derr << " " << oid << " extent " << b.first << ": " << b.second
+		   << " already allocated" << dendl;
+	      ++errors;
+	      continue;
+	    }
+	    used_blocks.insert(b.second.offset, b.second.length);
+	    if (b.second.end() > bdev->get_size()) {
+	      derr << " " << oid << " extent " << b.first << ": " << b.second
+		   << " past end of block device" << dendl;
+	      ++errors;
+	    }
 	  }
 	}
 	// overlays
@@ -2202,13 +2219,32 @@ int BlueStore::fsck()
       }
     }
   }
+  if (enode) {
+    errors += _verify_enode_shared(enode, hash_shared, used_blocks);
+    hash_shared.clear();
+    enode.reset();
+  }
 
-  dout(1) << __func__ << " checking for stray objects" << dendl;
+  dout(1) << __func__ << " checking for stray enodes and onodes" << dendl;
   it = db->get_iterator(PREFIX_OBJ);
   if (it) {
     CollectionRef c;
+    bool expecting_objects = false;
+    shard_id_t expecting_shard;
+    int64_t expecting_pool;
+    uint32_t expecting_hash;
     for (it->lower_bound(string()); it->valid(); it->next()) {
       ghobject_t oid;
+      if (is_enode_key(it->key())) {
+	if (expecting_objects) {
+	  dout(30) << __func__ << "  had enode but no objects for "
+		   << std::hex << expecting_hash << std::dec << dendl;
+	  ++errors;
+	}
+	get_key_enode(it->key(), &expecting_shard, &expecting_pool,
+		      &expecting_hash);
+	continue;
+      }
       int r = get_key_object(it->key(), &oid);
       if (r < 0) {
 	dout(30) << __func__ << "  bad object key "
@@ -2216,6 +2252,14 @@ int BlueStore::fsck()
 	++errors;
 	continue;
       }
+      if (expecting_objects) {
+	if (oid.hobj.get_bitwise_key_u32() != expecting_hash) {
+	  dout(30) << __func__ << "  had enode but no objects for "
+		   << std::hex << expecting_hash << std::dec << dendl;
+	  ++errors;
+	}
+	expecting_objects = false;
+      }
       if (!c || !c->contains(oid)) {
 	c = NULL;
 	for (ceph::unordered_map<coll_t, CollectionRef>::iterator p =
@@ -2235,6 +2279,12 @@ int BlueStore::fsck()
 	}
       }
     }
+    if (expecting_objects) {
+      dout(30) << __func__ << "  had enode but no objects for "
+	       << std::hex << expecting_hash << std::dec << dendl;
+      ++errors;
+      expecting_objects = false;
+    }
   }
 
   dout(1) << __func__ << " checking for stray overlay data" << dendl;
@@ -3782,7 +3832,7 @@ void BlueStore::_osr_reap_done(OpSequencer *osr)
     }
 
     if (txc->first_collection) {
-      txc->first_collection->onode_map.trim(g_conf->bluestore_onode_map_size);
+      txc->first_collection->onode_map.trim();
     }
 
     osr->q.pop_front();
@@ -3856,38 +3906,23 @@ void BlueStore::_kv_sync_thread()
       dout(30) << __func__ << " committing txc " << kv_committing << dendl;
       dout(30) << __func__ << " wal_cleaning txc " << wal_cleaning << dendl;
 
-      // one transaction to force a sync
-      KeyValueDB::Transaction t = db->get_transaction();
+      alloc->commit_start();
 
-      // allocations and deallocations
-      interval_set<uint64_t> released;
-      for (std::deque<TransContext *>::iterator it = wal_cleaning.begin();
-	  it != wal_cleaning.end();
-	  ++it) {
-	TransContext *txc = *it;
-	if (!txc->wal_txn->released.empty()) {
-	  dout(20) << __func__ << " txc " << txc
-	    << " (post-wal) released " << txc->wal_txn->released
-	    << dendl;
-	  released.insert(txc->wal_txn->released);
-	  for (interval_set<uint64_t>::iterator p =
-	      txc->wal_txn->released.begin();
-	      p != txc->wal_txn->released.end();
-	      ++p) {
-	    dout(20) << __func__ << " release " << p.get_start()
-	      << "~" << p.get_len() << dendl;
-	    fm->release(p.get_start(), p.get_len(), t);
-	  }
+      // flush/barrier on block device
+      bdev->flush();
+
+      if (!g_conf->bluestore_sync_transaction &&
+	  !g_conf->bluestore_sync_submit_transaction) {
+	for (std::deque<TransContext *>::iterator it = kv_committing.begin();
+	     it != kv_committing.end();
+	     ++it) {
+	  _txc_update_fm((*it));
+	  db->submit_transaction((*it)->t);
 	}
       }
-      for (interval_set<uint64_t>::iterator p = released.begin();
-	   p != released.end();
-	   ++p) {
-	dout(20) << __func__ << " release " << p.get_start()
-		 << "~" << p.get_len() << dendl;
-	if (!g_conf->bluestore_debug_no_reuse_blocks)
-	  alloc->release(p.get_start(), p.get_len());
-      }
+
+      // one final transaction to force a sync
+      KeyValueDB::Transaction t = db->get_transaction();
 
       vector<bluestore_extent_t> bluefs_gift_extents;
       if (bluefs) {
@@ -3906,17 +3941,25 @@ void BlueStore::_kv_sync_thread()
 	}
       }
 
-      alloc->commit_start();
-
-      // flush/barrier on block device
-      bdev->flush();
-
-      if (!g_conf->bluestore_sync_transaction && !g_conf->bluestore_sync_submit_transaction) {
-	for (std::deque<TransContext *>::iterator it = kv_committing.begin();
-	     it != kv_committing.end();
-	     ++it) {
-	  _txc_update_fm((*it));
-	  db->submit_transaction((*it)->t);
+      // allocations and deallocations
+      for (std::deque<TransContext *>::iterator it = wal_cleaning.begin();
+	  it != wal_cleaning.end();
+	  ++it) {
+	TransContext *txc = *it;
+	if (!txc->wal_txn->released.empty()) {
+	  dout(20) << __func__ << " txc " << txc
+	    << " (post-wal) released " << txc->wal_txn->released
+	    << dendl;
+	  for (interval_set<uint64_t>::iterator p =
+	      txc->wal_txn->released.begin();
+	      p != txc->wal_txn->released.end();
+	      ++p) {
+	    dout(20) << __func__ << " release " << p.get_start()
+	      << "~" << p.get_len() << dendl;
+	    fm->release(p.get_start(), p.get_len(), t);
+	    if (!g_conf->bluestore_debug_no_reuse_blocks)
+	      alloc->release(p.get_start(), p.get_len());
+	  }
 	}
       }
 
@@ -3941,6 +3984,7 @@ void BlueStore::_kv_sync_thread()
 	t->rmkey(PREFIX_WAL, key);
       }
       db->submit_transaction_sync(t);
+
       utime_t finish = ceph_clock_now(NULL);
       utime_t dur = finish - start;
       dout(20) << __func__ << " committed " << kv_committing.size()
@@ -3991,6 +4035,16 @@ int BlueStore::_wal_apply(TransContext *txc)
   txc->log_state_latency(logger, l_bluestore_state_wal_queued_lat);
   txc->state = TransContext::STATE_WAL_APPLYING;
 
+  if (g_conf->bluestore_inject_wal_apply_delay) {
+    dout(20) << __func__ << " bluestore_inject_wal_apply_delay "
+	     << g_conf->bluestore_inject_wal_apply_delay
+	     << dendl;
+    utime_t t;
+    t.set_from_double(g_conf->bluestore_inject_wal_apply_delay);
+    t.sleep();
+    dout(20) << __func__ << " finished sleep" << dendl;
+  }
+
   assert(txc->ioc.pending_aios.empty());
   vector<OnodeRef>::iterator q = txc->wal_op_onodes.begin();
   for (list<bluestore_wal_op_t>::iterator p = wt.ops.begin();
@@ -4009,8 +4063,10 @@ int BlueStore::_wal_finish(TransContext *txc)
   bluestore_wal_transaction_t& wt = *txc->wal_txn;
   dout(20) << __func__ << " txc " << " seq " << wt.seq << txc << dendl;
 
+  std::lock_guard<std::mutex> l2(txc->osr->qlock);
   std::lock_guard<std::mutex> l(kv_lock);
   txc->state = TransContext::STATE_WAL_CLEANUP;
+  txc->osr->qcond.notify_all();
   wal_cleanup_queue.push_back(txc);
   kv_cond.notify_one();
   return 0;
@@ -4877,6 +4933,7 @@ void BlueStore::_dump_onode(OnodeRef o, int log_level)
 }
 
 void BlueStore::_pad_zeros(
+  TransContext *txc,
   OnodeRef o,
   bufferlist *bl, uint64_t *offset, uint64_t *length,
   uint64_t block_size)
@@ -4921,10 +4978,11 @@ void BlueStore::_pad_zeros(
     bl->substr_of(old, 0, *length - back_copy);
     bl->append(tail);
     *length += back_pad;
-    if (end > o->onode.size && g_conf->bluestore_cache_tails) {
+    if (end >= o->onode.size && g_conf->bluestore_cache_tails) {
       o->tail_bl.clear();
       o->tail_bl.append(tail, 0, back_copy);
       o->tail_offset = end - back_copy;
+      o->tail_txc_seq = txc->seq;
       dout(20) << __func__ << " cached "<< back_copy << " of tail block at "
 	       << o->tail_offset << dendl;
     }
@@ -4971,6 +5029,7 @@ void BlueStore::_pad_zeros_head(
 }
 
 void BlueStore::_pad_zeros_tail(
+  TransContext *txc,
   OnodeRef o,
   bufferlist *bl, uint64_t offset, uint64_t *length,
   uint64_t block_size)
@@ -5002,10 +5061,12 @@ void BlueStore::_pad_zeros_tail(
   bl->substr_of(old, 0, *length - back_copy);
   bl->append(tail);
   *length += back_pad;
-  if (end > o->onode.size && g_conf->bluestore_cache_tails) {
+  if (tail_len == block_size &&
+      end >= o->onode.size && g_conf->bluestore_cache_tails) {
     o->tail_bl.clear();
     o->tail_bl.append(tail, 0, back_copy);
     o->tail_offset = end - back_copy;
+    o->tail_txc_seq = txc->seq;
     dout(20) << __func__ << " cached "<< back_copy << " of tail block at "
 	     << o->tail_offset << dendl;
   }
@@ -5067,7 +5128,8 @@ int BlueStore::_do_allocate(
   bool shared_head = false;
   bool shared_tail = false;
   uint64_t orig_end = orig_offset + orig_length;
-  if (orig_offset / min_alloc_size == (orig_end - 1)/ min_alloc_size && (orig_length != min_alloc_size)) {
+  if (orig_offset / min_alloc_size == (orig_end - 1) / min_alloc_size &&
+      (orig_length != min_alloc_size)) {
     // we fall within the same block
     offset = orig_offset - orig_offset % min_alloc_size;
     length = 0;
@@ -5355,6 +5417,11 @@ int BlueStore::_do_write(
   uint64_t cow_rmw_head = 0;
   uint64_t cow_rmw_tail = 0;
 
+  if (orig_offset > o->onode.size) {
+    // zero tail of previous existing extent?
+    _do_zero_tail_extent(txc, c, o, orig_offset);
+  }
+
   r = _do_allocate(txc, c, o, orig_offset, orig_length, fadvise_flags, true,
 		   &cow_rmw_head, &cow_rmw_tail);
   if (r < 0) {
@@ -5364,25 +5431,6 @@ int BlueStore::_do_write(
 
   bp = o->onode.seek_extent(orig_offset);
 
-  // zero tail of previous existing extent?
-  // (this happens if the old eof was partway through a previous extent,
-  // and we implicitly zero the rest of it by writing to a larger offset.)
-  if (orig_offset > o->onode.size) {
-    uint64_t end = ROUND_UP_TO(o->onode.size, block_size);
-    map<uint64_t, bluestore_extent_t>::iterator pp = o->onode.find_extent(end);
-    if (orig_offset > end &&
-	pp != o->onode.block_map.end() &&
-	pp != bp) {
-      assert(pp->first < bp->first);
-      uint64_t x_off = end - pp->first;
-      uint64_t x_len = pp->second.length - x_off;
-      dout(10) << __func__ << " zero tail " << x_off << "~" << x_len
-	       << " of prior extent " << pp->first << ": " << pp->second
-	       << dendl;
-      bdev->aio_zero(pp->second.offset + x_off, x_len, &txc->ioc);
-    }
-  }
-
   for (uint64_t offset = orig_offset;
        offset < orig_offset + orig_length;
        offset += length) {
@@ -5424,12 +5472,12 @@ int BlueStore::_do_write(
     assert(offset >= bp->first);
     assert(offset + length <= bp->first + bp->second.length);
 
-    // (pad and) overwrite unused portion of extent for an append?
+    // overwrite unused portion of extent for an append?
     if (offset > bp->first &&
-	offset >= o->onode.size &&                                  // past eof +
-	(offset / block_size != (o->onode.size - 1) / block_size)) {// diff block
-      dout(20) << __func__ << " append" << dendl;
-      _pad_zeros(o, &bl, &offset, &length, block_size);
+	offset >= o->onode.size &&                  // past eof +
+	(o->onode.size & ~block_mask) == 0) {       // eof was aligned
+      dout(20) << __func__ << " append after aligned eof" << dendl;
+      _pad_zeros(txc, o, &bl, &offset, &length, block_size);
       assert(offset % block_size == 0);
       assert(length % block_size == 0);
       uint64_t x_off = offset - bp->first;
@@ -5466,10 +5514,13 @@ int BlueStore::_do_write(
     if (offset >= bp->first &&
 	offset > tail_start &&
 	offset + length >= o->onode.size &&
+	o->tail_offset == tail_start &&
 	o->tail_bl.length() &&
 	(offset / block_size == (o->onode.size - 1) / block_size)) {
       dout(20) << __func__ << " using cached tail" << dendl;
       assert((offset & block_mask) == (o->onode.size & block_mask));
+      // wait for any related wal writes to commit
+      txc->osr->wait_for_wal_on_seq(o->tail_txc_seq);
       uint64_t tail_off = offset % block_size;
       if (tail_off >= o->tail_bl.length()) {
 	bufferlist t;
@@ -5497,7 +5548,7 @@ int BlueStore::_do_write(
 	     offset == bp->first);
       bp->second.clear_flag(bluestore_extent_t::FLAG_COW_HEAD);
       bp->second.clear_flag(bluestore_extent_t::FLAG_UNWRITTEN);
-      _pad_zeros(o, &bl, &offset, &length, block_size);
+      _pad_zeros(txc, o, &bl, &offset, &length, block_size);
       uint64_t x_off = offset - bp->first;
       dout(20) << __func__ << " write " << offset << "~" << length
 	       << " x_off " << x_off << dendl;
@@ -5541,7 +5592,7 @@ int BlueStore::_do_write(
 	_pad_zeros_head(o, &bl, &offset, &length, block_size);
       }
       if (((offset + length) & ~block_mask) != 0 && !cow_rmw_tail) {
-	_pad_zeros_tail(o, &bl, offset, &length, block_size);
+	_pad_zeros_tail(txc, o, &bl, offset, &length, block_size);
       }
       if ((offset & ~block_mask) == 0 && (length & ~block_mask) == 0) {
 	uint64_t x_off = offset - bp->first;
@@ -5563,9 +5614,24 @@ int BlueStore::_do_write(
       goto out;
     assert(bp->first <= offset);
     assert(offset + length <= bp->first + bp->second.length);
+    bool is_orig_offset = offset == orig_offset;
+    if (offset > o->onode.size &&
+	o->onode.size > bp->first) {
+      uint64_t zlen = offset - o->onode.size;
+      dout(20) << __func__ << " padding " << zlen << " zeroes from eof "
+	       << o->onode.size << " to " << offset << dendl;
+      bufferlist z;
+      z.append_zero(zlen);
+      z.claim_append(bl);
+      bl.swap(z);
+      offset -= zlen;
+      length += zlen;
+      if (cow_rmw_head)
+	cow_rmw_head -= zlen;
+    }
     bluestore_wal_op_t *op = _get_wal_op(txc, o);
     op->op = bluestore_wal_op_t::OP_WRITE;
-    if (offset == orig_offset && cow_rmw_head) {
+    if (is_orig_offset && cow_rmw_head) {
       op->src_rmw_head = cow_rmw_head;
       dout(20) << __func__ << " src_rmw_head " << op->src_rmw_head << dendl;
     }
@@ -5575,7 +5641,7 @@ int BlueStore::_do_write(
     } else if (((offset + length) & ~block_mask) &&
 	       offset + length > o->onode.size) {
       dout(20) << __func__ << " past eof, padding out tail block" << dendl;
-      _pad_zeros_tail(o, &bl, offset, &length, block_size);
+      _pad_zeros_tail(txc, o, &bl, offset, &length, block_size);
     }
     bp->second.clear_flag(bluestore_extent_t::FLAG_COW_HEAD);
     bp->second.clear_flag(bluestore_extent_t::FLAG_COW_TAIL);
@@ -5653,7 +5719,11 @@ int BlueStore::_do_write_zero(
 {
   bufferlist zl;
   zl.append_zero(length);
-  return _do_write(txc, c, o, offset, length, zl, 0);
+  uint64_t old_size = o->onode.size;
+  int r = _do_write(txc, c, o, offset, length, zl, 0);
+  // we do not modify onode size
+  o->onode.size = old_size;
+  return r;
 }
 
 int BlueStore::_zero(TransContext *txc,
@@ -5664,9 +5734,92 @@ int BlueStore::_zero(TransContext *txc,
   dout(15) << __func__ << " " << c->cid << " " << o->oid
 	   << " " << offset << "~" << length
 	   << dendl;
+  int r = _do_zero(txc, c, o, offset, length);
+  dout(10) << __func__ << " " << c->cid << " " << o->oid
+	   << " " << offset << "~" << length
+	   << " = " << r << dendl;
+  return r;
+}
+
+void BlueStore::_do_zero_tail_extent(
+  TransContext *txc,
+  CollectionRef& c,
+  OnodeRef& o,
+  uint64_t offset)
+{
+  const uint64_t block_size = bdev->get_block_size();
+  const uint64_t block_mask = ~(block_size - 1);
+
+  map<uint64_t, bluestore_extent_t>::iterator bp, pp;
+  bp = o->onode.seek_extent(offset);
+  pp = o->onode.find_extent(o->onode.size);
+
+  dout(10) << __func__ << " offset " << offset << " extent "
+	   << pp->first << ": " << pp->second << dendl;
+  assert(offset > o->onode.size);
+
+  // we assume the caller will handle any partial block they start with
+  offset &= block_mask;
+  if (offset <= o->onode.size)
+    return;
+
+  if (pp != o->onode.block_map.end() &&
+      pp != bp) {
+    if (pp->second.has_flag(bluestore_extent_t::FLAG_SHARED)) {
+      dout(10) << __func__ << " shared tail extent; doing _do_write_zero"
+	       << dendl;
+      uint64_t old_size = o->onode.size;
+      uint64_t end = pp->first + pp->second.length;
+      uint64_t zlen = end - old_size;
+      _do_write_zero(txc, c, o, old_size, zlen);
+    } else {
+      uint64_t end_block = ROUND_UP_TO(o->onode.size, block_size);
+
+      if (end_block > o->onode.size) {
+	// end was in a partial block, do wal r/m/w.
+	bluestore_wal_op_t *op = _get_wal_op(txc, o);
+	op->op = bluestore_wal_op_t::OP_ZERO;
+	uint64_t x_off = o->onode.size;
+	uint64_t x_len = end_block - x_off;
+	op->extent.offset = pp->second.offset + x_off - pp->first;
+	op->extent.length = x_len;
+	dout(10) << __func__ << " wal zero tail partial block "
+		 << x_off << "~" << x_len << " at " << op->extent
+		 << dendl;
+	assert(!pp->second.has_flag(bluestore_extent_t::FLAG_COW_HEAD));
+	assert(!pp->second.has_flag(bluestore_extent_t::FLAG_COW_TAIL));
+      }
+      if (offset > end_block) {
+	// end was block-aligned.  zero the rest of the extent now.
+	uint64_t x_off = end_block - pp->first;
+	uint64_t x_len = pp->second.length - x_off;
+	if (x_len > 0) {
+	  dout(10) << __func__ << " zero tail " << x_off << "~" << x_len
+		   << " of tail extent " << pp->first << ": " << pp->second
+		   << dendl;
+	  bdev->aio_zero(pp->second.offset + x_off, x_len, &txc->ioc);
+	}
+      }
+    }
+  }
+}
+
+int BlueStore::_do_zero(TransContext *txc,
+			CollectionRef& c,
+			OnodeRef& o,
+			uint64_t offset, size_t length)
+{
+  dout(15) << __func__ << " " << c->cid << " " << o->oid
+	   << " " << offset << "~" << length
+	   << dendl;
   int r = 0;
   o->exists = true;
 
+  if (offset > o->onode.size) {
+    // we are past eof; just truncate up.
+    return _do_truncate(txc, c, o, offset + length);
+  }
+
   _dump_onode(o);
   _assign_nid(txc, o);
 
@@ -5674,25 +5827,8 @@ int BlueStore::_zero(TransContext *txc,
   _do_overlay_trim(txc, o, offset, length);
 
   uint64_t block_size = bdev->get_block_size();
-  map<uint64_t,bluestore_extent_t>::iterator bp = o->onode.seek_extent(offset);
-
-  // zero tail of previous existing extent?
-  // (this happens if the old eof was partway through a previous extent,
-  // and we implicitly zero the rest of it by writing to a larger offset.)
-  if (offset > o->onode.size) {
-    uint64_t end = ROUND_UP_TO(o->onode.size, block_size);
-    map<uint64_t, bluestore_extent_t>::iterator pp = o->onode.find_extent(end);
-    if (offset > end &&
-	pp != o->onode.block_map.end()) {
-      uint64_t x_off = end - pp->first;
-      uint64_t x_len = pp->second.length - x_off;
-      dout(10) << __func__ << " zero tail " << x_off << "~" << x_len
-	       << " of prior extent " << pp->first << ": " << pp->second
-	       << dendl;
-      bdev->aio_zero(pp->second.offset + x_off, x_len, &txc->ioc);
-    }
-  }
 
+  map<uint64_t,bluestore_extent_t>::iterator bp = o->onode.seek_extent(offset);
   while (bp != o->onode.block_map.end()) {
     if (bp->first >= offset + length)
       break;
@@ -5714,6 +5850,14 @@ int BlueStore::_zero(TransContext *txc,
     // start,end are offsets in the extent
     uint64_t x_off = 0;
     if (offset > bp->first) {
+      if (offset > o->onode.size &&
+	  o->onode.size >= bp->first) {
+	uint64_t zlen = offset - o->onode.size;
+	dout(10) << __func__ << " extending range by " << zlen
+		 << " to start from eof " << o->onode.size << dendl;
+	offset -= zlen;
+	length += zlen;
+      }
       x_off = offset - bp->first;
     }
     uint64_t x_len = MIN(offset + length - bp->first,
@@ -5832,7 +5976,6 @@ int BlueStore::_do_truncate(
         int r = _do_write_zero(txc, c, o, old_size, x_len);
         if (r < 0)
           return r;
-	o->onode.size = offset; // we (maybe) just wrote past eof; reset size
       } else {
 	bluestore_wal_op_t *op = _get_wal_op(txc, o);
 	op->op = bluestore_wal_op_t::OP_ZERO;
@@ -5842,26 +5985,6 @@ int BlueStore::_do_truncate(
 		 << " " << op->extent << dendl;
       }
     }
-  } else if (offset < old_size &&
-	     offset % block_size != 0) {
-    // zero trailing block?
-    map<uint64_t,bluestore_extent_t>::iterator bp = o->onode.find_extent(offset);
-    if (bp != o->onode.block_map.end()) {
-      uint64_t z_len = block_size - offset % block_size;
-      if (bp->second.has_flag(bluestore_extent_t::FLAG_SHARED)) {
-        int r = _do_write_zero(txc, c, o, offset, z_len);
-        if (r < 0)
-          return r;
-	o->onode.size = offset; // we just wrote past eof; reset size
-      } else {
-	bluestore_wal_op_t *op = _get_wal_op(txc, o);
-	op->op = bluestore_wal_op_t::OP_ZERO;
-	op->extent.offset = bp->second.offset + offset - bp->first;
-	op->extent.length = block_size - offset % block_size;
-	dout(20) << __func__ << " wal zero tail " << offset << "~" << z_len
-		 << " at " << op->extent << dendl;
-      }
-    }
   }
 
   // trim down overlays
@@ -6057,6 +6180,8 @@ int BlueStore::_omap_clear(TransContext *txc,
   int r = 0;
   if (o->onode.omap_head != 0) {
     _do_omap_clear(txc, o->onode.omap_head);
+    o->onode.omap_head = 0;
+    txc->write_onode(o);
   }
   dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
   return r;
@@ -6375,8 +6500,10 @@ int BlueStore::_rename(TransContext *txc,
   txc->t->rmkey(PREFIX_OBJ, oldo->key);
   txc->write_onode(oldo);
   newo = oldo;
-  oldo.reset(NULL);
-  c->onode_map.rename(old_oid, new_oid);  // this adjusts oldo->{oid,key}
+
+  // this adjusts oldo->{oid,key}, and reset oldo to a fresh empty
+  // Onode in the old slot
+  c->onode_map.rename(oldo, old_oid, new_oid);
   r = 0;
 
  out:
diff --git a/src/os/bluestore/BlueStore.h b/src/os/bluestore/BlueStore.h
index ede2b39..828dd91 100644
--- a/src/os/bluestore/BlueStore.h
+++ b/src/os/bluestore/BlueStore.h
@@ -136,7 +136,8 @@ public:
     std::condition_variable flush_cond;   ///< wait here for unapplied txns
     set<TransContext*> flush_txns;   ///< committing or wal txns
 
-    uint64_t tail_offset;
+    uint64_t tail_offset = 0;
+    uint64_t tail_txc_seq = 0;
     bufferlist tail_bl;
 
     Onode(const ghobject_t& o, const string& k)
@@ -173,16 +174,18 @@ public:
     std::mutex lock;
     ceph::unordered_map<ghobject_t,OnodeRef> onode_map;  ///< forward lookups
     lru_list_t lru;                                      ///< lru
+    size_t max_size;
 
-    OnodeHashLRU() {}
+    OnodeHashLRU(size_t s) : max_size(s) {}
 
     void add(const ghobject_t& oid, OnodeRef o);
     void _touch(OnodeRef o);
     OnodeRef lookup(const ghobject_t& o);
-    void rename(const ghobject_t& old_oid, const ghobject_t& new_oid);
+    void rename(OnodeRef& o, const ghobject_t& old_oid, const ghobject_t& new_oid);
     void clear();
     bool get_next(const ghobject_t& after, pair<ghobject_t,OnodeRef> *next);
     int trim(int max=-1);
+    int _trim(int max);
   };
 
   struct Collection : public CollectionImpl {
@@ -193,12 +196,12 @@ public:
 
     bool exists;
 
+    EnodeSet enode_set;      ///< open Enodes
+
     // cache onodes on a per-collection basis to avoid lock
     // contention.
     OnodeHashLRU onode_map;
 
-    EnodeSet enode_set;      ///< open Enodes
-
     OnodeRef get_onode(const ghobject_t& oid, bool create);
     EnodeRef get_enode(uint32_t hash);
 
@@ -312,6 +315,7 @@ public:
 
     CollectionRef first_collection;  ///< first referenced collection
 
+    uint64_t seq = 0;
     utime_t start;
 
     explicit TransContext(OpSequencer *o)
@@ -367,6 +371,8 @@ public:
     std::mutex wal_apply_mutex;
     std::unique_lock<std::mutex> wal_apply_lock;
 
+    uint64_t last_seq = 0;
+
     OpSequencer()
 	//set the qlock to to PTHREAD_MUTEX_RECURSIVE mode
       : parent(NULL),
@@ -378,6 +384,7 @@ public:
 
     void queue_new(TransContext *txc) {
       std::lock_guard<std::mutex> l(qlock);
+      txc->seq = ++last_seq;
       q.push_back(*txc);
     }
 
@@ -400,6 +407,28 @@ public:
       txc->oncommits.push_back(c);
       return false;
     }
+
+    /// if there is a wal on @seq, wait for it to apply
+    void wait_for_wal_on_seq(uint64_t seq) {
+      std::unique_lock<std::mutex> l(qlock);
+      restart:
+      for (OpSequencer::q_list_t::reverse_iterator p = q.rbegin();
+	   p != q.rend();
+	   ++p) {
+	if (p->seq == seq) {
+	  TransContext *txc = &(*p);
+	  if (txc->wal_txn) {
+	    while (txc->state < TransContext::STATE_WAL_CLEANUP) {
+	      txc->osr->qcond.wait(l);
+	      goto restart;  // txc may have gone away
+	    }
+	  }
+	  break;
+	}
+	if (p->seq < seq)
+	  break;
+      }
+    }
   };
 
   class WALWQ : public ThreadPool::WorkQueue<TransContext> {
@@ -613,7 +642,8 @@ private:
   int _wal_replay();
 
   // for fsck
-  int _verify_enode_shared(EnodeRef enode, vector<bluestore_extent_t>& v);
+  int _verify_enode_shared(EnodeRef enode, vector<bluestore_extent_t>& v,
+			   interval_set<uint64_t> &used_blocks);
 
 public:
   BlueStore(CephContext *cct, const string& path);
@@ -837,12 +867,14 @@ private:
   int _do_write_overlays(TransContext *txc, CollectionRef& c, OnodeRef o,
 			 uint64_t offset, uint64_t length);
   void _do_read_all_overlays(bluestore_wal_op_t& wo);
-  void _pad_zeros(OnodeRef o, bufferlist *bl, uint64_t *offset, uint64_t *length,
+  void _pad_zeros(TransContext *txc,
+		  OnodeRef o, bufferlist *bl, uint64_t *offset, uint64_t *length,
 		  uint64_t block_size);
   void _pad_zeros_head(OnodeRef o, bufferlist *bl,
 		       uint64_t *offset, uint64_t *length,
 		       uint64_t block_size);
-  void _pad_zeros_tail(OnodeRef o, bufferlist *bl,
+  void _pad_zeros_tail(TransContext *txc,
+		       OnodeRef o, bufferlist *bl,
 		       uint64_t offset, uint64_t *length,
 		       uint64_t block_size);
   int _do_allocate(TransContext *txc,
@@ -866,6 +898,15 @@ private:
 		     CollectionRef &c,
 		     OnodeRef o,
 		     uint64_t offset, uint64_t length);
+  void _do_zero_tail_extent(
+    TransContext *txc,
+    CollectionRef& c,
+    OnodeRef& o,
+    uint64_t offset);
+  int _do_zero(TransContext *txc,
+	       CollectionRef& c,
+	       OnodeRef& o,
+	       uint64_t offset, size_t len);
   int _zero(TransContext *txc,
 	    CollectionRef& c,
 	    OnodeRef& o,
diff --git a/src/os/bluestore/FreelistManager.cc b/src/os/bluestore/FreelistManager.cc
index 20480a7..0115767 100644
--- a/src/os/bluestore/FreelistManager.cc
+++ b/src/os/bluestore/FreelistManager.cc
@@ -36,6 +36,14 @@ int FreelistManager::init(KeyValueDB *db, string p)
 
     total_free += length;
 
+    if (offset < last_offset + last_length) {
+      derr << __func__ << " detected overlapping extent on load, had "
+	   << last_offset << "~" << last_length
+	   << " and got "
+	   << offset << "~" << length
+	   << dendl;
+      return -EIO;
+    }
     if (offset && offset == last_offset + last_length) {
       derr << __func__ << " detected contiguous extent on load, merging "
 	   << last_offset << "~" << last_length << " with "
diff --git a/src/os/bluestore/KernelDevice.cc b/src/os/bluestore/KernelDevice.cc
index 5c20a45..08304e8 100644
--- a/src/os/bluestore/KernelDevice.cc
+++ b/src/os/bluestore/KernelDevice.cc
@@ -366,7 +366,9 @@ int KernelDevice::aio_write(
   bool buffered)
 {
   uint64_t len = bl.length();
-  dout(20) << __func__ << " " << off << "~" << len << dendl;
+  dout(20) << __func__ << " " << off << "~" << len
+	   << (buffered ? " (buffered)" : " (direct)")
+	   << dendl;
   assert(off % block_size == 0);
   assert(len % block_size == 0);
   assert(len > 0);
@@ -471,7 +473,9 @@ int KernelDevice::read(uint64_t off, uint64_t len, bufferlist *pbl,
 		      IOContext *ioc,
 		      bool buffered)
 {
-  dout(5) << __func__ << " " << off << "~" << len << dendl;
+  dout(5) << __func__ << " " << off << "~" << len
+	  << (buffered ? " (buffered)" : " (direct)")
+	  << dendl;
   assert(off % block_size == 0);
   assert(len % block_size == 0);
   assert(len > 0);
diff --git a/src/os/bluestore/bluefs_tool.cc b/src/os/bluestore/bluefs_tool.cc
index 02bcb2f..3738565 100644
--- a/src/os/bluestore/bluefs_tool.cc
+++ b/src/os/bluestore/bluefs_tool.cc
@@ -14,6 +14,11 @@
 
 #include "os/bluestore/BlueFS.h"
 
+void usage(char **argv)
+{
+  cout << argv[0] << " <outdir> <bdev[0..2]>" << std::endl;;
+}
+
 int main(int argc, char **argv)
 {
   vector<const char*> args;
@@ -29,6 +34,11 @@ int main(int argc, char **argv)
 
   BlueFS fs;
 
+  if (args.size() != 4) {
+    usage(argv);
+    exit(-1);
+  }
+
   cout << "args " << args << std::endl;
   string outdir = args[0];
   for (unsigned i = 1; i < args.size(); ++i) {
@@ -73,7 +83,7 @@ int main(int argc, char **argv)
 	  r = fs.read(h, &h->buf, pos, left, &bl, NULL);
 	  assert(r > 0);
 	  int rc = bl.write_fd(fd);
-	  assert(rc == r);
+	  assert(rc == 0);
 	  pos += r;
 	  left -= r;
 	}
diff --git a/src/os/filestore/FileStore.cc b/src/os/filestore/FileStore.cc
index 6677c86..97b952e 100644
--- a/src/os/filestore/FileStore.cc
+++ b/src/os/filestore/FileStore.cc
@@ -3325,7 +3325,13 @@ int FileStore::_zero(const coll_t& cid, const ghobject_t& oid, uint64_t offset,
     ret = _write(cid, oid, offset, len, bl);
   }
 
+#ifdef CEPH_HAVE_FALLOCATE
+# if !defined(DARWIN) && !defined(__FreeBSD__)
+#    ifdef FALLOC_FL_KEEP_SIZE
  out:
+#    endif
+# endif
+#endif
   dout(20) << "zero " << cid << "/" << oid << " " << offset << "~" << len << " = " << ret << dendl;
   return ret;
 }
diff --git a/src/osd/ClassHandler.cc b/src/osd/ClassHandler.cc
index 87d5a75..5b39f96 100644
--- a/src/osd/ClassHandler.cc
+++ b/src/osd/ClassHandler.cc
@@ -72,8 +72,10 @@ int ClassHandler::open_all_classes()
 
 void ClassHandler::shutdown()
 {
-  for (map<string, ClassData>::iterator p = classes.begin(); p != classes.end(); ++p) {
-    dlclose(p->second.handle);
+  for (auto& cls : classes) {
+    if (cls.second.handle) {
+      dlclose(cls.second.handle);
+    }
   }
   classes.clear();
 }
diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc
index 57242a4..1ed664f 100644
--- a/src/osd/OSD.cc
+++ b/src/osd/OSD.cc
@@ -1786,28 +1786,19 @@ bool OSD::asok_command(string command, cmdmap_t& cmdmap, string format,
     store->flush_journal();
   } else if (command == "dump_ops_in_flight" ||
 	     command == "ops") {
-    RWLock::RLocker l(op_tracker.lock);
-    if (!op_tracker.tracking_enabled) {
+    if (!op_tracker.dump_ops_in_flight(f)) {
       ss << "op_tracker tracking is not enabled now, so no ops are tracked currently, even those get stuck. \
 	Please enable \"osd_enable_op_tracker\", and the tracker will start to track new ops received afterwards.";
-    } else {
-      op_tracker.dump_ops_in_flight(f);
     }
   } else if (command == "dump_blocked_ops") {
-    if (!op_tracker.tracking_enabled) {
+    if (!op_tracker.dump_ops_in_flight(f, true)) {
       ss << "op_tracker tracking is not enabled now, so no ops are tracked currently, even those get stuck. \
 	Please enable \"osd_enable_op_tracker\", and the tracker will start to track new ops received afterwards.";
-    } else {
-      op_tracker.dump_ops_in_flight(f, true);
     }
-
   } else if (command == "dump_historic_ops") {
-    RWLock::RLocker l(op_tracker.lock);
-    if (!op_tracker.tracking_enabled) {
+    if (!op_tracker.dump_historic_ops(f)) {
       ss << "op_tracker tracking is not enabled now, so no ops are tracked currently, even those get stuck. \
 	Please enable \"osd_enable_op_tracker\", and the tracker will start to track new ops received afterwards.";
-    } else {
-      op_tracker.dump_historic_ops(f);
     }
   } else if (command == "dump_op_pq_state") {
     f->open_object_section("pq");
@@ -2028,6 +2019,9 @@ int OSD::init()
     daily_loadavg = 1.0;
   }
 
+  int rotating_auth_attempts = 0;
+  const int max_rotating_auth_attempts = 10;
+
   // read superblock
   r = read_superblock();
   if (r < 0) {
@@ -2177,6 +2171,14 @@ int OSD::init()
 
   while (monc->wait_auth_rotating(30.0) < 0) {
     derr << "unable to obtain rotating service keys; retrying" << dendl;
+    ++rotating_auth_attempts;
+    if (rotating_auth_attempts > max_rotating_auth_attempts) {
+        osd_lock.Lock(); // make locker happy
+        if (!is_stopping()) {
+            r = - ETIMEDOUT;
+        }
+        goto monout;
+    }
   }
 
   osd_lock.Lock();
@@ -2456,7 +2458,6 @@ void OSD::create_logger()
   osd_plb.add_u64(l_osd_pg_replica, "numpg_replica", "Placement groups for which this osd is replica"); // num replica pgs
   osd_plb.add_u64(l_osd_pg_stray, "numpg_stray", "Placement groups ready to be deleted from this osd");   // num stray pgs
   osd_plb.add_u64(l_osd_hb_to, "heartbeat_to_peers", "Heartbeat (ping) peers we send to");     // heartbeat peers we send to
-  osd_plb.add_u64(l_osd_hb_from, "heartbeat_from_peers", "Heartbeat (ping) peers we recv from"); // heartbeat peers we recv from
   osd_plb.add_u64_counter(l_osd_map, "map_messages", "OSD map messages");           // osdmap messages
   osd_plb.add_u64_counter(l_osd_mape, "map_message_epochs", "OSD map epochs");         // osdmap epochs
   osd_plb.add_u64_counter(l_osd_mape_dup, "map_message_epoch_dups", "OSD map duplicates"); // dup osdmap epochs
@@ -2517,7 +2518,7 @@ void OSD::create_recoverystate_perf()
   rs_perf.add_time_avg(rs_repnotrecovering_latency, "repnotrecovering_latency", "Repnotrecovering recovery state latency");
   rs_perf.add_time_avg(rs_repwaitrecoveryreserved_latency, "repwaitrecoveryreserved_latency", "Rep wait recovery reserved recovery state latency");
   rs_perf.add_time_avg(rs_repwaitbackfillreserved_latency, "repwaitbackfillreserved_latency", "Rep wait backfill reserved recovery state latency");
-  rs_perf.add_time_avg(rs_RepRecovering_latency, "RepRecovering_latency", "RepRecovering recovery state latency");
+  rs_perf.add_time_avg(rs_reprecovering_latency, "reprecovering_latency", "RepRecovering recovery state latency");
   rs_perf.add_time_avg(rs_activating_latency, "activating_latency", "Activating recovery state latency");
   rs_perf.add_time_avg(rs_waitlocalrecoveryreserved_latency, "waitlocalrecoveryreserved_latency", "Wait local recovery reserved recovery state latency");
   rs_perf.add_time_avg(rs_waitremoterecoveryreserved_latency, "waitremoterecoveryreserved_latency", "Wait remote recovery reserved recovery state latency");
@@ -3358,14 +3359,7 @@ void OSD::build_past_intervals_parallel()
     PG *pg = i->first;
     pistate& p = i->second;
 
-    // Verify same_interval_since is correct
-    if (pg->info.history.same_interval_since) {
-      if (pg->info.history.same_interval_since != p.same_interval_since) {
-	dout(0) << __func__ << " history same_interval_since " << pg->info.history.same_interval_since << dendl;
-	dout(0) << __func__ << " same_interval_since " << p.same_interval_since << " pg " << *pg << dendl;
-      }
-      assert(pg->info.history.same_interval_since == p.same_interval_since);
-    } else {
+    if (pg->info.history.same_interval_since == 0) {
       assert(p.same_interval_since);
       dout(10) << __func__ << " fix same_interval_since " << p.same_interval_since << " pg " << *pg << dendl;
       dout(10) << __func__ << " past_intervals " << pg->past_intervals << dendl;
@@ -3409,7 +3403,6 @@ void OSD::handle_pg_peering_evt(
   const pg_history_t& orig_history,
   pg_interval_map_t& pi,
   epoch_t epoch,
-  bool same_primary,
   PG::CephPeeringEvtRef evt)
 {
   if (service.splitting(pgid)) {
@@ -3431,14 +3424,7 @@ void OSD::handle_pg_peering_evt(
     bool valid_history = project_pg_history(
       pgid, history, epoch, up, up_primary, acting, acting_primary);
 
-    if (same_primary && epoch < history.same_primary_since) {
-      dout(10) << "get_or_create_pg " << pgid << " primary changed in "
-	       << history.same_primary_since << " (msg from " << epoch << ")"
-	       << dendl;
-      return;
-    }
-    if (!valid_history ||
-	(!same_primary && epoch < history.same_interval_since)) {
+    if (!valid_history || epoch < history.same_interval_since) {
       dout(10) << "get_or_create_pg " << pgid << " acting changed in "
 	       << history.same_interval_since << " (msg from " << epoch << ")"
 	       << dendl;
@@ -3564,15 +3550,7 @@ void OSD::handle_pg_peering_evt(
   } else {
     // already had it.  did the mapping change?
     PG *pg = _lookup_lock_pg(pgid);
-    if (same_primary && epoch < pg->info.history.same_primary_since) {
-      dout(10) << "get_or_create_pg " << pgid << " primary changed in "
-	       << pg->info.history.same_primary_since
-	       << " (msg from " << epoch << ")"
-	       << dendl;
-      pg->unlock();
-      return;
-    }
-    if (!same_primary && epoch < pg->info.history.same_interval_since) {
+    if (epoch < pg->info.history.same_interval_since) {
       dout(10) << *pg << " get_or_create_pg acting changed in "
 	       << pg->info.history.same_interval_since
 	       << " (msg from " << epoch << ")" << dendl;
@@ -3949,7 +3927,7 @@ void OSD::handle_osd_ping(MOSDPing *m)
       if (i != heartbeat_peers.end()) {
 	if (m->get_connection() == i->second.con_back) {
 	  dout(25) << "handle_osd_ping got reply from osd." << from
-		   << " first_rx " << i->second.first_tx
+		   << " first_tx " << i->second.first_tx
 		   << " last_tx " << i->second.last_tx
 		   << " last_rx_back " << i->second.last_rx_back << " -> " << m->stamp
 		   << " last_rx_front " << i->second.last_rx_front
@@ -3960,7 +3938,7 @@ void OSD::handle_osd_ping(MOSDPing *m)
 	    i->second.last_rx_front = m->stamp;
 	} else if (m->get_connection() == i->second.con_front) {
 	  dout(25) << "handle_osd_ping got reply from osd." << from
-		   << " first_rx " << i->second.first_tx
+		   << " first_tx " << i->second.first_tx
 		   << " last_tx " << i->second.last_tx
 		   << " last_rx_back " << i->second.last_rx_back
 		   << " last_rx_front " << i->second.last_rx_front << " -> " << m->stamp
@@ -4043,6 +4021,13 @@ void OSD::heartbeat_check()
   for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
        p != heartbeat_peers.end();
        ++p) {
+
+    if (p->second.first_tx == utime_t()) {  // first_tx still default: nothing sent
+      dout(25) << "heartbeat_check we haven't sent ping to osd." << p->first
+               << " yet, skipping" << dendl;
+      continue;
+    }
+
     dout(25) << "heartbeat_check osd." << p->first
 	     << " first_tx " << p->second.first_tx
 	     << " last_tx " << p->second.last_tx
@@ -4121,7 +4106,6 @@ void OSD::heartbeat()
   heartbeat_check();
 
   logger->set(l_osd_hb_to, heartbeat_peers.size());
-  logger->set(l_osd_hb_from, 0);
 
   // hmm.. am i all alone?
   dout(30) << "heartbeat lonely?" << dendl;
@@ -4284,6 +4268,7 @@ void OSD::tick_without_osd_lock()
 	dout(20) << __func__ << " stats backoff " << backoff
 		 << " adjusted_min " << adjusted_min << " - sending report"
 		 << dendl;
+        osd_stat_updated = true;
 	report = true;
       }
     }
@@ -4933,6 +4918,17 @@ void OSD::request_full_map(epoch_t first, epoch_t last)
   monc->send_mon_message(req);
 }
 
+void OSD::finish_full_map_request()
+{
+  if (requested_full_first == 0 && requested_full_last == 0)
+    return;  // no full-map request is outstanding, nothing to give up on
+  // Abandon the outstanding request: some requested maps were not received in
+  // this message, possibly because the monitor capped it at osd_map_message_max.
+  dout(10) << __func__ << " still missing " << requested_full_first
+	   << ".." << requested_full_last << ", but now give up." << dendl;
+  requested_full_first = requested_full_last = 0;
+}
+
 void OSD::got_full_map(epoch_t e)
 {
   assert(requested_full_first <= requested_full_last);
@@ -4973,9 +4969,9 @@ void OSD::requeue_failures()
   unsigned old_pending = failure_pending.size();
   for (map<int,pair<utime_t,entity_inst_t> >::iterator p =
 	 failure_pending.begin();
-       p != failure_pending.end();
-       ++p) {
+       p != failure_pending.end(); ) {
     failure_queue[p->first] = p->second.first;
+    failure_pending.erase(p++);
   }
   dout(10) << __func__ << " " << old_queue << " + " << old_pending << " -> "
 	   << failure_queue.size() << dendl;
@@ -5053,7 +5049,7 @@ void OSD::send_pg_stats(const utime_t &now)
       pg->pg_stats_publish_lock.Unlock();
     }
 
-    if (!outstanding_pg_stats.empty()) {
+    if (last_pg_stats_ack == utime_t() || !outstanding_pg_stats.empty()) {
       last_pg_stats_ack = ceph_clock_now(cct);
     }
     outstanding_pg_stats.insert(tid);
@@ -5070,7 +5066,6 @@ void OSD::handle_pg_stats_ack(MPGStatsAck *ack)
   dout(10) << "handle_pg_stats_ack " << dendl;
 
   if (!require_mon_peer(ack)) {
-    ack->put();
     return;
   }
 
@@ -5567,7 +5562,7 @@ void OSD::do_command(Connection *con, ceph_tid_t tid, vector<string>& cmd, buffe
       f->open_object_section("osd_bench_results");
       f->dump_int("bytes_written", count);
       f->dump_int("blocksize", bsize);
-      f->dump_float("bytes_per_sec", rate);
+      f->dump_unsigned("bytes_per_sec", rate);
       f->close_section();
       f->flush(ss);
     } else {
@@ -5605,19 +5600,11 @@ void OSD::do_command(Connection *con, ceph_tid_t tid, vector<string>& cmd, buffe
 	goto out;
     }
 
-    std::set <spg_t> keys;
+    fout << "*** osd " << whoami << ": dump_missing ***" << std::endl;
     RWLock::RLocker l(pg_map_lock);
     for (ceph::unordered_map<spg_t, PG*>::const_iterator pg_map_e = pg_map.begin();
 	 pg_map_e != pg_map.end(); ++pg_map_e) {
-      keys.insert(pg_map_e->first);
-    }
-
-    fout << "*** osd " << whoami << ": dump_missing ***" << std::endl;
-    for (std::set <spg_t>::iterator p = keys.begin();
-	 p != keys.end(); ++p) {
-      ceph::unordered_map<spg_t, PG*>::iterator q = pg_map.find(*p);
-      assert(q != pg_map.end());
-      PG *pg = q->second;
+      PG *pg = pg_map_e->second;
       pg->lock();
 
       fout << *pg << std::endl;
@@ -5871,7 +5858,7 @@ void OSD::ms_fast_dispatch(Message *m)
     m->put();
     return;
   }
-  OpRequestRef op = op_tracker.create_request<OpRequest>(m);
+  OpRequestRef op = op_tracker.create_request<OpRequest, Message*>(m);
   {
 #ifdef WITH_LTTNG
     osd_reqid_t reqid = op->get_reqid();
@@ -6706,6 +6693,9 @@ void OSD::handle_osd_map(MOSDMap *m)
   // even if this map isn't from a mon, we may have satisfied our subscription
   monc->sub_got("osdmap", last);
 
+  if (!m->maps.empty())
+    finish_full_map_request();
+
   if (last <= superblock.newest_map) {
     dout(10) << " no new maps here, dropping" << dendl;
     m->put();
@@ -6756,6 +6746,10 @@ void OSD::_committed_osd_maps(epoch_t first, epoch_t last, MOSDMap *m)
 {
   dout(10) << __func__ << " " << first << ".." << last << dendl;
   Mutex::Locker l(osd_lock);
+  if (is_stopping()) {
+    dout(10) << __func__ << " bailing, we are shutting down" << dendl;
+    return;
+  }
   map_lock.get_write();
 
   bool do_shutdown = false;
@@ -6907,6 +6901,12 @@ void OSD::_committed_osd_maps(epoch_t first, epoch_t last, MOSDMap *m)
 	       osd_markdown_log.front() + grace < now)
 	  osd_markdown_log.pop_front();
 	if ((int)osd_markdown_log.size() > g_conf->osd_max_markdown_count) {
+	  dout(10) << __func__ << " marked down "
+		   << osd_markdown_log.size()
+		   << " > osd_max_markdown_count "
+		   << g_conf->osd_max_markdown_count
+		   << " in last " << grace << " seconds, shutting down"
+		   << dendl;
 	  do_restart = false;
 	  do_shutdown = true;
 	}
@@ -7495,16 +7495,28 @@ void OSD::handle_pg_create(OpRequestRef op)
     // cannot be on the other side of a map gap
     assert(valid_history);
 
+    // The mon won't resend unless the primary changed, so
+    // we ignore same_interval_since.  We'll pass this history
+    // to handle_pg_peering_evt with the current epoch as the
+    // event -- the project_pg_history check in
+    // handle_pg_peering_evt will be a noop.
+    if (history.same_primary_since > m->epoch) {
+      dout(10) << __func__ << ": got obsolete pg create on pgid "
+	       << pgid << " from epoch " << m->epoch
+	       << ", primary changed in " << history.same_primary_since
+	       << dendl;
+      continue;
+    }
+
     handle_pg_peering_evt(
       pgid,
       history,
       pi,
-      m->epoch,
-      true,  // same primary, bc this is a create
+      osdmap->get_epoch(),
       PG::CephPeeringEvtRef(
 	new PG::CephPeeringEvt(
-	  m->epoch,
-	  m->epoch,
+	  osdmap->get_epoch(),
+	  osdmap->get_epoch(),
 	  PG::NullEvt()))
       );
   }
@@ -7733,7 +7745,6 @@ void OSD::handle_pg_notify(OpRequestRef op)
       spg_t(it->first.info.pgid.pgid, it->first.to),
       it->first.info.history, it->second,
       it->first.query_epoch,
-      false, // same interval
       PG::CephPeeringEvtRef(
 	new PG::CephPeeringEvt(
 	  it->first.epoch_sent, it->first.query_epoch,
@@ -7765,7 +7776,6 @@ void OSD::handle_pg_log(OpRequestRef op)
   handle_pg_peering_evt(
     spg_t(m->info.pgid.pgid, m->to),
     m->info.history, m->past_intervals, m->get_epoch(),
-    false, // same interval
     PG::CephPeeringEvtRef(
       new PG::CephPeeringEvt(
 	m->get_epoch(), m->get_query_epoch(),
@@ -7799,7 +7809,6 @@ void OSD::handle_pg_info(OpRequestRef op)
     handle_pg_peering_evt(
       spg_t(p->first.info.pgid.pgid, p->first.to),
       p->first.info.history, p->second, p->first.epoch_sent,
-      false, // same interval
       PG::CephPeeringEvtRef(
 	new PG::CephPeeringEvt(
 	  p->first.epoch_sent, p->first.query_epoch,
@@ -7831,40 +7840,39 @@ void OSD::handle_pg_trim(OpRequestRef op)
 
   op->mark_started();
 
-  if (!_have_pg(m->pgid)) {
+  PG *pg = _lookup_lock_pg(m->pgid);
+  if(!pg) {
     dout(10) << " don't have pg " << m->pgid << dendl;
-  } else {
-    PG *pg = _lookup_lock_pg(m->pgid);
-    assert(pg);
-    if (m->epoch < pg->info.history.same_interval_since) {
-      dout(10) << *pg << " got old trim to " << m->trim_to << ", ignoring" << dendl;
-      pg->unlock();
-      return;
-    }
+    return;
+  }
 
-    if (pg->is_primary()) {
-      // peer is informing us of their last_complete_ondisk
-      dout(10) << *pg << " replica osd." << from << " lcod " << m->trim_to << dendl;
-      pg->peer_last_complete_ondisk[pg_shard_t(from, m->pgid.shard)] =
-	m->trim_to;
-      if (pg->calc_min_last_complete_ondisk()) {
-	dout(10) << *pg << " min lcod now " << pg->min_last_complete_ondisk << dendl;
-	pg->trim_peers();
-      }
-    } else {
-      // primary is instructing us to trim
-      ObjectStore::Transaction t;
-      PG::PGLogEntryHandler handler;
-      pg->pg_log.trim(&handler, m->trim_to, pg->info);
-      handler.apply(pg, &t);
-      pg->dirty_info = true;
-      pg->write_if_dirty(t);
-      int tr = store->queue_transaction(
-	pg->osr.get(), std::move(t), NULL);
-      assert(tr == 0);
-    }
+  if (m->epoch < pg->info.history.same_interval_since) {
+    dout(10) << *pg << " got old trim to " << m->trim_to << ", ignoring" << dendl;
     pg->unlock();
+    return;
+  }
+
+  if (pg->is_primary()) {
+    // peer is informing us of their last_complete_ondisk
+    dout(10) << *pg << " replica osd." << from << " lcod " << m->trim_to << dendl;
+    pg->peer_last_complete_ondisk[pg_shard_t(from, m->pgid.shard)] =
+      m->trim_to;
+    if (pg->calc_min_last_complete_ondisk()) {
+      dout(10) << *pg << " min lcod now " << pg->min_last_complete_ondisk << dendl;
+      pg->trim_peers();
+    }
+  } else {
+    // primary is instructing us to trim
+    ObjectStore::Transaction t;
+    PG::PGLogEntryHandler handler;
+    pg->pg_log.trim(&handler, m->trim_to, pg->info);
+    handler.apply(pg, &t);
+    pg->dirty_info = true;
+    pg->write_if_dirty(t);
+    int tr = store->queue_transaction(pg->osr.get(), std::move(t), NULL);
+    assert(tr == 0);
   }
+  pg->unlock();
 }
 
 void OSD::handle_pg_backfill_reserve(OpRequestRef op)
@@ -7905,12 +7913,11 @@ void OSD::handle_pg_backfill_reserve(OpRequestRef op)
     return;
   }
 
-  PG *pg = 0;
-  if (!_have_pg(m->pgid))
+  PG *pg = _lookup_lock_pg(m->pgid);
+  if (!pg) {
+    dout(10) << " don't have pg " << m->pgid << dendl;
     return;
-
-  pg = _lookup_lock_pg(m->pgid);
-  assert(pg);
+  }
 
   pg->queue_peering_event(evt);
   pg->unlock();
@@ -7954,12 +7961,11 @@ void OSD::handle_pg_recovery_reserve(OpRequestRef op)
     return;
   }
 
-  PG *pg = 0;
-  if (!_have_pg(m->pgid))
+  PG *pg = _lookup_lock_pg(m->pgid);
+  if (!pg) {
+    dout(10) << " don't have pg " << m->pgid << dendl;
     return;
-
-  pg = _lookup_lock_pg(m->pgid);
-  assert(pg);
+  }
 
   pg->queue_peering_event(evt);
   pg->unlock();
diff --git a/src/osd/OSD.h b/src/osd/OSD.h
index d6a8d2d..9c4d68c 100644
--- a/src/osd/OSD.h
+++ b/src/osd/OSD.h
@@ -123,7 +123,6 @@ enum {
   l_osd_pg_replica,
   l_osd_pg_stray,
   l_osd_hb_to,
-  l_osd_hb_from,
   l_osd_map,
   l_osd_mape,
   l_osd_mape_dup,
@@ -181,7 +180,7 @@ enum {
   rs_repnotrecovering_latency,
   rs_repwaitrecoveryreserved_latency,
   rs_repwaitbackfillreserved_latency,
-  rs_RepRecovering_latency,
+  rs_reprecovering_latency,
   rs_activating_latency,
   rs_waitlocalrecoveryreserved_latency,
   rs_waitremoterecoveryreserved_latency,
@@ -869,7 +868,7 @@ public:
 	PGQueueable(
 	  PGScrub(pg->get_osdmap()->get_epoch()),
 	  cct->_conf->osd_scrub_cost,
-	  cct->_conf->osd_scrub_priority,
+	  pg->get_scrub_priority(),
 	  ceph_clock_now(cct),
 	  entity_inst_t())));
   }
@@ -1968,7 +1967,6 @@ protected:
     const pg_history_t& orig_history,
     pg_interval_map_t& pi,
     epoch_t epoch,
-    bool same_primary,
     PG::CephPeeringEvtRef evt);
   
   void load_pgs();
@@ -2052,6 +2050,7 @@ protected:
   epoch_t requested_full_first, requested_full_last;
 
   void request_full_map(epoch_t first, epoch_t last);
+  void finish_full_map_request();
   void got_full_map(epoch_t e);
 
   // -- failures --
diff --git a/src/osd/OSDMap.cc b/src/osd/OSDMap.cc
index 3798652..3d06df6 100644
--- a/src/osd/OSDMap.cc
+++ b/src/osd/OSDMap.cc
@@ -2641,7 +2641,7 @@ int OSDMap::build_simple(CephContext *cct, epoch_t e, uuid_d &fsid,
     set_max_osd(nosd);
   } else {
     // count osds
-    int maxosd = 0, numosd = 0;
+    int maxosd = 0;
     const md_config_t *conf = cct->_conf;
     vector<string> sections;
     conf->get_all_sections(sections);
@@ -2659,7 +2659,7 @@ int OSDMap::build_simple(CephContext *cct, epoch_t e, uuid_d &fsid,
 	lderr(cct) << "[osd." << o << "] in config has id > mon_max_osd " << cct->_conf->mon_max_osd << dendl;
 	return -ERANGE;
       }
-      numosd++;
+
       if (o > maxosd)
 	maxosd = o;
     }
diff --git a/src/osd/PG.cc b/src/osd/PG.cc
index 15dc0af..2804c73 100644
--- a/src/osd/PG.cc
+++ b/src/osd/PG.cc
@@ -303,9 +303,12 @@ void PG::proc_master_log(
   might_have_unfound.insert(from);
 
   // See doc/dev/osd_internals/last_epoch_started
-  if (oinfo.last_epoch_started > info.last_epoch_started)
+  if (oinfo.last_epoch_started > info.last_epoch_started) {
     info.last_epoch_started = oinfo.last_epoch_started;
-  info.history.merge(oinfo.history);
+    dirty_info = true;
+  }
+  if (info.history.merge(oinfo.history))
+    dirty_info = true;
   assert(cct->_conf->osd_find_best_info_ignore_history_les ||
 	 info.last_epoch_started >= info.history.last_epoch_started);
 
@@ -487,12 +490,17 @@ bool PG::MissingLoc::readable_with_acting(
 }
 
 void PG::MissingLoc::add_batch_sources_info(
-  const set<pg_shard_t> &sources)
+  const set<pg_shard_t> &sources, ThreadPool::TPHandle* handle)
 {
   dout(10) << __func__ << ": adding sources in batch " << sources.size() << dendl;
+  unsigned loop = 0;
   for (map<hobject_t, pg_missing_t::item, hobject_t::ComparatorWithDefault>::const_iterator i = needs_recovery_map.begin();
       i != needs_recovery_map.end();
       ++i) {
+    if (handle && ++loop >= g_conf->osd_loop_before_reset_tphandle) {
+      handle->reset_tp_timeout();
+      loop = 0;
+    }
     missing_loc[i->first].insert(sources.begin(), sources.end());
     missing_loc_sources.insert(sources.begin(), sources.end());
   }
@@ -506,14 +514,16 @@ bool PG::MissingLoc::add_source_info(
   ThreadPool::TPHandle* handle)
 {
   bool found_missing = false;
+  unsigned loop = 0;
   // found items?
   for (map<hobject_t,pg_missing_t::item, hobject_t::ComparatorWithDefault>::const_iterator p = needs_recovery_map.begin();
        p != needs_recovery_map.end();
        ++p) {
     const hobject_t &soid(p->first);
     eversion_t need = p->second.need;
-    if (handle) {
+    if (handle && ++loop >= g_conf->osd_loop_before_reset_tphandle) {
       handle->reset_tp_timeout();
+      loop = 0;
     }
     if (oinfo.last_update < need) {
       dout(10) << "search_for_missing " << soid << " " << need
@@ -713,8 +723,10 @@ void PG::generate_past_intervals()
   epoch_t cur_epoch, end_epoch;
   if (!_calc_past_interval_range(&cur_epoch, &end_epoch,
       osd->get_superblock().oldest_map)) {
-    if (info.history.same_interval_since == 0)
+    if (info.history.same_interval_since == 0) {
       info.history.same_interval_since = end_epoch;
+      dirty_info = true;
+    }
     return;
   }
 
@@ -966,8 +978,9 @@ PG::Scrubber::~Scrubber() {}
  *  3) Prefer current primary
  */
 map<pg_shard_t, pg_info_t>::const_iterator PG::find_best_info(
-  const map<pg_shard_t, pg_info_t> &infos) const
+  const map<pg_shard_t, pg_info_t> &infos, bool *history_les_bound) const
 {
+  assert(history_les_bound);
   /* See doc/dev/osd_internals/last_epoch_started.rst before attempting
    * to make changes to this process.  Also, make sure to update it
    * when you find bugs! */
@@ -978,6 +991,7 @@ map<pg_shard_t, pg_info_t>::const_iterator PG::find_best_info(
        ++i) {
     if (!cct->_conf->osd_find_best_info_ignore_history_les &&
 	max_last_epoch_started_found < i->second.history.last_epoch_started) {
+      *history_les_bound = true;
       max_last_epoch_started_found = i->second.history.last_epoch_started;
     }
     if (!i->second.is_incomplete() &&
@@ -1283,7 +1297,7 @@ void PG::calc_replicated_acting(
  * calculate the desired acting, and request a change with the monitor
  * if it differs from the current acting.
  */
-bool PG::choose_acting(pg_shard_t &auth_log_shard_id)
+bool PG::choose_acting(pg_shard_t &auth_log_shard_id, bool *history_les_bound)
 {
   map<pg_shard_t, pg_info_t> all_info(peer_info.begin(), peer_info.end());
   all_info[pg_whoami] = info;
@@ -1295,7 +1309,7 @@ bool PG::choose_acting(pg_shard_t &auth_log_shard_id)
   }
 
   map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard =
-    find_best_info(all_info);
+    find_best_info(all_info, history_les_bound);
 
   if (auth_log_shard == all_info.end()) {
     if (up != acting) {
@@ -1324,7 +1338,8 @@ bool PG::choose_acting(pg_shard_t &auth_log_shard_id)
 	complete_infos.insert(*i);
     }
     map<pg_shard_t, pg_info_t>::const_iterator i = find_best_info(
-      complete_infos);
+      complete_infos,
+      history_les_bound);
     if (i != complete_infos.end()) {
       auth_log_shard = all_info.find(i->first);
     }
@@ -1797,7 +1812,7 @@ void PG::activate(ObjectStore::Transaction& t,
       // and covers vast majority of the use cases, like one OSD/host is down for
       // a while for hardware repairing
       if (complete_shards.size() + 1 == actingbackfill.size()) {
-        missing_loc.add_batch_sources_info(complete_shards);
+        missing_loc.add_batch_sources_info(complete_shards, ctx->handle);
       } else {
         missing_loc.add_source_info(pg_whoami, info, pg_log.get_missing(),
 				    get_sort_bitwise(), ctx->handle);
@@ -2003,12 +2018,7 @@ void PG::_activate_committed(epoch_t epoch, epoch_t activation_epoch)
     }
   }
 
-  if (dirty_info) {
-    ObjectStore::Transaction t;
-    write_if_dirty(t);
-    int tr = osd->store->queue_transaction(osr.get(), std::move(t), NULL);
-    assert(tr == 0);
-  }
+  assert(!dirty_info);
 
   unlock();
 }
@@ -2079,6 +2089,14 @@ bool PG::queue_scrub()
   return true;
 }
 
+unsigned PG::get_scrub_priority()
+{
+  // Larger value == higher priority; a positive pool SCRUB_PRIORITY overrides osd_scrub_priority.
+  int prio = 0;  // remains 0 unless opts.get() stores a value (presumably only when the option is set)
+  pool.info.opts.get(pool_opts_t::SCRUB_PRIORITY, &prio);
+  return prio <= 0 ? cct->_conf->osd_scrub_priority : prio;
+}
+
 struct C_PG_FinishRecovery : public Context {
   PGRef pg;
   explicit C_PG_FinishRecovery(PG *p) : pg(p) {}
@@ -4624,7 +4642,8 @@ void PG::merge_new_log_entries(
 void PG::update_history_from_master(pg_history_t new_history)
 {
   unreg_next_scrub();
-  info.history.merge(new_history);
+  if (info.history.merge(new_history))
+    dirty_info = true;
   reg_next_scrub();
 }
 
@@ -4995,7 +5014,6 @@ void PG::start_peering_interval(
   state_clear(PG_STATE_RECOVERY_WAIT);
   state_clear(PG_STATE_RECOVERING);
 
-  peer_missing.clear();
   peer_purged.clear();
   actingbackfill.clear();
   snap_trim_queued = false;
@@ -5625,7 +5643,6 @@ boost::statechart::result PG::RecoveryState::Initial::react(const MNotifyRec& no
   PG *pg = context< RecoveryMachine >().pg;
   pg->proc_replica_info(
     notify.from, notify.notify.info, notify.notify.epoch_sent);
-  pg->update_heartbeat_peers();
   pg->set_last_peering_reset();
   return transit< Primary >();
 }
@@ -5889,7 +5906,8 @@ void PG::RecoveryState::Primary::exit()
 /*---------Peering--------*/
 PG::RecoveryState::Peering::Peering(my_context ctx)
   : my_base(ctx),
-    NamedState(context< RecoveryMachine >().pg->cct, "Started/Primary/Peering")
+    NamedState(context< RecoveryMachine >().pg->cct, "Started/Primary/Peering"),
+    history_les_bound(false)
 {
   context< RecoveryMachine >().log_enter(state_name);
 
@@ -5962,6 +5980,14 @@ boost::statechart::result PG::RecoveryState::Peering::react(const QueryState& q)
   }
   q.f->close_section();
 
+  if (history_les_bound) {
+    q.f->open_array_section("peering_blocked_by_detail");
+    q.f->open_object_section("item");
+    q.f->dump_string("detail","peering_blocked_by_history_les_bound");
+    q.f->close_section();
+    q.f->close_section();
+  }
+
   q.f->close_section();
   return forward_event();
 }
@@ -6328,7 +6354,7 @@ void PG::RecoveryState::RepRecovering::exit()
   PG *pg = context< RecoveryMachine >().pg;
   pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
   utime_t dur = ceph_clock_now(pg->cct) - enter_time;
-  pg->osd->recoverystate_perf->tinc(rs_RepRecovering_latency, dur);
+  pg->osd->recoverystate_perf->tinc(rs_reprecovering_latency, dur);
 }
 
 /*------Activating--------*/
@@ -6385,9 +6411,6 @@ PG::RecoveryState::WaitRemoteRecoveryReserved::react(const RemoteRecoveryReserve
 
   if (remote_recovery_reservation_it != context< Active >().remote_shards_to_reserve_recovery.end()) {
     assert(*remote_recovery_reservation_it != pg->pg_whoami);
-  }
-
-  if (remote_recovery_reservation_it != context< Active >().remote_shards_to_reserve_recovery.end()) {
     ConnectionRef con = pg->osd->get_con_osd_cluster(
       remote_recovery_reservation_it->osd, pg->get_osdmap()->get_epoch());
     if (con) {
@@ -6493,11 +6516,15 @@ PG::RecoveryState::Recovered::Recovered(my_context ctx)
   // DEGRADED | UNDERSIZED is appropriate.
   assert(!pg->actingbackfill.empty());
   if (pg->get_osdmap()->get_pg_size(pg->info.pgid.pgid) <=
-      pg->actingbackfill.size())
+      pg->actingbackfill.size()) {
     pg->state_clear(PG_STATE_DEGRADED);
+    pg->publish_stats_to_osd();
+  }
 
   // adjust acting set?  (e.g. because backfill completed...)
-  if (pg->acting != pg->up && !pg->choose_acting(auth_log_shard))
+  bool history_les_bound = false;
+  if (pg->acting != pg->up && !pg->choose_acting(auth_log_shard,
+						 &history_les_bound))
     assert(pg->want_acting.size());
 
   if (context< Active >().all_replicas_activated)
@@ -6621,6 +6648,7 @@ boost::statechart::result PG::RecoveryState::Active::react(const AdvMap& advmap)
     }
   }
 
+  bool need_publish = false;
   /* Check for changes in pool size (if the acting set changed as a result,
    * this does not matter) */
   if (advmap.lastmap->get_pg_size(pg->info.pgid.pgid) !=
@@ -6636,16 +6664,19 @@ boost::statechart::result PG::RecoveryState::Active::react(const AdvMap& advmap)
       pg->state_set(PG_STATE_UNDERSIZED);
       pg->state_set(PG_STATE_DEGRADED);
     }
-    pg->publish_stats_to_osd(); // degraded may have changed
+    need_publish = true; // degraded may have changed
   }
 
   // if we haven't reported our PG stats in a long time, do so now.
   if (pg->info.stats.reported_epoch + pg->cct->_conf->osd_pg_stat_report_interval_max < advmap.osdmap->get_epoch()) {
     dout(20) << "reporting stats to osd after " << (advmap.osdmap->get_epoch() - pg->info.stats.reported_epoch)
 	     << " epochs" << dendl;
-    pg->publish_stats_to_osd();
+    need_publish = true;
   }
 
+  if (need_publish)
+    pg->publish_stats_to_osd();
+
   return forward_event();
 }
     
@@ -6828,6 +6859,7 @@ boost::statechart::result PG::RecoveryState::Active::react(const AllReplicasActi
 
   // info.last_epoch_started is set during activate()
   pg->info.history.last_epoch_started = pg->info.last_epoch_started;
+  pg->dirty_info = true;
 
   pg->share_pg_info();
   pg->publish_stats_to_osd();
@@ -7219,6 +7251,7 @@ boost::statechart::result PG::RecoveryState::GetInfo::react(const MNotifyRec& in
 	  if (!any_up_complete_now && any_down_now) {
 	    dout(10) << " no osds up+complete from interval " << interval << dendl;
 	    pg->state_set(PG_STATE_DOWN);
+            pg->publish_stats_to_osd();
 	    return discard_event();
 	  }
 	  break;
@@ -7280,7 +7313,8 @@ PG::RecoveryState::GetLog::GetLog(my_context ctx)
   PG *pg = context< RecoveryMachine >().pg;
 
   // adjust acting?
-  if (!pg->choose_acting(auth_log_shard)) {
+  if (!pg->choose_acting(auth_log_shard,
+      &context< Peering >().history_les_bound)) {
     if (!pg->want_acting.empty()) {
       post_event(NeedActingChange());
     } else {
@@ -7916,6 +7950,14 @@ bool PG::PriorSet::affected_by_map(const OSDMapRef osdmap, const PG *debug_pg) c
       dout(10) << "affected_by_map osd." << o << " no longer exists" << dendl;
       return true;
     }
+    // did a down osd in down get (re)marked as lost?
+    map<int, epoch_t>::const_iterator r = blocked_by.find(o);
+    if (r != blocked_by.end()) {
+      if (osdmap->get_info(o).lost_at != r->second) {
+  dout(10) << "affected_by_map osd." << o << " (re)marked as lost" << dendl;
+  return true;
+      }
+    } 
   }
 
   return false;
diff --git a/src/osd/PG.h b/src/osd/PG.h
index 1df129f..50a8d72 100644
--- a/src/osd/PG.h
+++ b/src/osd/PG.h
@@ -409,7 +409,8 @@ public:
 
     /// Adds recovery sources in batch
     void add_batch_sources_info(
-      const set<pg_shard_t> &sources  ///< [in] a set of resources which can be used for all objects
+      const set<pg_shard_t> &sources,  ///< [in] a set of resources which can be used for all objects
+      ThreadPool::TPHandle* handle  ///< [in] ThreadPool handle
       );
 
     /// Uses osdmap to update structures for now down sources
@@ -1037,7 +1038,8 @@ public:
   void trim_write_ahead();
 
   map<pg_shard_t, pg_info_t>::const_iterator find_best_info(
-    const map<pg_shard_t, pg_info_t> &infos) const;
+    const map<pg_shard_t, pg_info_t> &infos,
+    bool *history_les_bound) const;
   static void calc_ec_acting(
     map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
     unsigned size,
@@ -1066,7 +1068,8 @@ public:
     set<pg_shard_t> *acting_backfill,
     pg_shard_t *want_primary,
     ostream &ss);
-  bool choose_acting(pg_shard_t &auth_log_shard);
+  bool choose_acting(pg_shard_t &auth_log_shard,
+		     bool *history_les_bound);
   void build_might_have_unfound();
   void replay_queued_ops();
   void activate(
@@ -1306,11 +1309,6 @@ public:
     int seed,
     const pg_pool_t *pool,
     ObjectStore::Transaction *t) = 0;
-  virtual bool _report_snap_collection_errors(
-    const hobject_t &hoid,
-    const map<string, bufferptr> &attrs,
-    pg_shard_t osd,
-    ostream &out) { return false; }
   void clear_scrub_reserved();
   void scrub_reserve_replicas();
   void scrub_unreserve_replicas();
@@ -1728,6 +1726,7 @@ public:
 
     struct Peering : boost::statechart::state< Peering, Primary, GetInfo >, NamedState {
       std::unique_ptr< PriorSet > prior_set;
+      bool history_les_bound;  //< need osd_find_best_info_ignore_history_les
 
       explicit Peering(my_context ctx);
       void exit();
@@ -2263,6 +2262,7 @@ public:
   void queue_snap_trim();
   bool requeue_scrub();
   bool queue_scrub();
+  unsigned get_scrub_priority();
 
   /// share pg info after a pg is active
   void share_pg_info();
diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc
index a50dc51..3958f89 100644
--- a/src/osd/ReplicatedPG.cc
+++ b/src/osd/ReplicatedPG.cc
@@ -925,8 +925,10 @@ void ReplicatedPG::do_pg_op(OpRequestRef op)
       // fall through
 
     case CEPH_OSD_OP_PGNLS:
-      if (m->get_pg() != info.pgid.pgid) {
-        dout(10) << " pgnls pg=" << m->get_pg() << " != " << info.pgid << dendl;
+      if (get_osdmap()->raw_pg_to_pg(m->get_pg()) != info.pgid.pgid) {
+        dout(10) << " pgnls pg=" << m->get_pg()
+		 << " " << get_osdmap()->raw_pg_to_pg(m->get_pg())
+		 << " != " << info.pgid << dendl;
 	result = 0; // hmm?
       } else {
 	unsigned list_size = MIN(cct->_conf->osd_max_pgls, p->op.pgls.count);
@@ -946,7 +948,21 @@ void ReplicatedPG::do_pg_op(OpRequestRef op)
 
 	hobject_t next;
 	hobject_t lower_bound = response.handle;
-        dout(10) << " pgnls lower_bound " << lower_bound << dendl;
+	hobject_t pg_start = info.pgid.pgid.get_hobj_start();
+	hobject_t pg_end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
+        dout(10) << " pgnls lower_bound " << lower_bound
+		 << " pg_end " << pg_end << dendl;
+	if (get_sort_bitwise() &&
+	    ((lower_bound != hobject_t::get_max() &&
+	      cmp_bitwise(lower_bound, pg_end) >= 0) ||
+	     (lower_bound != hobject_t() &&
+	      cmp_bitwise(lower_bound, pg_start) < 0))) {
+	  // this should only happen with a buggy client.
+	  dout(10) << "outside of PG bounds " << pg_start << " .. "
+		   << pg_end << dendl;
+	  result = -EINVAL;
+	  break;
+	}
 
 	hobject_t current = lower_bound;
 	osr->flush();
@@ -1081,7 +1097,7 @@ void ReplicatedPG::do_pg_op(OpRequestRef op)
 	} else {
           response.handle = next;
         }
-        dout(10) << "pgls handle=" << response.handle << dendl;
+        dout(10) << "pgnls handle=" << response.handle << dendl;
 	::encode(response, osd_op.outdata);
 	if (filter)
 	  ::encode(filter_out, osd_op.outdata);
@@ -1113,8 +1129,10 @@ void ReplicatedPG::do_pg_op(OpRequestRef op)
       // fall through
 
     case CEPH_OSD_OP_PGLS:
-      if (m->get_pg() != info.pgid.pgid) {
-        dout(10) << " pgls pg=" << m->get_pg() << " != " << info.pgid << dendl;
+      if (get_osdmap()->raw_pg_to_pg(m->get_pg()) != info.pgid.pgid) {
+        dout(10) << " pgls pg=" << m->get_pg()
+		 << " " << get_osdmap()->raw_pg_to_pg(m->get_pg())
+		 << " != " << info.pgid << dendl;
 	result = 0; // hmm?
       } else {
 	unsigned list_size = MIN(cct->_conf->osd_max_pgls, p->op.pgls.count);
@@ -2255,6 +2273,7 @@ ReplicatedPG::cache_result_t ReplicatedPG::maybe_handle_cache_detail(
     return cache_result_t::NOOP;
 
   case pg_pool_t::CACHEMODE_FORWARD:
+    // FIXME: this mode allows requests to be reordered.
     do_cache_redirect(op);
     return cache_result_t::HANDLED_REDIRECT;
 
@@ -2289,6 +2308,28 @@ ReplicatedPG::cache_result_t ReplicatedPG::maybe_handle_cache_detail(
     do_cache_redirect(op);
     return cache_result_t::HANDLED_REDIRECT;
 
+  case pg_pool_t::CACHEMODE_PROXY:
+    if (!must_promote) {
+      if (op->may_write() || op->may_cache() || write_ordered) {
+	if (can_proxy_write) {
+	  do_proxy_write(op, missing_oid);
+	  return cache_result_t::HANDLED_PROXY;
+	}
+      } else {
+	do_proxy_read(op);
+	return cache_result_t::HANDLED_PROXY;
+      }
+    }
+    // ugh, we're forced to promote.
+    if (agent_state &&
+	agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
+      dout(20) << __func__ << " cache pool full, waiting" << dendl;
+      block_write_on_full_cache(missing_oid, op);
+      return cache_result_t::BLOCKED_FULL;
+    }
+    promote_object(obc, missing_oid, oloc, op, promote_obc);
+    return cache_result_t::BLOCKED_PROMOTE;
+
   case pg_pool_t::CACHEMODE_READPROXY:
     // Do writeback to the cache tier for writes
     if (op->may_write() || write_ordered || must_promote) {
@@ -5516,7 +5557,7 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
 	tracepoint(osd, do_osd_op_pre_omapgetkeys, soid.oid.name.c_str(), soid.snap.val, start_after.c_str(), max_return);
 	set<string> out_set;
 
-	if (pool.info.supports_omap()) {
+	if (oi.is_omap()) {
 	  ObjectMap::ObjectMapIterator iter = osd->store->get_omap_iterator(
 	    coll, ghobject_t(soid)
 	    );
@@ -5553,7 +5594,7 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
 	tracepoint(osd, do_osd_op_pre_omapgetvals, soid.oid.name.c_str(), soid.snap.val, start_after.c_str(), max_return, filter_prefix.c_str());
 	map<string, bufferlist> out_set;
 
-	if (pool.info.supports_omap()) {
+	if (oi.is_omap()) {
 	  ObjectMap::ObjectMapIterator iter = osd->store->get_omap_iterator(
 	    coll, ghobject_t(soid)
 	    );
@@ -5579,7 +5620,7 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
 
     case CEPH_OSD_OP_OMAPGETHEADER:
       tracepoint(osd, do_osd_op_pre_omapgetheader, soid.oid.name.c_str(), soid.snap.val);
-      if (!pool.info.supports_omap()) {
+      if (!oi.is_omap()) {
 	// return empty header
 	break;
       }
@@ -5605,7 +5646,7 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
 	}
 	tracepoint(osd, do_osd_op_pre_omapgetvalsbykeys, soid.oid.name.c_str(), soid.snap.val, list_entries(keys_to_get).c_str());
 	map<string, bufferlist> out;
-	if (pool.info.supports_omap()) {
+	if (oi.is_omap()) {
 	  osd->store->omap_get_values(ch, ghobject_t(soid), keys_to_get, &out);
 	} // else return empty omap entries
 	::encode(out, osd_op.outdata);
@@ -5635,7 +5676,7 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
 	
 	map<string, bufferlist> out;
 
-	if (pool.info.supports_omap()) {
+	if (oi.is_omap()) {
 	  set<string> to_get;
 	  for (map<string, pair<bufferlist, int> >::iterator i = assertions.begin();
 	       i != assertions.end();
@@ -5761,10 +5802,13 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
 	  result = -ENOENT;
 	  break;
 	}
-	t->omap_clear(soid);
-	ctx->delta_stats.num_wr++;
+	if (oi.is_omap()) {
+	  t->omap_clear(soid);
+	  ctx->delta_stats.num_wr++;
+	  obs.oi.clear_omap_digest();
+	  obs.oi.clear_flag(object_info_t::FLAG_OMAP);
+	}
       }
-      obs.oi.clear_omap_digest();
       break;
 
     case CEPH_OSD_OP_OMAPRMKEYS:
diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc
index 96e6795..c5f7906 100644
--- a/src/osd/osd_types.cc
+++ b/src/osd/osd_types.cc
@@ -966,7 +966,9 @@ static opt_mapping_t opt_mapping = boost::assign::map_list_of
            ("recovery_priority", pool_opts_t::opt_desc_t(
              pool_opts_t::RECOVERY_PRIORITY, pool_opts_t::INT))
            ("recovery_op_priority", pool_opts_t::opt_desc_t(
-             pool_opts_t::RECOVERY_OP_PRIORITY, pool_opts_t::INT));
+             pool_opts_t::RECOVERY_OP_PRIORITY, pool_opts_t::INT))
+           ("scrub_priority", pool_opts_t::opt_desc_t(
+             pool_opts_t::SCRUB_PRIORITY, pool_opts_t::INT));
 
 bool pool_opts_t::is_opt_name(const std::string& name) {
     return opt_mapping.find(name) != opt_mapping.end();
@@ -2244,9 +2246,14 @@ void pg_stat_t::dump(Formatter *f) const
   f->dump_stream("last_deep_scrub") << last_deep_scrub;
   f->dump_stream("last_deep_scrub_stamp") << last_deep_scrub_stamp;
   f->dump_stream("last_clean_scrub_stamp") << last_clean_scrub_stamp;
-  f->dump_unsigned("log_size", log_size);
-  f->dump_unsigned("ondisk_log_size", ondisk_log_size);
-  f->dump_stream("stats_invalid") << stats_invalid;
+  f->dump_int("log_size", log_size);
+  f->dump_int("ondisk_log_size", ondisk_log_size);
+  f->dump_bool("stats_invalid", stats_invalid);
+  f->dump_bool("dirty_stats_invalid", dirty_stats_invalid);
+  f->dump_bool("omap_stats_invalid", omap_stats_invalid);
+  f->dump_bool("hitset_stats_invalid", hitset_stats_invalid);
+  f->dump_bool("hitset_bytes_stats_invalid", hitset_bytes_stats_invalid);
+  f->dump_bool("pin_stats_invalid", pin_stats_invalid);
   stats.dump(f);
   f->open_array_section("up");
   for (vector<int32_t>::const_iterator p = up.begin(); p != up.end(); ++p)
diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h
index adfe50f..2dd16b0 100644
--- a/src/osd/osd_types.h
+++ b/src/osd/osd_types.h
@@ -963,7 +963,8 @@ public:
     SCRUB_MAX_INTERVAL,
     DEEP_SCRUB_INTERVAL,
     RECOVERY_PRIORITY,
-    RECOVERY_OP_PRIORITY
+    RECOVERY_OP_PRIORITY,
+    SCRUB_PRIORITY
   };
 
   enum type_t {
@@ -1125,7 +1126,8 @@ struct pg_pool_t {
     CACHEMODE_FORWARD = 2,               ///< forward if not in cache
     CACHEMODE_READONLY = 3,              ///< handle reads, forward writes [not strongly consistent]
     CACHEMODE_READFORWARD = 4,           ///< forward reads, write to cache flush later
-    CACHEMODE_READPROXY = 5              ///< proxy reads, write to cache flush later
+    CACHEMODE_READPROXY = 5,             ///< proxy reads, write to cache flush later
+    CACHEMODE_PROXY = 6,                 ///< proxy if not in cache
   } cache_mode_t;
   static const char *get_cache_mode_name(cache_mode_t m) {
     switch (m) {
@@ -1135,6 +1137,7 @@ struct pg_pool_t {
     case CACHEMODE_READONLY: return "readonly";
     case CACHEMODE_READFORWARD: return "readforward";
     case CACHEMODE_READPROXY: return "readproxy";
+    case CACHEMODE_PROXY: return "proxy";
     default: return "unknown";
     }
   }
@@ -1151,6 +1154,8 @@ struct pg_pool_t {
       return CACHEMODE_READFORWARD;
     if (s == "readproxy")
       return CACHEMODE_READPROXY;
+    if (s == "proxy")
+      return CACHEMODE_PROXY;
     return (cache_mode_t)-1;
   }
   const char *get_cache_mode_name() const {
@@ -1161,6 +1166,7 @@ struct pg_pool_t {
     case CACHEMODE_NONE:
     case CACHEMODE_FORWARD:
     case CACHEMODE_READONLY:
+    case CACHEMODE_PROXY:
       return false;
     case CACHEMODE_WRITEBACK:
     case CACHEMODE_READFORWARD:
diff --git a/src/osdc/Objecter.cc b/src/osdc/Objecter.cc
index 56b91e5..b208254 100644
--- a/src/osdc/Objecter.cc
+++ b/src/osdc/Objecter.cc
@@ -2276,11 +2276,6 @@ ceph_tid_t Objecter::_op_submit(Op *op, shunique_lock& sul)
   _send_op_account(op);
 
   // send?
-  ldout(cct, 10) << "_op_submit oid " << op->target.base_oid
-		 << " '" << op->target.base_oloc << "' '"
-		 << op->target.target_oloc << "' " << op->ops << " tid "
-		 << op->tid << " osd." << (!s->is_homeless() ? s->osd : -1)
-		 << dendl;
 
   assert(op->target.flags & (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE));
 
@@ -2321,6 +2316,13 @@ ceph_tid_t Objecter::_op_submit(Op *op, shunique_lock& sul)
   OSDSession::unique_lock sl(s->lock);
   if (op->tid == 0)
     op->tid = last_tid.inc();
+
+  ldout(cct, 10) << "_op_submit oid " << op->target.base_oid
+		 << " '" << op->target.base_oloc << "' '"
+		 << op->target.target_oloc << "' " << op->ops << " tid "
+		 << op->tid << " osd." << (!s->is_homeless() ? s->osd : -1)
+		 << dendl;
+
   _session_op_assign(s, op);
 
   if (need_send) {
@@ -2661,7 +2663,15 @@ int Objecter::_calc_target(op_target_t *t, epoch_t *last_force_resend,
       t->osd = -1;
       return RECALC_OP_TARGET_POOL_DNE;
     }
-    pgid = osdmap->raw_pg_to_pg(t->base_pgid);
+    if (osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE)) {
+      // if the SORTBITWISE flag is set, we know all OSDs are running
+      // jewel+.
+      pgid = t->base_pgid;
+    } else {
+      // legacy behavior.  pre-jewel OSDs will fail if we send a
+      // full-hash pgid value.
+      pgid = osdmap->raw_pg_to_pg(t->base_pgid);
+    }
   } else {
     int ret = osdmap->object_locator_to_pg(t->target_oid, t->target_oloc,
 					   pgid);
diff --git a/src/osdc/Striper.cc b/src/osdc/Striper.cc
index 851f067..2c83520 100644
--- a/src/osdc/Striper.cc
+++ b/src/osdc/Striper.cc
@@ -402,7 +402,7 @@ void Striper::StripedReadResult::assemble_result(CephContext *cct, char *buffer,
   uint64_t end = p->first + p->second.second;
   while (p != partial.rend()) {
     // sanity check
-    ldout(cct, 0) << "assemble_result(" << this << ") " << p->first << "~" << p->second.second
+    ldout(cct, 20) << "assemble_result(" << this << ") " << p->first << "~" << p->second.second
 		   << " " << p->second.first.length() << " bytes"
 		   << dendl;
     assert(p->first == end - p->second.second);
diff --git a/src/pybind/Makefile.am b/src/pybind/Makefile.am
index 8f0a5bd..cfef3ae 100644
--- a/src/pybind/Makefile.am
+++ b/src/pybind/Makefile.am
@@ -2,11 +2,14 @@
 if ENABLE_CLIENT
 if WITH_CYTHON
 
+CYTHON_BUILD_DIR="$(shell readlink -f $(builddir))/build"
+
 PY_DISTUTILS = \
+	mkdir -p $(CYTHON_BUILD_DIR); \
 	CPPFLAGS="-iquote \${abs_srcdir}/include ${AM_CPPFLAGS} ${CPPFLAGS}" \
 	CFLAGS="-iquote \${abs_srcdir}/include ${AM_CFLAGS} ${PYTHON_CFLAGS}" \
 	LDFLAGS="-L\${abs_builddir}/.libs $(subst -pie,,${AM_LDFLAGS}) ${PYTHON_LDFLAGS}" \
-	CYTHON_BUILD_DIR="$(shell readlink -f $(builddir))/build" \
+	CYTHON_BUILD_DIR=$(CYTHON_BUILD_DIR) \
 	${PYTHON} ./setup.py
 
 if WITH_RADOS
diff --git a/src/pybind/ceph_argparse.py b/src/pybind/ceph_argparse.py
index 7d5f4cf..021c53d 100644
--- a/src/pybind/ceph_argparse.py
+++ b/src/pybind/ceph_argparse.py
@@ -120,6 +120,9 @@ class CephArgtype(object):
         """
         return '<{0}>'.format(self.__class__.__name__)
 
+    def complete(self, s):
+        return []
+
 
 class CephInt(CephArgtype):
     """
@@ -219,6 +222,12 @@ class CephString(CephArgtype):
             b += '(goodchars {0})'.format(self.goodchars)
         return '<string{0}>'.format(b)
 
+    def complete(self, s):
+        if s == '':
+            return []
+        else:
+            return [s]
+
 
 class CephSocketpath(CephArgtype):
     """
@@ -450,6 +459,10 @@ class CephChoices(CephArgtype):
         else:
             return '{0}'.format('|'.join(self.strings))
 
+    def complete(self, s):
+        all_elems = [token for token in self.strings if token.startswith(s)]
+        return all_elems
+
 
 class CephFilepath(CephArgtype):
     """
@@ -536,6 +549,12 @@ class CephPrefix(CephArgtype):
     def __str__(self):
         return self.prefix
 
+    def complete(self, s):
+        if self.prefix.startswith(s):
+            return [self.prefix.rstrip(' ')]
+        else:
+            return []
+
 
 class argdesc(object):
     """
@@ -618,6 +637,9 @@ class argdesc(object):
             s = '{' + s + '}'
         return s
 
+    def complete(self, s):
+        return self.instance.complete(s)
+
 
 def concise_sig(sig):
     """
diff --git a/src/pybind/cephfs/cephfs.pyx b/src/pybind/cephfs/cephfs.pyx
index 7121636..d3b581b 100644
--- a/src/pybind/cephfs/cephfs.pyx
+++ b/src/pybind/cephfs/cephfs.pyx
@@ -123,6 +123,7 @@ cdef extern from "cephfs/libcephfs.h" nogil:
                       void *value, size_t size)
     int ceph_write(ceph_mount_info *cmount, int fd, const char *buf, int64_t size, int64_t offset)
     int ceph_read(ceph_mount_info *cmount, int fd, char *buf, int64_t size, int64_t offset)
+    int ceph_flock(ceph_mount_info *cmount, int fd, int operation, uint64_t owner)
     int ceph_close(ceph_mount_info *cmount, int fd)
     int ceph_open(ceph_mount_info *cmount, const char *path, int flags, mode_t mode)
     int ceph_mkdir(ceph_mount_info *cmount, const char *path, mode_t mode)
@@ -182,16 +183,23 @@ class IncompleteWriteError(Error):
 class LibCephFSStateError(Error):
     pass
 
+class WouldBlock(Error):
+    pass
+
+class OutOfRange(Error):
+    pass
 
 cdef errno_to_exception =  {
-    errno.EPERM     : PermissionError,
-    errno.ENOENT    : ObjectNotFound,
-    errno.EIO       : IOError,
-    errno.ENOSPC    : NoSpace,
-    errno.EEXIST    : ObjectExists,
-    errno.ENODATA   : NoData,
-    errno.EINVAL    : InvalidValue,
-    errno.EOPNOTSUPP: OperationNotSupported,
+    errno.EPERM      : PermissionError,
+    errno.ENOENT     : ObjectNotFound,
+    errno.EIO        : IOError,
+    errno.ENOSPC     : NoSpace,
+    errno.EEXIST     : ObjectExists,
+    errno.ENODATA    : NoData,
+    errno.EINVAL     : InvalidValue,
+    errno.EOPNOTSUPP : OperationNotSupported,
+    errno.ERANGE     : OutOfRange,
+    errno.EWOULDBLOCK: WouldBlock,
 }
 
 
@@ -689,7 +697,25 @@ cdef class LibCephFS(object):
             raise make_ex(ret, "error in write")
         return ret
 
-    def getxattr(self, path, name):
+    def flock(self, fd, operation, owner):  # wraps libcephfs ceph_flock(); returns its result, raises if it is negative
+        self.require_state("mounted")
+        if not isinstance(fd, int):
+            raise TypeError('fd must be an int')
+        if not isinstance(operation, int):
+            raise TypeError('operation must be an int')
+
+        cdef:
+            int _fd = fd
+            int _op = operation
+            uint64_t _owner = owner
+
+        with nogil:  # call into libcephfs without holding the GIL
+            ret = ceph_flock(self.cluster, _fd, _op, _owner)
+        if ret < 0:
+            raise make_ex(ret, "error in flock")
+        return ret
+
+    def getxattr(self, path, name, size=255):
         self.require_state("mounted")
 
         path = cstr(path, 'path')
@@ -699,7 +725,7 @@ cdef class LibCephFS(object):
             char* _path = path
             char* _name = name
 
-            size_t ret_length = 255
+            size_t ret_length = size
             char *ret_buf = NULL
 
         try:
@@ -711,14 +737,6 @@ cdef class LibCephFS(object):
             if ret < 0:
                 raise make_ex(ret, "error in getxattr")
 
-            if ret > ret_length:
-                ret_buf = <char *>realloc_chk(ret_buf, ret)
-                with nogil:
-                    ret = ceph_getxattr(self.cluster, _path, _name, ret_buf,
-                                        ret)
-                if ret < 0:
-                    raise make_ex(ret, "error in getxattr")
-
             return ret_buf[:ret]
         finally:
             free(ret_buf)
diff --git a/src/pybind/cephfs/setup.py b/src/pybind/cephfs/setup.py
index cf29229..73488eb 100755
--- a/src/pybind/cephfs/setup.py
+++ b/src/pybind/cephfs/setup.py
@@ -42,7 +42,8 @@ setup(
     ext_modules = cythonize([
         Extension("cephfs",
             ["cephfs.pyx"],
-            libraries=["cephfs"]
+            libraries=["cephfs"],
+            language="c++"
             )
     ], build_dir=os.environ.get("CYTHON_BUILD_DIR", None), include_path=[
         os.path.join(os.path.dirname(__file__), "..", "rados")]
diff --git a/src/pybind/rados/rados.pyx b/src/pybind/rados/rados.pyx
index 86a4cb8..1f2a049 100644
--- a/src/pybind/rados/rados.pyx
+++ b/src/pybind/rados/rados.pyx
@@ -2025,7 +2025,7 @@ cdef class Ioctx(object):
 
         :raises: IoctxStateError
         """
-        if self.state != b"open":
+        if self.state != "open":
             raise IoctxStateError("The pool is %s" % self.state)
 
     def change_auid(self, auid):
diff --git a/src/pybind/rados/setup.py b/src/pybind/rados/setup.py
index 6a7b9bd..2b0f0b6 100755
--- a/src/pybind/rados/setup.py
+++ b/src/pybind/rados/setup.py
@@ -42,7 +42,8 @@ setup(
     ext_modules = cythonize([
         Extension("rados",
             ["rados.pyx"],
-            libraries=["rados"]
+            libraries=["rados"],
+            language="c++"
             )
     ], build_dir=os.environ.get("CYTHON_BUILD_DIR", None)),
     cmdclass={
diff --git a/src/pybind/rbd/setup.py b/src/pybind/rbd/setup.py
index ec2f3a2..7ca00b4 100755
--- a/src/pybind/rbd/setup.py
+++ b/src/pybind/rbd/setup.py
@@ -42,7 +42,8 @@ setup(
     ext_modules = cythonize([
         Extension("rbd",
             ["rbd.pyx"],
-            libraries=["rbd"]
+            libraries=["rbd"],
+            language="c++"
             )
     ], build_dir=os.environ.get("CYTHON_BUILD_DIR", None), include_path=[
         os.path.join(os.path.dirname(__file__), "..", "rados")]
diff --git a/src/rbdmap b/src/rbdmap
index 09145b2..da60b31 100755
--- a/src/rbdmap
+++ b/src/rbdmap
@@ -1,8 +1,12 @@
 #!/bin/sh
 
 do_map() {
+
+        # default to reasonable value if RBDMAPFILE not set in environment
+        printenv RBDMAPFILE >/dev/null || local RBDMAPFILE=/etc/ceph/rbdmap
+
 	if [ ! -f "$RBDMAPFILE" ]; then
-		logger -p "daemon.warning" -t init-rbdmap "No $RBDMAPFILE found."
+		logger -p "daemon.warning" -t rbdmap "No $RBDMAPFILE found."
 		exit 0
 	fi
 
@@ -19,7 +23,7 @@ do_map() {
 			DEV=rbd/$DEV
 			;;
 		esac
-		logger -p "daemon.debug" -t init-rbdmap "Mapping '${DEV}'"
+		logger -p "daemon.debug" -t rbdmap "Mapping '${DEV}'"
 		newrbd=""
 		MAP_RV=""
 		OIFS=$IFS
@@ -37,22 +41,22 @@ do_map() {
 			    newrbd="yes"
 			else
 			    RET=$((${RET}+$?))
-			    logger -p "daemon.warning" -t init-rbdmap "Failed to map '${DEV}"
+			    logger -p "daemon.warning" -t rbdmap "Failed to map '${DEV}'"
 			    continue
 			fi
 		fi
-		logger -p "daemon.debug" -t init-rbdmap "Mapped '${DEV}' to '${MAP_RV}'"
+		logger -p "daemon.debug" -t rbdmap "Mapped '${DEV}' to '${MAP_RV}'"
 
 		if [ "$newrbd" ]; then
 			## Mount new rbd
 			MNT_RV=""
 			mount --fake /dev/rbd/$DEV >>/dev/null 2>&1 \
 			&& MNT_RV=$(mount -vn /dev/rbd/$DEV 2>&1)
-			[ -n "${MNT_RV}" ] && logger -p "daemon.debug" -t init-rbdmap "Mounted '${MAP_RV}' to '${MNT_RV}'"
+			[ -n "${MNT_RV}" ] && logger -p "daemon.debug" -t rbdmap "Mounted '${MAP_RV}' to '${MNT_RV}'"
 
 			## post-mapping
 			if [ -x "/etc/ceph/rbd.d/${DEV}" ]; then
-			    logger -p "daemon.debug" -t init-rbdmap "Running post-map hook '/etc/ceph/rbd.d/${DEV}'"
+			    logger -p "daemon.debug" -t rbdmap "Running post-map hook '/etc/ceph/rbd.d/${DEV}'"
 			    /etc/ceph/rbd.d/${DEV} map "/dev/rbd/${DEV}"
 			fi
 		fi
@@ -71,32 +75,32 @@ do_unmap() {
 			    LL="${L##/dev/rbd/}"
 			    if [ "$(readlink -f $L)" = "${DEV}" ] \
 			    && [ -x "/etc/ceph/rbd.d/${LL}" ]; then
-			        logger -p "daemon.debug" -t init-rbdmap "Running pre-unmap hook for '${DEV}': '/etc/ceph/rbd.d/${LL}'"
+			        logger -p "daemon.debug" -t rbdmap "Running pre-unmap hook for '${DEV}': '/etc/ceph/rbd.d/${LL}'"
 			        /etc/ceph/rbd.d/${LL} unmap "$L"
 			        break
 			    fi
 			done
 
-			logger -p "daemon.debug" -t init-rbdmap "Unmapping '${DEV}'"
+			logger -p "daemon.debug" -t rbdmap "Unmapping '${DEV}'"
 			MNT=$(findmnt --mtab --source ${DEV} --noheadings | awk '{print $1'})
 			if [ -n "${MNT}" ]; then
-			    logger -p "daemon.debug" -t init-rbdmap "Unmounting '${MNT}'"
+			    logger -p "daemon.debug" -t rbdmap "Unmounting '${MNT}'"
 			    umount "${MNT}" >>/dev/null 2>&1
 			fi
 			if mountpoint -q "${MNT}"; then
 			    ## Un-mounting failed.
-			    logger -p "daemon.warning" -t init-rbdmap "Failed to unmount '${MNT}'"
+			    logger -p "daemon.warning" -t rbdmap "Failed to unmount '${MNT}'"
 			    RET=$((${RET}+1))
 			    continue
 			fi
 			## Un-mapping.
 			rbd unmap $DEV >>/dev/null 2>&1
 			if [ $? -ne 0 ]; then
-			    logger -p "daemon.warning" -t init-rbdmap "Failed to unmap '${MNT}'"
+			    logger -p "daemon.warning" -t rbdmap "Failed to unmap '${DEV}'"
 			    RET=$((${RET}+$?))
 			    continue
 			fi
-			logger -p "daemon.debug" -t init-rbdmap "Unmapped '${DEV}'"
+			logger -p "daemon.debug" -t rbdmap "Unmapped '${DEV}'"
 		done
 	fi
 	exit ${RET}
diff --git a/src/rgw/rgw_admin.cc b/src/rgw/rgw_admin.cc
index 0bb5570..1f3cb61 100644
--- a/src/rgw/rgw_admin.cc
+++ b/src/rgw/rgw_admin.cc
@@ -108,6 +108,7 @@ void _usage()
   cout << "  zonegroup-map get          show zonegroup-map\n";
   cout << "  zonegroup-map set          set zonegroup-map (requires infile)\n";
   cout << "  zone create                create a new zone\n";
+  cout << "  zone delete                delete a zone\n";
   cout << "  zone get                   show zone cluster params\n";
   cout << "  zone modify                set/clear zone master status\n";
   cout << "  zone set                   set zone cluster params (requires infile)\n";
@@ -150,7 +151,7 @@ void _usage()
   cout << "  replicalog get             get replica metadata log entry\n";
   cout << "  replicalog update          update replica metadata log entry\n";
   cout << "  replicalog delete          delete replica metadata log entry\n";
-  cout << "  orphans find               init and run search for leaked rados objects\n";
+  cout << "  orphans find               init and run search for leaked rados objects (use job-id, pool)\n";
   cout << "  orphans finish             clean up search for leaked rados objects\n";
   cout << "options:\n";
   cout << "   --tenant=<tenant>         tenant name\n";
@@ -183,7 +184,6 @@ void _usage()
   cout << "                               replica datalog get/delete\n";
   cout << "   --metadata-key=<key>      key to retrieve metadata from with metadata get\n";
   cout << "   --remote=<remote>         remote to pull period\n";
-  cout << "   --parent=<id>             parent period id\n";
   cout << "   --period=<id>             period id\n";
   cout << "   --epoch=<number>          period epoch\n";
   cout << "   --commit                  commit the period during 'period update'\n";
@@ -196,8 +196,11 @@ void _usage()
   cout << "   --realm-new-name=<realm new name> realm new name\n";
   cout << "   --rgw-zonegroup=<zonegroup>   zonegroup name\n";
   cout << "   --rgw-zone=<zone>         zone in which radosgw is running\n";
+  cout << "   --zone-id=<zone id>       zone id\n";
   cout << "   --zone-new-name=<zone>    zone new name\n";
+  cout << "   --source-zone             specify the source zone (for data sync)\n";
   cout << "   --default                 set entity (realm, zonegroup, zone) as default\n";
+  cout << "   --read-only               set zone as read-only (when adding to zonegroup)\n";
   cout << "   --endpoints=<list>        zone endpoints\n";
   cout << "   --fix                     besides checking bucket index, will also fix it\n";
   cout << "   --check-objects           bucket check: rebuilds bucket index according to\n";
@@ -234,6 +237,8 @@ void _usage()
   cout << "\nOrphans search options:\n";
   cout << "   --pool                    data pool to scan for leaked rados objects in\n";
   cout << "   --num-shards              num of shards to use for keeping the temporary scan info\n";
+  cout << "   --job-id                  set the job id (for orphans find)\n";
+  cout << "   --max-concurrent-ios      maximum concurrent ios for orphans find (default: 32)\n";
   cout << "\n";
   generic_client_usage();
 }
@@ -331,6 +336,7 @@ enum {
   OPT_SYNC_ERROR_LIST,
   OPT_BILOG_LIST,
   OPT_BILOG_TRIM,
+  OPT_BILOG_STATUS,
   OPT_DATA_SYNC_STATUS,
   OPT_DATA_SYNC_INIT,
   OPT_DATA_SYNC_RUN,
@@ -681,6 +687,8 @@ static int get_cmd(const char *cmd, const char *prev_cmd, const char *prev_prev_
       return OPT_BILOG_LIST;
     if (strcmp(cmd, "trim") == 0)
       return OPT_BILOG_TRIM;
+    if (strcmp(cmd, "status") == 0)
+      return OPT_BILOG_STATUS;
   } else if (strcmp(prev_cmd, "data") == 0) {
     if (strcmp(cmd, "sync") == 0) {
       *need_more = true;
@@ -1316,15 +1324,13 @@ static int send_to_remote_gateway(const string& remote, req_info& info,
   }
   rgw_user user;
   int ret = conn->forward(user, info, NULL, MAX_REST_RESPONSE, &in_data, &response);
-  if (ret < 0) {
-    return ret;
-  }
-  ret = parser.parse(response.c_str(), response.length());
-  if (ret < 0) {
+
+  int parse_ret = parser.parse(response.c_str(), response.length());
+  if (parse_ret < 0) {
     cerr << "failed to parse response" << std::endl;
-    return ret;
+    return parse_ret;
   }
-  return 0;
+  return ret;
 }
 
 static int send_to_url(const string& url, RGWAccessKey& key, req_info& info,
@@ -1335,15 +1341,13 @@ static int send_to_url(const string& url, RGWAccessKey& key, req_info& info,
 
   bufferlist response;
   int ret = req.forward_request(key, info, MAX_REST_RESPONSE, &in_data, &response);
-  if (ret < 0) {
-    return ret;
-  }
-  ret = parser.parse(response.c_str(), response.length());
-  if (ret < 0) {
+
+  int parse_ret = parser.parse(response.c_str(), response.length());
+  if (parse_ret < 0) {
     cout << "failed to parse response" << std::endl;
-    return ret;
+    return parse_ret;
   }
-  return 0;
+  return ret;
 }
 
 static int send_to_remote_or_url(const string& remote, const string& url,
@@ -1385,7 +1389,7 @@ static int commit_period(RGWRealm& realm, RGWPeriod& period,
       return ret;
     }
     // the master zone can commit locally
-    ret = period.commit(realm, current_period);
+    ret = period.commit(realm, current_period, cerr);
     if (ret < 0) {
       cerr << "failed to commit period: " << cpp_strerror(-ret) << std::endl;
     }
@@ -1410,6 +1414,12 @@ static int commit_period(RGWRealm& realm, RGWPeriod& period,
   int ret = send_to_remote_or_url(remote, url, access, secret, info, bl, p);
   if (ret < 0) {
     cerr << "request failed: " << cpp_strerror(-ret) << std::endl;
+
+    // did we parse an error message?
+    auto message = p.find_obj("Message");
+    if (message) {
+      cerr << "Reason: " << message->get_data() << std::endl;
+    }
     return ret;
   }
 
@@ -1928,7 +1938,7 @@ int main(int argc, char **argv)
   std::string date, subuser, access, format;
   std::string start_date, end_date;
   std::string key_type_str;
-  std::string period_id, period_epoch, remote, url, parent_period;
+  std::string period_id, period_epoch, remote, url;
   std::string master_zonegroup, master_zone;
   std::string realm_name, realm_id, realm_new_name;
   std::string zone_name, zone_id, zone_new_name;
@@ -2246,8 +2256,6 @@ int main(int argc, char **argv)
         cerr << "ERROR: invalid bucket index entry type" << std::endl;
         return EINVAL;
       }
-    } else if (ceph_argparse_witharg(args, i, &val, "--parent", (char*)NULL)) {
-      parent_period = val;
     } else if (ceph_argparse_binary_flag(args, i, &is_master_int, NULL, "--master", (char*)NULL)) {
       is_master = (bool)is_master_int;
       is_master_set = true;
@@ -2744,6 +2752,11 @@ int main(int argc, char **argv)
                                         info, bl, p);
         if (ret < 0) {
           cerr << "request failed: " << cpp_strerror(-ret) << std::endl;
+          if (ret == -EACCES) {
+            cerr << "If the realm has been changed on the master zone, the "
+                "master zone's gateway may need to be restarted to recognize "
+                "this user." << std::endl;
+          }
           return ret;
         }
         RGWRealm realm;
@@ -5224,6 +5237,29 @@ next:
     }
   }
 
+  if (opt_cmd == OPT_BILOG_STATUS) {  // print the markers returned by get_bi_log_status() for one bucket
+    if (bucket_name.empty()) {
+      cerr << "ERROR: bucket not specified" << std::endl;
+      return -EINVAL;
+    }
+    RGWBucketInfo bucket_info;
+    int ret = init_bucket(tenant, bucket_name, bucket_id, bucket_info, bucket);
+    if (ret < 0) {
+      cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret) << std::endl;
+      return -ret;
+    }
+    map<int, string> markers;  // filled in by get_bi_log_status() below
+    ret = store->get_bi_log_status(bucket, shard_id, markers);
+    if (ret < 0) {
+      cerr << "ERROR: get_bi_log_status(): " << cpp_strerror(-ret) << std::endl;
+      return -ret;
+    }
+    formatter->open_object_section("entries");
+    encode_json("markers", markers, formatter);
+    formatter->close_section();
+    formatter->flush(cout);
+  }
+
   if (opt_cmd == OPT_DATALOG_LIST) {
     formatter->open_array_section("entries");
     bool truncated;
diff --git a/src/rgw/rgw_civetweb.cc b/src/rgw/rgw_civetweb.cc
index 8f755d8..d4a03e1 100644
--- a/src/rgw/rgw_civetweb.cc
+++ b/src/rgw/rgw_civetweb.cc
@@ -53,8 +53,15 @@ int RGWMongoose::complete_request()
       /*
        * Status 204 should not include a content-length header
        * RFC7230 says so
+       *
+       * Same goes for status 304: Not Modified
+       *
+       * 'If a cache uses a received 304 response to update a cache entry,'
+       * 'the cache MUST update the entry to reflect any new field values'
+       * 'given in the response.'
+       *
        */
-      if (status_num == 204) {
+      if (status_num == 204 || status_num == 304) {
         has_content_length = true;
       } else if (0 && data.length() == 0) {
         has_content_length = true;
diff --git a/src/rgw/rgw_cr_rados.cc b/src/rgw/rgw_cr_rados.cc
index ee32052..4b91acf 100644
--- a/src/rgw/rgw_cr_rados.cc
+++ b/src/rgw/rgw_cr_rados.cc
@@ -460,6 +460,7 @@ int RGWAsyncFetchRemoteObj::_send_request()
                        user_id,
                        client_id,
                        op_id,
+                       false, /* don't record op state in ops log */
                        NULL, /* req_info */
                        source_zone,
                        dest_obj,
diff --git a/src/rgw/rgw_data_sync.cc b/src/rgw/rgw_data_sync.cc
index 3aa0f48..6d73b73 100644
--- a/src/rgw/rgw_data_sync.cc
+++ b/src/rgw/rgw_data_sync.cc
@@ -551,10 +551,10 @@ static string full_data_sync_index_shard_oid(const string& source_zone, int shar
 struct bucket_instance_meta_info {
   string key;
   obj_version ver;
-  time_t mtime;
+  utime_t mtime;
   RGWBucketInstanceMetadataObject data;
 
-  bucket_instance_meta_info() : mtime(0) {}
+  bucket_instance_meta_info() {}
 
   void decode_json(JSONObj *obj) {
     JSONDecoder::decode_json("key", key, obj);
@@ -1239,17 +1239,16 @@ public:
 
       yield {
         if  ((rgw_data_sync_info::SyncState)sync_status.sync_info.state == rgw_data_sync_info::StateSync) {
-          case rgw_data_sync_info::StateSync:
-            for (map<uint32_t, rgw_data_sync_marker>::iterator iter = sync_status.sync_markers.begin();
-                 iter != sync_status.sync_markers.end(); ++iter) {
-              RGWDataSyncShardControlCR *cr = new RGWDataSyncShardControlCR(sync_env, sync_env->store->get_zone_params().log_pool,
-                                                        iter->first, iter->second);
-              cr->get();
-              shard_crs_lock.Lock();
-              shard_crs[iter->first] = cr;
-              shard_crs_lock.Unlock();
-              spawn(cr, true);
-            }
+          for (map<uint32_t, rgw_data_sync_marker>::iterator iter = sync_status.sync_markers.begin();
+               iter != sync_status.sync_markers.end(); ++iter) {
+            RGWDataSyncShardControlCR *cr = new RGWDataSyncShardControlCR(sync_env, sync_env->store->get_zone_params().log_pool,
+                                                                          iter->first, iter->second);
+            cr->get();
+            shard_crs_lock.Lock();
+            shard_crs[iter->first] = cr;
+            shard_crs_lock.Unlock();
+            spawn(cr, true);
+          }
         }
       }
 
@@ -2202,6 +2201,8 @@ class RGWBucketShardIncrementalSyncCR : public RGWCoroutine {
   string instance;
   string ns;
 
+  string cur_id;
+
 
 
 public:
@@ -2270,32 +2271,44 @@ int RGWBucketShardIncrementalSyncCR::operate()
       entries_iter = list_result.begin();
       for (; entries_iter != list_result.end(); ++entries_iter) {
         entry = &(*entries_iter);
-        inc_marker.position = entry->id;
+        {
+          ssize_t p = entry->id.find('#'); /* entries might have explicit shard info in them, e.g., 6#00000000004.94.3 */
+          if (p < 0) {
+            cur_id = entry->id;
+          } else {
+            cur_id = entry->id.substr(p + 1);
+          }
+        }
+        inc_marker.position = cur_id;
 
         if (!rgw_obj::parse_raw_oid(entries_iter->object, &name, &instance, &ns)) {
           set_status() << "parse_raw_oid() on " << entries_iter->object << " returned false, skipping entry";
           ldout(sync_env->cct, 20) << "parse_raw_oid() on " << entries_iter->object << " returned false, skipping entry" << dendl;
+          marker_tracker->try_update_high_marker(cur_id, 0, entries_iter->timestamp);
           continue;
         }
 
-        ldout(sync_env->cct, 20) << "parsed entry: iter->object=" << entries_iter->object << " iter->instance=" << entries_iter->instance << " name=" << name << " instance=" << instance << " ns=" << ns << dendl;
+        ldout(sync_env->cct, 20) << "parsed entry: id=" << cur_id << " iter->object=" << entry->object << " iter->instance=" << entry->instance << " name=" << name << " instance=" << instance << " ns=" << ns << dendl;
 
         if (!ns.empty()) {
-          set_status() << "skipping entry in namespace: " << entries_iter->object;
-          ldout(sync_env->cct, 20) << "skipping entry in namespace: " << entries_iter->object << dendl;
+          set_status() << "skipping entry in namespace: " << entry->object;
+          ldout(sync_env->cct, 20) << "skipping entry in namespace: " << entry->object << dendl;
+          marker_tracker->try_update_high_marker(cur_id, 0, entry->timestamp);
           continue;
         }
 
-        key = rgw_obj_key(name, entries_iter->instance);
-        set_status() << "got entry.id=" << entry->id << " key=" << key << " op=" << (int)entry->op;
+        key = rgw_obj_key(name, entry->instance);
+        set_status() << "got entry.id=" << cur_id << " key=" << key << " op=" << (int)entry->op;
         if (entry->op == CLS_RGW_OP_CANCEL) {
           set_status() << "canceled operation, skipping";
           ldout(sync_env->cct, 20) << "[inc sync] skipping object: " << bucket_name << ":" << bucket_id << ":" << shard_id << "/" << key << ": canceled operation" << dendl;
+          marker_tracker->try_update_high_marker(cur_id, 0, entry->timestamp);
           continue;
         }
         if (entry->state != CLS_RGW_STATE_COMPLETE) {
           set_status() << "non-complete operation, skipping";
           ldout(sync_env->cct, 20) << "[inc sync] skipping object: " << bucket_name << ":" << bucket_id << ":" << shard_id << "/" << key << ": non-complete operation" << dendl;
+          marker_tracker->try_update_high_marker(cur_id, 0, entry->timestamp);
           continue;
         }
         ldout(sync_env->cct, 20) << "[inc sync] syncing object: " << bucket_name << ":" << bucket_id << ":" << shard_id << "/" << key << dendl;
@@ -2309,26 +2322,26 @@ int RGWBucketShardIncrementalSyncCR::operate()
           yield wait_for_child();
           
         }
-        if (!marker_tracker->index_key_to_marker(key, entry->op, entry->id)) {
+        if (!marker_tracker->index_key_to_marker(key, entry->op, cur_id)) {
           set_status() << "can't do op, sync already in progress for object";
-          ldout(sync_env->cct, 20) << __func__ << ": skipping sync of entry: " << entry->id << ":" << key << " sync already in progress for object" << dendl;
-          marker_tracker->try_update_high_marker(entry->id, 0, entries_iter->timestamp);
+          ldout(sync_env->cct, 20) << __func__ << ": skipping sync of entry: " << cur_id << ":" << key << " sync already in progress for object" << dendl;
+          marker_tracker->try_update_high_marker(cur_id, 0, entry->timestamp);
           continue;
         }
         // yield {
           set_status() << "start object sync";
-          if (!marker_tracker->start(entry->id, 0, entries_iter->timestamp)) {
-            ldout(sync_env->cct, 0) << "ERROR: cannot start syncing " << entry->id << ". Duplicate entry?" << dendl;
+          if (!marker_tracker->start(cur_id, 0, entry->timestamp)) {
+            ldout(sync_env->cct, 0) << "ERROR: cannot start syncing " << cur_id << ". Duplicate entry?" << dendl;
           } else {
             uint64_t versioned_epoch = 0;
             bucket_entry_owner owner(entry->owner, entry->owner_display_name);
             if (entry->ver.pool < 0) {
               versioned_epoch = entry->ver.epoch;
             }
-ldout(sync_env->cct, 0) << __FILE__ << ":" << __LINE__ << " entry->timestamp=" << entry->timestamp << dendl;
+            ldout(sync_env->cct, 20) << __func__ << "(): entry->timestamp=" << entry->timestamp << dendl;
             spawn(new RGWBucketSyncSingleEntryCR<string, rgw_obj_key>(sync_env, bucket_info, shard_id,
                                                          key, entry->is_versioned(), versioned_epoch, entry->timestamp, owner, entry->op,
-                                                         entry->state, entry->id, marker_tracker), false);
+                                                         entry->state, cur_id, marker_tracker), false);
           }
         // }
         while ((int)num_spawned() > spawn_window) {
@@ -2345,9 +2358,20 @@ ldout(sync_env->cct, 0) << __FILE__ << ":" << __LINE__ << " entry->timestamp=" <
       }
     } while (!list_result.empty());
 
+    yield {
+      call(marker_tracker->flush());
+    }
+    if (retcode < 0) {
+      ldout(sync_env->cct, 0) << "ERROR: marker_tracker->flush() returned retcode=" << retcode << dendl;
+      lease_cr->go_down();
+      drain_all();
+      return set_cr_error(retcode);
+    }
+
     lease_cr->go_down();
     /* wait for all operations to complete */
     drain_all();
+
     return set_cr_done();
   }
   return 0;
diff --git a/src/rgw/rgw_file.cc b/src/rgw/rgw_file.cc
index 53f764f..4c28867 100644
--- a/src/rgw/rgw_file.cc
+++ b/src/rgw/rgw_file.cc
@@ -253,6 +253,125 @@ namespace rgw {
     return rc;
   } /* RGWLibFS::rename */
 
+  MkObjResult RGWLibFS::mkdir(RGWFileHandle* parent, const char *name,
+			      struct stat *st, uint32_t mask, uint32_t flags)
+  {
+    /* Create a bucket (parent is root) or an empty "<name>/" directory object;
+     * returns {handle, 0} on success, {nullptr, non-zero status} on failure. */
+    MkObjResult mkr{nullptr, -EINVAL};
+    int rc, rc2;
+    LookupFHResult fhr;
+    RGWFileHandle* rgw_fh = nullptr;
+
+    if (parent->is_root()) {
+      /* bucket */
+      string bname{name};
+      /* enforce S3 name restrictions */
+      rc = valid_s3_bucket_name(bname, false /* relaxed */);
+      if (rc != 0) {
+	rc = -EINVAL;
+	goto out;
+      }
+
+      string uri = "/" + bname; /* XXX get rid of URI some day soon */
+      RGWCreateBucketRequest req(get_context(), get_user(), uri);
+      rc = rgwlib.get_fe()->execute_req(&req);
+      rc2 = req.get_ret();
+    } else {
+      /* create an object representing the directory */
+      buffer::list bl;
+      string dir_name = /* XXX get rid of this some day soon, too */
+	parent->relative_object_name();
+      /* creating objects w/leading '/' makes a mess */
+      if ((dir_name.size() > 0) &&
+	  (dir_name.back() != '/'))
+	dir_name += "/";
+      dir_name += name;
+      dir_name += "/";
+      RGWPutObjRequest req(get_context(), get_user(), parent->bucket_name(),
+			  dir_name, bl);
+      rc = rgwlib.get_fe()->execute_req(&req);
+      rc2 = req.get_ret();
+    }
+
+    if (rc == 0)
+      rc = rc2; /* never report success when the request itself failed */
+    if (rc == 0) {
+      fhr = lookup_fh(parent, name,
+		      RGWFileHandle::FLAG_CREATE|
+		      RGWFileHandle::FLAG_DIRECTORY);
+      rgw_fh = get<0>(fhr);
+      if (rgw_fh) {
+	/* XXX unify timestamps */
+	rgw_fh->create_stat(st, mask);
+	rgw_fh->set_times(real_clock::now());
+	rgw_fh->stat(st);
+	get<0>(mkr) = rgw_fh;
+      } else
+	rc = -EIO;
+    }
+  out:
+    get<1>(mkr) = rc;
+    return mkr;
+  } /* RGWLibFS::mkdir */
+
+  MkObjResult RGWLibFS::create(RGWFileHandle* parent, const char *name,
+			      struct stat *st, uint32_t mask, uint32_t flags)
+  {
+    /* Put an object built from an empty buffer::list under parent, unless
+     * rgw_lookup() already finds <name> (-EEXIST); returns {handle, status}. */
+    int rc, rc2;
+    using std::get;
+
+    rgw_file_handle *lfh;
+    rc = rgw_lookup(get_fs(), parent->get_fh(), name, &lfh,
+		    RGW_LOOKUP_FLAG_NONE);
+    if (! rc) {
+      /* conflict! */
+      rc = rgw_fh_rele(get_fs(), lfh, RGW_FH_RELE_FLAG_NONE);
+      return MkObjResult{nullptr, -EEXIST};
+    }
+
+    /* expand and check name */
+    std::string obj_name{parent->relative_object_name()};
+    if ((obj_name.size() > 0) &&
+	(obj_name.back() != '/'))
+      obj_name += "/";
+    obj_name += name;
+    if (! valid_s3_object_name(obj_name)) {
+      return MkObjResult{nullptr, -EINVAL};
+    }
+
+    /* create it */
+    buffer::list bl;
+    RGWPutObjRequest req(cct, get_user(), parent->bucket_name(), obj_name, bl);
+    MkObjResult mkr{nullptr, -EINVAL};
+
+    rc = rgwlib.get_fe()->execute_req(&req);
+    rc2 = req.get_ret();
+    if (rc == 0)
+      rc = rc2; /* never report success when the request itself failed */
+    if (rc == 0) {
+      /* XXX atomicity */
+      LookupFHResult fhr = lookup_fh(parent, name, RGWFileHandle::FLAG_CREATE);
+      RGWFileHandle* rgw_fh = get<0>(fhr);
+      if (rgw_fh) {
+	if (get<1>(fhr) & RGWFileHandle::FLAG_CREATE) {
+	  /* fill in stat data */
+	  rgw_fh->create_stat(st, mask);
+	  rgw_fh->set_times(real_clock::now());
+	  rgw_fh->open_for_create(); // XXX needed?
+	}
+	(void) rgw_fh->stat(st);
+	get<0>(mkr) = rgw_fh;
+      } else
+	rc = -EIO;
+    }
+
+    get<1>(mkr) = rc;
+    return mkr;
+  } /* RGWLibFS::create */
+
   int RGWLibFS::getattr(RGWFileHandle* rgw_fh, struct stat* st)
   {
     switch(rgw_fh->fh.fh_type) {
@@ -417,6 +536,8 @@ namespace rgw {
 	new RGWWriteRequest(fs->get_context(), fs->get_user(), this,
 			    bucket_name(), object_name);
       rc = rgwlib.get_fe()->start_req(f->write_req);
+      if (rc < 0)
+        return -EIO;
     }
 
     buffer::list bl;
@@ -708,14 +829,15 @@ int rgw_statfs(struct rgw_fs *rgw_fs,
   generic create -- create an empty regular file
 */
 int rgw_create(struct rgw_fs *rgw_fs,
-	       struct rgw_file_handle *parent_fh,
-	       const char *name, mode_t mode, struct stat *st,
-	       struct rgw_file_handle **fh, uint32_t flags)
+	      struct rgw_file_handle *parent_fh,
+	      const char *name, struct stat *st, uint32_t mask,
+	      struct rgw_file_handle **fh, uint32_t flags)
 {
-  RGWLibFS *fs = static_cast<RGWLibFS*>(rgw_fs->fs_private);
-  CephContext* cct = static_cast<CephContext*>(rgw_fs->rgw);
+  using std::get;
 
+  RGWLibFS *fs = static_cast<RGWLibFS*>(rgw_fs->fs_private);
   RGWFileHandle* parent = get_rgwfh(parent_fh);
+
   if ((! parent) ||
       (parent->is_root()) ||
       (parent->is_file())) {
@@ -723,124 +845,41 @@ int rgw_create(struct rgw_fs *rgw_fs,
     return -EINVAL;
   }
 
-  using std::get;
+  MkObjResult fhr = fs->create(parent, name, st, mask, flags);
+  RGWFileHandle *nfh = get<0>(fhr); // nullptr if !success
 
-  rgw_file_handle *lfh;
-  int rc = rgw_lookup(rgw_fs, parent_fh, name, &lfh,
-		      RGW_LOOKUP_FLAG_NONE);
-  if (! rc) {
-    /* conflict! */
-    rc = rgw_fh_rele(rgw_fs, lfh, RGW_FH_RELE_FLAG_NONE);
-    return -EEXIST;
-  } else {
-    /* expand and check name */
-    std::string obj_name{parent->relative_object_name()};
-    if ((obj_name.size() > 0) &&
-	(obj_name.back() != '/'))
-      obj_name += "/";
-    obj_name += name;
-    if (! valid_s3_object_name(obj_name)) {
-      return -EINVAL;
-    } else {
-      /* create it */
-      buffer::list bl;
-      RGWPutObjRequest req(cct, fs->get_user(), parent->bucket_name(),
-			   obj_name, bl);
-      rc = rgwlib.get_fe()->execute_req(&req);
-      int rc2 = req.get_ret();
-
-      if ((rc == 0) &&
-	  (rc2 == 0)) {
-	/* XXX atomicity */
-	LookupFHResult fhr = fs->lookup_fh(parent, name,
-					   RGWFileHandle::FLAG_CREATE);
-	RGWFileHandle* rgw_fh = get<0>(fhr);
-	if (rgw_fh) {
-	  if (get<1>(fhr) & RGWFileHandle::FLAG_CREATE) {
-	    /* fill in stat data */
-	    rgw_fh->set_times(real_clock::now());
-	    rgw_fh->open_for_create(); // XXX needed?
-	  }
-	  (void) rgw_fh->stat(st);
-	  struct rgw_file_handle *rfh = rgw_fh->get_fh();
-	  *fh = rfh;
-	} else
-	  rc = -EIO;
-      }
-    }
-  }
+  if (nfh)
+    *fh = nfh->get_fh();
 
-  return rc;
-}
+  return get<1>(fhr);
+} /* rgw_create */
 
 /*
   create a new directory
 */
 int rgw_mkdir(struct rgw_fs *rgw_fs,
 	      struct rgw_file_handle *parent_fh,
-	      const char *name, mode_t mode, struct stat *st,
+	      const char *name, struct stat *st, uint32_t mask,
 	      struct rgw_file_handle **fh, uint32_t flags)
 {
-  int rc, rc2;
+  using std::get;
 
   RGWLibFS *fs = static_cast<RGWLibFS*>(rgw_fs->fs_private);
-  CephContext* cct = static_cast<CephContext*>(rgw_fs->rgw);
-
   RGWFileHandle* parent = get_rgwfh(parent_fh);
+
   if (! parent) {
     /* bad parent */
     return -EINVAL;
   }
 
-  LookupFHResult fhr;
-  RGWFileHandle* rgw_fh = nullptr;
+  MkObjResult fhr = fs->mkdir(parent, name, st, mask, flags);
+  RGWFileHandle *nfh = get<0>(fhr); // nullptr if !success
 
-  if (parent->is_root()) {
-    /* bucket */
-    string bname{name};
-    /* enforce S3 name restrictions */
-    rc = valid_s3_bucket_name(bname, false /* relaxed */);
-    if (rc != 0)
-      return -EINVAL;
-    string uri = "/" + bname; /* XXX get rid of URI some day soon */
-    RGWCreateBucketRequest req(cct, fs->get_user(), uri);
-    rc = rgwlib.get_fe()->execute_req(&req);
-    rc2 = req.get_ret();
-  } else {
-    /* create an object representing the directory */
-    buffer::list bl;
-    string dir_name = /* XXX get rid of this some day soon, too */
-      parent->relative_object_name();
-    /* creating objects w/leading '/' makes a mess */
-    if ((dir_name.size() > 0) &&
-	(dir_name.back() != '/'))
-      dir_name += "/";
-    dir_name += name;
-    dir_name += "/";
-    RGWPutObjRequest req(cct, fs->get_user(), parent->bucket_name(),
-			 dir_name, bl);
-    rc = rgwlib.get_fe()->execute_req(&req);
-    rc2 = req.get_ret();
-  }
-
-  if ((rc == 0) &&
-      (rc2 == 0)) {
-    fhr = fs->lookup_fh(parent, name,
-			RGWFileHandle::FLAG_CREATE|
-			RGWFileHandle::FLAG_DIRECTORY);
-    rgw_fh = get<0>(fhr);
-    if (rgw_fh) {
-      /* XXX unify timestamps */
-      rgw_fh->set_times(real_clock::now());
-      rgw_fh->stat(st);
-      struct rgw_file_handle *rfh = rgw_fh->get_fh();
-      *fh = rfh;
-    } else
-      rc = -EIO;
-  }
+  if (nfh)
+    *fh = nfh->get_fh();
 
-  return rc;
-}
+  return get<1>(fhr);
+} /* rgw_mkdir */
 
 /*
   rename object
@@ -1081,18 +1120,8 @@ int rgw_write(struct rgw_fs *rgw_fs,
   if (! rgw_fh->is_open())
     return -EPERM;
 
-  std::cout << __func__ << " before write of "
-	    << length << " bytes at offset " << offset
-	    << std::endl;
-
   rc = rgw_fh->write(offset, length, bytes_written, buffer);
 
-  std::cout << __func__ << " after write of "
-	    << length << " bytes at offset " << offset
-	    << " wrote " << *bytes_written
-	    << " rc " << rc
-	    << std::endl;
-
   return rc;
 }
 
diff --git a/src/rgw/rgw_file.h b/src/rgw/rgw_file.h
index a6e06de..26b97b4 100644
--- a/src/rgw/rgw_file.h
+++ b/src/rgw/rgw_file.h
@@ -157,10 +157,14 @@ namespace rgw {
       uint64_t dev;
       size_t size;
       uint64_t nlink;
+      uint32_t owner_uid; /* XXX need Unix attr */
+      uint32_t owner_gid; /* XXX need Unix attr */
+      mode_t unix_mode;
       struct timespec ctime;
       struct timespec mtime;
       struct timespec atime;
-      state() : dev(0), size(0), nlink(1), ctime{0,0}, mtime{0,0}, atime{0,0} {}
+      state() : dev(0), size(0), nlink(1), owner_uid(0), owner_gid(0),
+		ctime{0,0}, mtime{0,0}, atime{0,0} {}
     } state;
 
     struct file {
@@ -224,10 +228,11 @@ namespace rgw {
 	variant_type = directory();
 	/* stat */
 	state.dev = fs_inst;
+	state.unix_mode = RGW_RWXMODE|S_IFDIR;
 	/* pointer to self */
 	fh.fh_private = this;
       }
-    
+
     void init_rootfs(std::string& fsid, const std::string& object_name) {
       /* fh_key */
       fh.fh_hk.bucket = XXH64(fsid.c_str(), fsid.length(), fh_key::seed);
@@ -264,6 +269,19 @@ namespace rgw {
       /* save constant fhk */
       fh.fh_hk = fhk.fh_hk; /* XXX redundant in fh_hk */
 
+      /* stat */
+      state.dev = fs_inst;
+
+      switch (fh.fh_type) {
+      case RGW_FS_TYPE_DIRECTORY:
+	state.unix_mode = RGW_RWXMODE|S_IFDIR;
+	break;
+      case RGW_FS_TYPE_FILE:
+	state.unix_mode = RGW_RWMODE|S_IFREG;
+      default:
+	break;
+      }
+
       /* pointer to self */
       fh.fh_private = this;
     }
@@ -290,16 +308,41 @@ namespace rgw {
 
     RGWFileHandle* get_parent() { return parent; }
 
+    uint32_t get_owner_uid() const { return state.owner_uid; }
+    uint32_t get_owner_gid() const { return state.owner_gid; }
+
     struct timespec get_mtime() const { return state.mtime; }
 
-    int stat(struct stat *st) {
+    void create_stat(struct stat* st, uint32_t mask) {
+      /* store the caller's attrs selected by mask in state (read by stat()) */
+      if (mask & RGW_SETATTR_UID)
+	state.owner_uid = st->st_uid;
+      if (mask & RGW_SETATTR_GID)
+	state.owner_gid = st->st_gid;
+      if (mask & RGW_SETATTR_MODE)  {
+	switch (fh.fh_type) {
+	case RGW_FS_TYPE_DIRECTORY:
+	  state.unix_mode = st->st_mode|S_IFDIR;
+	  break;
+	case RGW_FS_TYPE_FILE:
+	  state.unix_mode = st->st_mode|S_IFREG;
+	  break;
+	default:
+	  break;
+	}
+      }
+    }
+
+    int stat(struct stat* st) {
       /* partial Unix attrs */
       memset(st, 0, sizeof(struct stat));
       st->st_dev = state.dev;
       st->st_ino = fh.fh_hk.object; // XXX
 
-      st->st_uid = 0; // XXX
-      st->st_gid = 0; // XXX
+      st->st_uid = state.owner_uid;
+      st->st_gid = state.owner_gid;
+
+      st->st_mode = state.unix_mode;
 
       st->st_atim = state.atime;
       st->st_mtim = state.mtime;
@@ -567,6 +610,7 @@ namespace rgw {
   }
 
   typedef std::tuple<RGWFileHandle*, uint32_t> LookupFHResult;
+  typedef std::tuple<RGWFileHandle*, int> MkObjResult;
 
   class RGWLibFS
   {
@@ -780,6 +824,12 @@ namespace rgw {
     int rename(RGWFileHandle* old_fh, RGWFileHandle* new_fh,
 	       const char *old_name, const char *new_name);
 
+    MkObjResult create(RGWFileHandle* parent, const char *name, struct stat *st,
+		      uint32_t mask, uint32_t flags);
+
+    MkObjResult mkdir(RGWFileHandle* parent, const char *name, struct stat *st,
+		      uint32_t mask, uint32_t flags);
+
     int unlink(RGWFileHandle* parent, const char *name);
 
     /* find existing RGWFileHandle */
diff --git a/src/rgw/rgw_op.cc b/src/rgw/rgw_op.cc
index 2702111..66ecd55 100644
--- a/src/rgw/rgw_op.cc
+++ b/src/rgw/rgw_op.cc
@@ -1779,8 +1779,7 @@ int RGWCreateBucket::verify_permission()
     if (op_ret < 0)
       return op_ret;
 
-    map<string, RGWBucketEnt>& m = buckets.get_buckets();
-    if (m.size() >= s->user->max_buckets) {
+    if (buckets.count() >= s->user->max_buckets) {
       return -ERR_TOO_MANY_BUCKETS;
     }
   }
diff --git a/src/rgw/rgw_rados.cc b/src/rgw/rgw_rados.cc
index 7177788..2bcc7b6 100644
--- a/src/rgw/rgw_rados.cc
+++ b/src/rgw/rgw_rados.cc
@@ -350,7 +350,7 @@ int RGWZoneGroup::read_default_id(string& default_id, bool old_format)
   return RGWSystemMetaObj::read_default_id(default_id, old_format);
 }
 
-int RGWZoneGroup::set_as_default()
+int RGWZoneGroup::set_as_default(bool exclusive)
 {
   if (realm_id.empty()) {
     /* try using default realm */
@@ -363,7 +363,7 @@ int RGWZoneGroup::set_as_default()
     realm_id = realm.get_id();
   }
 
-  return RGWSystemMetaObj::set_as_default();
+  return RGWSystemMetaObj::set_as_default(exclusive);
 }
 
 int RGWSystemMetaObj::init(CephContext *_cct, RGWRados *_store, bool setup_obj, bool old_format)
@@ -441,7 +441,7 @@ int RGWSystemMetaObj::use_default(bool old_format)
   return read_default_id(id, old_format);
 }
 
-int RGWSystemMetaObj::set_as_default()
+int RGWSystemMetaObj::set_as_default(bool exclusive)
 {
   string pool_name = get_pool_name(cct);
   string oid  = get_default_oid();
@@ -454,7 +454,8 @@ int RGWSystemMetaObj::set_as_default()
 
   ::encode(default_info, bl);
 
-  int ret = rgw_put_system_obj(store, pool, oid, bl.c_str(), bl.length(), false, NULL, real_time(), NULL);
+  int ret = rgw_put_system_obj(store, pool, oid, bl.c_str(), bl.length(),
+                               exclusive, NULL, real_time(), NULL);
   if (ret < 0)
     return ret;
 
@@ -688,22 +689,13 @@ const string& RGWRealm::get_predefined_name(CephContext *cct) {
 
 int RGWRealm::create(bool exclusive)
 {
-  list<string> realms;
-  int ret = store->list_realms(realms);
-  if (ret < 0 && ret != -ENOENT) {
-    ldout(cct, 0) << "ERROR: listing realms, ret=" << ret << dendl;
-    return ret;
-  }
-
-  bool first_realm = realms.empty();
-
-  ret = RGWSystemMetaObj::create(exclusive);
+  int ret = RGWSystemMetaObj::create(exclusive);
   if (ret < 0) {
     ldout(cct, 0) << "ERROR creating new realm object " << name << ": " << cpp_strerror(-ret) << dendl;
     return ret;
   }
   // create the control object for watch/notify
-  ret = create_control();
+  ret = create_control(exclusive);
   if (ret < 0) {
     ldout(cct, 0) << "ERROR creating control for new realm " << name << ": " << cpp_strerror(-ret) << dendl;
     return ret;
@@ -733,12 +725,11 @@ int RGWRealm::create(bool exclusive)
     ldout(cct, 0) << "ERROR: failed set current period " << current_period << dendl;
     return ret;
   }
-
-  if (first_realm) { /* this is racy, but it's fine */
-    ret = set_as_default();
-    if (ret < 0) {
-      ldout(cct, 0) << "WARNING: failed to set realm as default realm, ret=" << ret << dendl;
-    }
+  // try to set as default. may race with another create, so pass exclusive=true
+  // so we don't override an existing default
+  ret = set_as_default(true);
+  if (ret < 0 && ret != -EEXIST) {
+    ldout(cct, 0) << "WARNING: failed to set realm as default realm, ret=" << ret << dendl;
   }
 
   return 0;
@@ -753,12 +744,12 @@ int RGWRealm::delete_obj()
   return delete_control();
 }
 
-int RGWRealm::create_control()
+int RGWRealm::create_control(bool exclusive)
 {
   auto pool_name = get_pool_name(cct);
   auto pool = rgw_bucket{pool_name.c_str()};
   auto oid = get_control_oid();
-  return rgw_put_system_obj(store, pool, oid, nullptr, 0, true,
+  return rgw_put_system_obj(store, pool, oid, nullptr, 0, exclusive,
                             nullptr, real_time(), nullptr);
 }
 
@@ -1273,27 +1264,32 @@ int RGWPeriod::update_sync_status()
   return 0;
 }
 
-int RGWPeriod::commit(RGWRealm& realm, const RGWPeriod& current_period)
+int RGWPeriod::commit(RGWRealm& realm, const RGWPeriod& current_period,
+                      std::ostream& error_stream)
 {
   ldout(cct, 20) << __func__ << " realm " << realm.get_id() << " period " << current_period.get_id() << dendl;
   // gateway must be in the master zone to commit
   if (master_zone != store->get_zone_params().get_id()) {
-    ldout(cct, 0) << "period commit on zone " << store->get_zone_params().get_id()
-        << ", not period's master zone " << master_zone << dendl;
+    error_stream << "Cannot commit period on zone "
+        << store->get_zone_params().get_id() << ", it must be sent to "
+        "the period's master zone " << master_zone << '.' << std::endl;
     return -EINVAL;
   }
   // period predecessor must match current period
   if (predecessor_uuid != current_period.get_id()) {
-    ldout(cct, 0) << "period predecessor " << predecessor_uuid
+    error_stream << "Period predecessor " << predecessor_uuid
         << " does not match current period " << current_period.get_id()
-        << dendl;
+        << ". Use 'period pull' to get the latest period from the master, "
+        "reapply your changes, and try again." << std::endl;
     return -EINVAL;
   }
   // realm epoch must be 1 greater than current period
   if (realm_epoch != current_period.get_realm_epoch() + 1) {
-    ldout(cct, 0) << "period's realm epoch " << realm_epoch
+    error_stream << "Period's realm epoch " << realm_epoch
         << " does not come directly after current realm epoch "
-        << current_period.get_realm_epoch() << dendl;
+        << current_period.get_realm_epoch() << ". Use 'realm pull' to get the "
+        "latest realm and period from the master zone, reapply your changes, "
+        "and try again." << std::endl;
     return -EINVAL;
   }
   // did the master zone change?
@@ -1325,8 +1321,10 @@ int RGWPeriod::commit(RGWRealm& realm, const RGWPeriod& current_period)
   }
   // period must be based on current epoch
   if (epoch != current_period.get_epoch()) {
-    ldout(cct, 0) << "period epoch " << epoch << " does not match "
-        "predecessor epoch " << current_period.get_epoch() << dendl;
+    error_stream << "Period epoch " << epoch << " does not match "
+        "predecessor epoch " << current_period.get_epoch()
+        << ". Use 'period pull' to get the latest epoch from the master zone, "
+        "reapply your changes, and try again." << std::endl;
     return -EINVAL;
   }
   // set period as next epoch
@@ -1478,15 +1476,9 @@ int RGWZoneParams::fix_pool_names()
 
 int RGWZoneParams::create(bool exclusive)
 {
-  list<string> zones;
-  int r = store->list_zones(zones);
-  if (r < 0) {
-    ldout(cct, 10) << "WARNING: store->list_zones() returned r=" << r << dendl;
-  }
-
   /* check for old pools config */
   rgw_obj obj(domain_root, avail_pools);
-  r = store->raw_obj_stat(obj, NULL, NULL, NULL, NULL, NULL, NULL);
+  int r = store->raw_obj_stat(obj, NULL, NULL, NULL, NULL, NULL, NULL);
   if (r < 0) {
     ldout(store->ctx(), 10) << "couldn't find old data placement pools config, setting up new ones for the zone" << dendl;
     /* a new system, let's set new placement info */
@@ -1509,11 +1501,11 @@ int RGWZoneParams::create(bool exclusive)
     return r;
   }
 
-  if (zones.empty()) { /* first zone? maybe, it's a racy check */
-    r = set_as_default();
-    if (r < 0) {
-      ldout(cct, 10) << "WARNING: failed to set zone as default, r=" << r << dendl;
-    }
+  // try to set as default. may race with another create, so pass exclusive=true
+  // so we don't override an existing default
+  r = set_as_default(true);
+  if (r < 0 && r != -EEXIST) {
+    ldout(cct, 10) << "WARNING: failed to set zone as default, r=" << r << dendl;
   }
 
   return 0;
@@ -1582,7 +1574,7 @@ int RGWZoneParams::read_default_id(string& default_id, bool old_format)
 }
 
 
-int RGWZoneParams::set_as_default()
+int RGWZoneParams::set_as_default(bool exclusive)
 {
   if (realm_id.empty()) {
     /* try using default realm */
@@ -1595,7 +1587,7 @@ int RGWZoneParams::set_as_default()
     realm_id = realm.get_id();
   }
 
-  return RGWSystemMetaObj::set_as_default();
+  return RGWSystemMetaObj::set_as_default(exclusive);
 }
 
 void RGWPeriodMap::encode(bufferlist& bl) const {
@@ -4068,6 +4060,10 @@ int RGWRados::list_buckets_next(RGWObjEnt& obj, RGWAccessHandle *handle)
     }
 
     obj.key.set((*state)->get_oid());
+    if (obj.key.name[0] == '_') {
+      obj.key.name = obj.key.name.substr(1);
+    }
+
     (*state)++;
   } while (obj.key.name[0] == '.'); /* skip all entries starting with '.' */
 
@@ -6338,6 +6334,7 @@ int RGWRados::fetch_remote_obj(RGWObjectCtx& obj_ctx,
                const rgw_user& user_id,
                const string& client_id,
                const string& op_id,
+               bool record_op_state,
                req_info *info,
                const string& source_zone,
                rgw_obj& dest_obj,
@@ -6411,14 +6408,20 @@ int RGWRados::fetch_remote_obj(RGWObjectCtx& obj_ctx,
 
   string obj_name = dest_obj.bucket.name + "/" + dest_obj.get_object();
 
-  RGWOpStateSingleOp opstate(this, client_id, op_id, obj_name);
+  RGWOpStateSingleOp *opstate = NULL;
 
-  ret = opstate.set_state(RGWOpState::OPSTATE_IN_PROGRESS);
-  if (ret < 0) {
-    ldout(cct, 0) << "ERROR: failed to set opstate ret=" << ret << dendl;
-    return ret;
+  if (record_op_state) {
+    opstate = new RGWOpStateSingleOp(this, client_id, op_id, obj_name);
+
+    ret = opstate->set_state(RGWOpState::OPSTATE_IN_PROGRESS);
+    if (ret < 0) {
+      ldout(cct, 0) << "ERROR: failed to set opstate ret=" << ret << dendl;
+      delete opstate;
+      return ret;
+    }
   }
-  RGWRadosPutObj cb(&processor, &opstate, progress_cb, progress_data);
+
+  RGWRadosPutObj cb(&processor, opstate, progress_cb, progress_data);
   string etag;
   map<string, string> req_headers;
   real_time set_mtime;
@@ -6547,21 +6550,31 @@ int RGWRados::fetch_remote_obj(RGWObjectCtx& obj_ctx,
     goto set_err_state;
   }
 
-  ret = opstate.set_state(RGWOpState::OPSTATE_COMPLETE);
-  if (ret < 0) {
-    ldout(cct, 0) << "ERROR: failed to set opstate ret=" << ret << dendl;
+  if (opstate) {
+    ret = opstate->set_state(RGWOpState::OPSTATE_COMPLETE);
+    if (ret < 0) {
+      ldout(cct, 0) << "ERROR: failed to set opstate ret=" << ret << dendl;
+    }
+    delete opstate;
   }
 
   return 0;
 set_err_state:
-  RGWOpState::OpState state = RGWOpState::OPSTATE_ERROR;
   if (copy_if_newer && ret == -ERR_NOT_MODIFIED) {
-    state = RGWOpState::OPSTATE_COMPLETE;
     ret = 0;
   }
-  int r = opstate.set_state(state);
-  if (r < 0) {
-    ldout(cct, 0) << "ERROR: failed to set opstate r=" << ret << dendl;
+  if (opstate) {
+    RGWOpState::OpState state;
+    if (ret < 0) {
+      state = RGWOpState::OPSTATE_ERROR;
+    } else {
+      state = RGWOpState::OPSTATE_COMPLETE;
+    }
+    int r = opstate->set_state(state);
+    if (r < 0) {
+      ldout(cct, 0) << "ERROR: failed to set opstate r=" << ret << dendl;
+    }
+    delete opstate;
   }
   return ret;
 }
@@ -6662,7 +6675,7 @@ int RGWRados::copy_obj(RGWObjectCtx& obj_ctx,
   ldout(cct, 5) << "Copy object " << src_obj.bucket << ":" << src_obj.get_object() << " => " << dest_obj.bucket << ":" << dest_obj.get_object() << dendl;
 
   if (remote_src || !source_zone.empty()) {
-    return fetch_remote_obj(obj_ctx, user_id, client_id, op_id, info, source_zone,
+    return fetch_remote_obj(obj_ctx, user_id, client_id, op_id, true, info, source_zone,
                dest_obj, src_obj, dest_bucket_info, src_bucket_info, src_mtime, mtime, mod_ptr,
                unmod_ptr, high_precision_time,
                if_match, if_nomatch, attrs_mod, copy_if_newer, attrs, category,
@@ -8574,7 +8587,20 @@ int RGWRados::Bucket::UpdateIndex::cancel()
     ldout(store->ctx(), 5) << "failed to get BucketShard object: ret=" << ret << dendl;
     return ret;
   }
-  return store->cls_obj_complete_cancel(*bs, optag, obj, bilog_flags);
+
+  ret = store->cls_obj_complete_cancel(*bs, optag, obj, bilog_flags);
+
+  /*
+   * need to update data log anyhow, so that whoever follows needs to update its internal markers
+   * for following the specific bucket shard log. Otherwise they end up staying behind, and users
+   * have no way to tell that they're all caught up
+   */
+  int r = store->data_log->add_entry(bs->bucket, bs->shard_id);
+  if (r < 0) {
+    lderr(store->ctx()) << "ERROR: failed writing data log" << dendl;
+  }
+
+  return ret;
 }
 
 int RGWRados::Object::Read::read(int64_t ofs, int64_t end, bufferlist& bl)
@@ -9403,7 +9429,7 @@ void RGWRados::bucket_index_guard_olh_op(RGWObjState& olh_state, ObjectOperation
   op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, olh_state.olh_tag);
 }
 
-int RGWRados::bucket_index_unlink_instance(rgw_obj& obj_instance, const string& op_tag, uint64_t olh_epoch)
+int RGWRados::bucket_index_unlink_instance(rgw_obj& obj_instance, const string& op_tag, const string& olh_tag, uint64_t olh_epoch)
 {
   rgw_rados_ref ref;
   rgw_bucket bucket;
@@ -9420,7 +9446,7 @@ int RGWRados::bucket_index_unlink_instance(rgw_obj& obj_instance, const string&
   }
 
   cls_rgw_obj_key key(obj_instance.get_index_key_name(), obj_instance.get_instance());
-  ret = cls_rgw_bucket_unlink_instance(bs.index_ctx, bs.bucket_obj, key, op_tag, olh_epoch, get_zone().log_data);
+  ret = cls_rgw_bucket_unlink_instance(bs.index_ctx, bs.bucket_obj, key, op_tag, olh_tag, olh_epoch, get_zone().log_data);
   if (ret < 0) {
     return ret;
   }
@@ -9760,7 +9786,9 @@ int RGWRados::unlink_obj_instance(RGWObjectCtx& obj_ctx, RGWBucketInfo& bucket_i
       return ret;
     }
 
-    ret = bucket_index_unlink_instance(target_obj, op_tag, olh_epoch);
+    string olh_tag(state->olh_tag.c_str(), state->olh_tag.length());
+
+    ret = bucket_index_unlink_instance(target_obj, op_tag, olh_tag, olh_epoch);
     if (ret < 0) {
       ldout(cct, 20) << "bucket_index_link_olh() target_obj=" << target_obj << " returned " << ret << dendl;
       if (ret == -ECANCELED) {
@@ -10047,6 +10075,30 @@ int RGWRados::get_bucket_stats(rgw_bucket& bucket, int shard_id, string *bucket_
   return 0;
 }
 
+int RGWRados::get_bi_log_status(rgw_bucket& bucket, int shard_id,
+    map<int, string>& markers)
+{
+  // Collect the bucket index dir headers via cls_bucket_head().
+  map<string, rgw_bucket_dir_header> dir_headers;
+  map<int, string> instance_ids;
+  const int ret = cls_bucket_head(bucket, shard_id, dir_headers, &instance_ids);
+  if (ret < 0) {
+    return ret;
+  }
+
+  assert(dir_headers.size() == instance_ids.size());
+
+  // Walk both maps in lockstep, recording each header's max_marker.
+  map<int, string>::iterator id_it = instance_ids.begin();
+  for (map<string, rgw_bucket_dir_header>::iterator hdr_it = dir_headers.begin();
+       hdr_it != dir_headers.end(); ++hdr_it, ++id_it) {
+    // A non-negative shard_id is the key; else use the instance_ids entry's key.
+    const int key = (shard_id >= 0) ? shard_id : id_it->first;
+    markers[key] = hdr_it->second.max_marker;
+  }
+  return 0;
+}
+
 class RGWGetBucketStatsContext : public RGWGetDirHeader_CB {
   RGWGetBucketStats_CB *cb;
   uint32_t pendings;
diff --git a/src/rgw/rgw_rados.h b/src/rgw/rgw_rados.h
index b95c838..1195aa3 100644
--- a/src/rgw/rgw_rados.h
+++ b/src/rgw/rgw_rados.h
@@ -783,7 +783,7 @@ public:
   }
   int init(CephContext *_cct, RGWRados *_store, bool setup_obj = true, bool old_format = false);
   virtual int read_default_id(string& default_id, bool old_format = false);
-  virtual int set_as_default();
+  virtual int set_as_default(bool exclusive = false);
   int delete_default();
   virtual int create(bool exclusive = true);
   int delete_obj(bool old_format = false);
@@ -884,7 +884,7 @@ struct RGWZoneParams : RGWSystemMetaObj {
 	   bool old_format = false);
   using RGWSystemMetaObj::init;
   int read_default_id(string& default_id, bool old_format = false);
-  int set_as_default();
+  int set_as_default(bool exclusive = false) override;
   int create_default(bool old_format = false);
   int create(bool exclusive = true);
   int fix_pool_names();
@@ -1151,7 +1151,7 @@ struct RGWZoneGroup : public RGWSystemMetaObj {
   }
 
   int read_default_id(string& default_id, bool old_format = false);
-  int set_as_default();
+  int set_as_default(bool exclusive = false) override;
   int create_default(bool old_format = false);
   int equals(const string& other_zonegroup) const;
   int add_zone(const RGWZoneParams& zone_params, bool *is_master, bool *read_only, const list<string>& endpoints);
@@ -1301,7 +1301,7 @@ class RGWRealm : public RGWSystemMetaObj
   string current_period;
   epoch_t epoch{0}; //< realm epoch, incremented for each new period
 
-  int create_control();
+  int create_control(bool exclusive);
   int delete_control();
 public:
   RGWRealm() {}
@@ -1469,7 +1469,8 @@ public:
   int update();
 
   // commit a staging period; only for use on master zone
-  int commit(RGWRealm& realm, const RGWPeriod &current_period);
+  int commit(RGWRealm& realm, const RGWPeriod &current_period,
+             std::ostream& error_stream);
 
   void encode(bufferlist& bl) const {
     ENCODE_START(1, 1, bl);
@@ -2430,6 +2431,7 @@ public:
                        const rgw_user& user_id,
                        const string& client_id,
                        const string& op_id,
+                       bool record_op_state,
                        req_info *info,
                        const string& source_zone,
                        rgw_obj& dest_obj,
@@ -2646,7 +2648,7 @@ public:
                             const string& op_tag, struct rgw_bucket_dir_entry_meta *meta,
                             uint64_t olh_epoch,
                             ceph::real_time unmod_since, bool high_precision_time);
-  int bucket_index_unlink_instance(rgw_obj& obj_instance, const string& op_tag, uint64_t olh_epoch);
+  int bucket_index_unlink_instance(rgw_obj& obj_instance, const string& op_tag, const string& olh_tag, uint64_t olh_epoch);
   int bucket_index_read_olh_log(RGWObjState& state, rgw_obj& obj_instance, uint64_t ver_marker,
                                 map<uint64_t, vector<rgw_bucket_olh_log_entry> > *log, bool *is_truncated);
   int bucket_index_trim_olh_log(RGWObjState& obj_state, rgw_obj& obj_instance, uint64_t ver);
@@ -2749,6 +2751,7 @@ public:
   int cls_bucket_head_async(rgw_bucket& bucket, int shard_id, RGWGetDirHeader_CB *ctx, int *num_aio);
   int list_bi_log_entries(rgw_bucket& bucket, int shard_id, string& marker, uint32_t max, std::list<rgw_bi_log_entry>& result, bool *truncated);
   int trim_bi_log_entries(rgw_bucket& bucket, int shard_id, string& marker, string& end_marker);
+  int get_bi_log_status(rgw_bucket& bucket, int shard_id, map<int, string>& max_marker);
 
   int bi_get_instance(rgw_obj& obj, rgw_bucket_dir_entry *dirent);
   int bi_get(rgw_bucket& bucket, rgw_obj& obj, BIIndexType index_type, rgw_cls_bi_entry *entry);
diff --git a/src/rgw/rgw_rest_client.cc b/src/rgw/rgw_rest_client.cc
index 718dffb..73cc129 100644
--- a/src/rgw/rgw_rest_client.cc
+++ b/src/rgw/rgw_rest_client.cc
@@ -23,6 +23,27 @@ int RGWRESTSimpleRequest::get_status()
   return status;
 }
 
+/*
+ * Use the response's Content-Length header, when present and valid, as the
+ * cap on how much response data is kept (max_response). Returns 0 on success
+ * or -EINVAL when the header value is not a usable length.
+ */
+int RGWRESTSimpleRequest::handle_header(const string& name, const string& val)
+{
+  if (name == "CONTENT_LENGTH") {
+    string err;
+    long len = strict_strtol(val.c_str(), 10, &err);
+    /* a negative length would wrap when stored in the unsigned max_response */
+    if (!err.empty() || len < 0) {
+      ldout(cct, 0) << "ERROR: failed converting content length (" << val << ") to int " << dendl;
+      return -EINVAL;
+    }
+
+    max_response = len;
+  }
+
+  return 0;
+}
+
 int RGWRESTSimpleRequest::receive_header(void *ptr, size_t len)
 {
   char line[len + 1];
@@ -144,10 +160,14 @@ int RGWRESTSimpleRequest::send_data(void *ptr, size_t len)
 
 int RGWRESTSimpleRequest::receive_data(void *ptr, size_t len)
 {
-  if (response.length() > max_response)
+  size_t cp_len, left_len;
+
+  left_len = max_response > response.length() ? (max_response - response.length()) : 0;
+  if (left_len == 0)
     return 0; /* don't read extra data */
 
-  bufferptr p((char *)ptr, len);
+  cp_len = (len > left_len) ? left_len : len;
+  bufferptr p((char *)ptr, cp_len);
 
   response.append(p);
 
@@ -672,12 +692,16 @@ int RGWRESTStreamRWRequest::get_resource(RGWAccessKey& key, map<string, string>&
 int RGWRESTStreamRWRequest::complete(string& etag, real_time *mtime, map<string, string>& attrs)
 {
   set_str_from_headers(out_headers, "ETAG", etag);
-  if (mtime) {
+  if (status > 0 && mtime) {
     string mtime_str;
     set_str_from_headers(out_headers, "RGWX_MTIME", mtime_str);
-    int ret = parse_rgwx_mtime(cct, mtime_str, mtime);
-    if (ret < 0) {
-      return ret;
+    if (!mtime_str.empty()) {
+      int ret = parse_rgwx_mtime(cct, mtime_str, mtime);
+      if (ret < 0) {
+        return ret;
+      }
+    } else {
+      *mtime = real_time();
     }
   }
 
diff --git a/src/rgw/rgw_rest_client.h b/src/rgw/rgw_rest_client.h
index 82f3c3c..7820dc9 100644
--- a/src/rgw/rgw_rest_client.h
+++ b/src/rgw/rgw_rest_client.h
@@ -25,7 +25,7 @@ protected:
   size_t max_response; /* we need this as we don't stream out response */
   bufferlist response;
 
-  virtual int handle_header(const string& name, const string& val) { return 0; }
+  virtual int handle_header(const string& name, const string& val);
   void append_param(string& dest, const string& name, const string& val);
   void get_params_str(map<string, string>& extra_args, string& dest);
 
diff --git a/src/rgw/rgw_rest_realm.cc b/src/rgw/rgw_rest_realm.cc
index 488c0cc..652735f 100644
--- a/src/rgw/rgw_rest_realm.cc
+++ b/src/rgw/rgw_rest_realm.cc
@@ -17,6 +17,7 @@ static const uint32_t PERIOD_HISTORY_FETCH_MAX = 64;
 class RGWOp_Period_Base : public RGWRESTOp {
  protected:
   RGWPeriod period;
+  std::ostringstream error_stream;
  public:
   int verify_permission() override { return 0; }
   void send_response() override;
@@ -25,12 +26,19 @@ class RGWOp_Period_Base : public RGWRESTOp {
 // reply with the period object on success
 void RGWOp_Period_Base::send_response()
 {
+  s->err.message = error_stream.str();
+
   set_req_state_err(s, http_ret);
   dump_errno(s);
   end_header(s);
 
-  if (http_ret < 0)
+  if (http_ret < 0) {
+    if (!s->err.message.empty()) {
+      ldout(s->cct, 4) << "Request failed with " << http_ret
+          << ": " << s->err.message << dendl;
+    }
     return;
+  }
 
   encode_json("period", period, s->formatter);
   flusher.flush();
@@ -85,8 +93,8 @@ void RGWOp_Period_Post::execute()
 
   // require period.realm_id to match our realm
   if (period.get_realm() != store->realm.get_id()) {
-    lderr(cct) << "period with realm id " << period.get_realm()
-        << " doesn't match current realm " << store->realm.get_id() << dendl;
+    error_stream << "period with realm id " << period.get_realm()
+        << " doesn't match current realm " << store->realm.get_id() << std::endl;
     http_ret = -EINVAL;
     return;
   }
@@ -112,7 +120,7 @@ void RGWOp_Period_Post::execute()
 
   // if period id is empty, handle as 'period commit'
   if (period.get_id().empty()) {
-    http_ret = period.commit(realm, current_period);
+    http_ret = period.commit(realm, current_period, error_stream);
     if (http_ret < 0) {
       lderr(cct) << "master zone failed to commit period" << dendl;
     }
diff --git a/src/rgw/rgw_rest_s3.cc b/src/rgw/rgw_rest_s3.cc
index 9c525d1..e4731ba 100644
--- a/src/rgw/rgw_rest_s3.cc
+++ b/src/rgw/rgw_rest_s3.cc
@@ -257,10 +257,14 @@ done:
 			riter->second.c_str());
   }
 
-  if (!content_type)
-    content_type = "binary/octet-stream";
+  if (op_ret == ERR_NOT_MODIFIED) {
+      end_header(s, this);
+  } else {
+      if (!content_type)
+          content_type = "binary/octet-stream";
 
-  end_header(s, this, content_type);
+      end_header(s, this, content_type);
+  }
 
   if (metadata_bl.length()) {
     STREAM_IO(s)->write(metadata_bl.c_str(), metadata_bl.length());
@@ -479,8 +483,10 @@ void RGWListBucket_ObjStore_S3::send_versioned_response()
 					    : "false"));
 
   bool encode_key = false;
-  if (strcasecmp(encoding_type.c_str(), "url") == 0)
+  if (strcasecmp(encoding_type.c_str(), "url") == 0) {
+    s->formatter->dump_string("EncodingType", "url");
     encode_key = true;
+  }
 
   if (op_ret >= 0) {
     if (objs_container) {
@@ -576,8 +582,10 @@ void RGWListBucket_ObjStore_S3::send_response()
 					    : "false"));
 
   bool encode_key = false;
-  if (strcasecmp(encoding_type.c_str(), "url") == 0)
+  if (strcasecmp(encoding_type.c_str(), "url") == 0) {
+    s->formatter->dump_string("EncodingType", "url");
     encode_key = true;
+  }
 
   if (op_ret >= 0) {
     vector<RGWObjEnt>::iterator iter;
@@ -2420,7 +2428,7 @@ void RGWListMultipart_ObjStore_S3::send_response()
       dump_time(s, "LastModified", &info.modified);
 
       s->formatter->dump_unsigned("PartNumber", info.num);
-      s->formatter->dump_string("ETag", info.etag);
+      s->formatter->dump_format("ETag", "\"%s\"", info.etag.c_str());
       s->formatter->dump_unsigned("Size", info.size);
       s->formatter->close_section();
     }
diff --git a/src/rgw/rgw_sync.cc b/src/rgw/rgw_sync.cc
index bb3a1ac..38f92ed 100644
--- a/src/rgw/rgw_sync.cc
+++ b/src/rgw/rgw_sync.cc
@@ -712,7 +712,7 @@ public:
 int RGWReadSyncStatusCoroutine::handle_data(rgw_meta_sync_info& data)
 {
   if (retcode == -ENOENT) {
-    return retcode;
+    return 0;
   }
 
   RGWRados *store = sync_env->store;
@@ -1878,6 +1878,10 @@ int RGWRemoteMetaLog::run_sync()
   }
 
   do {
+    if (going_down.read()) {
+      ldout(store->ctx(), 1) << __func__ << "(): going down" << dendl;
+      return 0;
+    }
     r = run(new RGWReadSyncStatusCoroutine(&sync_env, obj_ctx, &sync_status));
     if (r < 0 && r != -ENOENT) {
       ldout(store->ctx(), 0) << "ERROR: failed to fetch sync status r=" << r << dendl;
diff --git a/src/rgw/rgw_sync.h b/src/rgw/rgw_sync.h
index 66c639c..dcb3a4e 100644
--- a/src/rgw/rgw_sync.h
+++ b/src/rgw/rgw_sync.h
@@ -5,6 +5,7 @@
 #include "rgw_http_client.h"
 #include "rgw_meta_sync_status.h"
 
+#include "include/stringify.h"
 #include "common/RWLock.h"
 
 #define ERROR_LOGGER_SHARDS 32
@@ -142,7 +143,7 @@ protected:
   }
 
 public:
-  RGWBackoffControlCR(CephContext *_cct, bool _exit_on_error) : RGWCoroutine(_cct), cr(NULL), lock("RGWBackoffControlCR::lock"),
+  RGWBackoffControlCR(CephContext *_cct, bool _exit_on_error) : RGWCoroutine(_cct), cr(NULL), lock("RGWBackoffControlCR::lock:" + stringify(this)),
                                                                 reset_backoff(false), exit_on_error(_exit_on_error) {
   }
 
@@ -294,6 +295,7 @@ class RGWSyncShardMarkerTrack {
   typename std::map<T, marker_entry> pending;
 
   T high_marker;
+  T last_stored_marker;
   marker_entry high_entry;
 
   int window_size;
@@ -355,14 +357,19 @@ public:
     updates_since_flush++;
 
     if (is_first && (updates_since_flush >= window_size || pending.empty())) {
-      return update_marker(high_marker, high_entry);
+      return flush();
     }
     return NULL;
   }
 
-  RGWCoroutine *update_marker(const T& new_marker, marker_entry& entry) {
+  RGWCoroutine *flush() {
+    if (last_stored_marker == high_marker) {
+      return NULL;
+    }
+
     updates_since_flush = 0;
-    return store_marker(new_marker, entry.pos, entry.timestamp);
+    last_stored_marker = high_marker;
+    return store_marker(high_marker, high_entry.pos, high_entry.timestamp);
   }
 
   /*
diff --git a/src/script/subman b/src/script/subman
new file mode 100755
index 0000000..129e507
--- /dev/null
+++ b/src/script/subman
@@ -0,0 +1,29 @@
+#!/usr/bin/env python
+"""Publish Ceph OSD data usage as a subscription-manager fact.
+
+Sums the "used" value df reports for every ceph-disk partition of type
+'data' and writes the scaled total to /etc/rhsm/facts/ceph_usage.facts.
+"""
+
+import json
+import re
+import subprocess
+
+disks = json.loads(subprocess.check_output(["ceph-disk", "list", "--format", "json"]))
+used = 0
+
+for disk in disks:
+    # NOTE(review): confirm ceph-disk's JSON key is 'partition' and not 'partitions'.
+    for partition in disk.get('partition', []):
+        if partition.get('type') == 'data':
+            df = subprocess.check_output(["df", "--output=used", partition['path']])
+            # The first run of digits in df's output is taken as the used value.
+            used += int(re.findall(r'\d+', df)[0])
+
+# Literal JSON braces are doubled so str.format() only substitutes {used}.
+with open("/etc/rhsm/facts/ceph_usage.facts", 'w') as facts:
+    facts.write("""
+{{
+"band.storage.usage": {used}
+}}
+""".format(used=used // (1024 * 1024 * 1024)))
diff --git a/src/stop.sh b/src/stop.sh
index 795ac9a..e7f62ca 100755
--- a/src/stop.sh
+++ b/src/stop.sh
@@ -21,7 +21,7 @@ test -d dev/osd0/. && test -e dev/sudo && SUDO="sudo"
 if [ -e CMakeCache.txt ]; then
   [ -z "$CEPH_BIN" ] && CEPH_BIN=src
 else
-  [ -z "$CEPH_BIN" ] && CEPH_BIN=.
+  [ -z "$CEPH_BIN" ] && CEPH_BIN=bin
 fi
 
 MYUID=$(id -u)
diff --git a/src/test/Makefile-client.am b/src/test/Makefile-client.am
index b2ad5e8..243c2b0 100644
--- a/src/test/Makefile-client.am
+++ b/src/test/Makefile-client.am
@@ -234,7 +234,7 @@ ceph_test_rados_api_aio_CXXFLAGS = $(UNITTEST_CXXFLAGS)
 bin_DEBUGPROGRAMS += ceph_test_rados_api_aio
 
 ceph_test_rados_api_list_SOURCES = test/librados/list.cc
-ceph_test_rados_api_list_LDADD = $(LIBRADOS) $(UNITTEST_LDADD) $(RADOS_TEST_LDADD)
+ceph_test_rados_api_list_LDADD = $(LIBRADOS) $(UNITTEST_LDADD) $(RADOS_TEST_LDADD) $(CEPH_GLOBAL)
 ceph_test_rados_api_list_CXXFLAGS = $(UNITTEST_CXXFLAGS)
 bin_DEBUGPROGRAMS += ceph_test_rados_api_list
 
@@ -362,17 +362,24 @@ librbd_test_la_SOURCES = \
 	test/librbd/test_ImageWatcher.cc \
 	test/librbd/test_internal.cc \
 	test/librbd/test_mirroring.cc \
+	test/librbd/test_MirroringWatcher.cc \
 	test/librbd/test_ObjectMap.cc \
 	test/librbd/journal/test_Entries.cc \
 	test/librbd/journal/test_Replay.cc
 librbd_test_la_CXXFLAGS = $(UNITTEST_CXXFLAGS)
 noinst_LTLIBRARIES += librbd_test.la
 
+librbd_test_mock_la_SOURCES = \
+	test/librbd/mock/MockImageCtx.cc
+librbd_test_mock_la_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+noinst_LTLIBRARIES += librbd_test_mock.la
+
 unittest_librbd_SOURCES = \
         test/librbd/test_main.cc \
 	test/librbd/test_mock_fixture.cc \
 	test/librbd/test_mock_ExclusiveLock.cc \
 	test/librbd/test_mock_Journal.cc \
+	test/librbd/test_mock_ObjectWatcher.cc \
 	test/librbd/exclusive_lock/test_mock_AcquireRequest.cc \
 	test/librbd/exclusive_lock/test_mock_ReleaseRequest.cc \
 	test/librbd/image/test_mock_RefreshRequest.cc \
@@ -394,7 +401,8 @@ unittest_librbd_SOURCES = \
 	test/librbd/operation/test_mock_SnapshotUnprotectRequest.cc
 unittest_librbd_CXXFLAGS = $(UNITTEST_CXXFLAGS) -DTEST_LIBRBD_INTERNALS
 unittest_librbd_LDADD = \
-	librbd_test.la librbd_api.la librbd_internal.la $(LIBRBD_TYPES) \
+	librbd_test.la librbd_test_mock.la \
+	librbd_api.la librbd_internal.la $(LIBRBD_TYPES) \
 	libcls_rbd_client.la libcls_lock_client.la \
 	libjournal.la libcls_journal_client.la \
 	librados_test_stub.la librados_internal.la \
@@ -434,6 +442,7 @@ noinst_HEADERS += \
 	test/librbd/mock/MockImageState.h \
 	test/librbd/mock/MockImageWatcher.h \
 	test/librbd/mock/MockJournal.h \
+	test/librbd/mock/MockJournalPolicy.h \
 	test/librbd/mock/MockObjectMap.h \
 	test/librbd/mock/MockOperations.h \
 	test/librbd/mock/MockReadahead.h \
@@ -457,6 +466,7 @@ noinst_LTLIBRARIES += librbd_mirror_test.la
 unittest_rbd_mirror_SOURCES = \
 	test/rbd_mirror/test_main.cc \
 	test/rbd_mirror/test_mock_fixture.cc \
+	test/rbd_mirror/test_mock_ImageReplayer.cc \
 	test/rbd_mirror/test_mock_ImageSync.cc \
 	test/rbd_mirror/image_sync/test_mock_ImageCopyRequest.cc \
 	test/rbd_mirror/image_sync/test_mock_ObjectCopyRequest.cc \
@@ -467,6 +477,7 @@ unittest_rbd_mirror_SOURCES = \
 unittest_rbd_mirror_CXXFLAGS = $(UNITTEST_CXXFLAGS)
 unittest_rbd_mirror_LDADD = \
 	librbd_mirror_test.la \
+	librbd_test_mock.la \
 	librados_test_stub.la \
 	librbd_mirror_internal.la \
 	librbd_internal.la \
diff --git a/src/test/Makefile.am b/src/test/Makefile.am
index 5e7fc85..536edbf 100644
--- a/src/test/Makefile.am
+++ b/src/test/Makefile.am
@@ -81,6 +81,7 @@ check_SCRIPTS += \
 	test/mon/osd-erasure-code-profile.sh \
 	test/mon/mkfs.sh \
 	test/mon/mon-scrub.sh \
+       test/mon/test_pool_quota.sh \
 	test/osd/osd-scrub-repair.sh \
 	test/osd/osd-scrub-snaps.sh \
 	test/osd/osd-config.sh \
diff --git a/src/test/centos-6/ceph.spec.in b/src/test/centos-6/ceph.spec.in
index b52d7e2..3a5a6f7 100644
--- a/src/test/centos-6/ceph.spec.in
+++ b/src/test/centos-6/ceph.spec.in
@@ -27,6 +27,10 @@
 %bcond_with selinux
 %endif
 
+# LTTng-UST enabled on Fedora, RHEL 6+, and SLES 12
+%if 0%{?fedora} || 0%{?rhel} >= 6 || 0%{?suse_version} == 1315
+%bcond_without lttng
+%endif
 
 %if (0%{?el5} || (0%{?rhel_version} >= 500 && 0%{?rhel_version} <= 600))
 %{!?python_sitelib: %global python_sitelib %(%{__python} -c "from distutils.sysconfig import get_python_lib; print(get_python_lib())")}
@@ -62,11 +66,6 @@ restorecon -R /var/log/radosgw > /dev/null 2>&1;
 %{!?tmpfiles_create: %global tmpfiles_create systemd-tmpfiles --create}
 %endif
 
-# LTTng-UST enabled on Fedora, RHEL 6, and SLES 12
-%if 0%{?fedora} || 0%{?rhel} == 6 || 0%{?suse_version} == 1315
-%global _with_lttng 1
-%endif
-
 # unify libexec for all targets
 %global _libexecdir %{_exec_prefix}/lib
 
@@ -186,7 +185,7 @@ BuildRequires:  boost-random
 BuildRequires:	python-argparse
 %endif
 # lttng and babeltrace for rbd-replay-prep
-%if 0%{?_with_lttng}
+%if %{with lttng}
 %if 0%{?fedora} || 0%{?rhel}
 BuildRequires:	lttng-ust-devel
 BuildRequires:	libbabeltrace-devel
@@ -685,6 +684,9 @@ export RPM_OPT_FLAGS=`echo $RPM_OPT_FLAGS | sed -e 's/i386/i486/'`
                 --libexecdir=%{_libexecdir} \
 		--localstatedir=/var \
 		--sysconfdir=/etc \
+%if 0%{?rhel} && ! 0%{?centos}
+                --enable-subman \
+%endif
 %if 0%{?_with_systemd}
 		--with-systemdsystemunitdir=%_unitdir \
 %endif
@@ -702,6 +704,10 @@ export RPM_OPT_FLAGS=`echo $RPM_OPT_FLAGS | sed -e 's/i386/i486/'`
 %endif
 		--with-librocksdb-static=check \
 		--with-radosgw \
+%if %{without lttng}
+		--without-lttng \
+		--without-babeltrace \
+%endif
 		$CEPH_EXTRA_CONFIGURE_ARGS \
 		%{?_with_ocf} \
 		%{?_with_tcmalloc} \
@@ -858,7 +864,7 @@ rm -rf $RPM_BUILD_ROOT
 %{_libdir}/ceph/erasure-code/libec_*.so*
 %dir %{_libdir}/ceph/compressor
 %{_libdir}/ceph/compressor/libceph_*.so*
-%if 0%{?_with_lttng}
+%if %{with lttng}
 %{_libdir}/libos_tp.so*
 %{_libdir}/libosd_tp.so*
 %endif
@@ -977,7 +983,7 @@ rm -rf $RPM_BUILD_ROOT
 %{_bindir}/rbd-replay
 %{_bindir}/rbd-replay-many
 %{_bindir}/rbdmap
-%if 0%{?_with_lttng}
+%if %{with lttng}
 %{_bindir}/rbd-replay-prep
 %endif
 %{_bindir}/ceph-post-file
@@ -994,6 +1000,7 @@ rm -rf $RPM_BUILD_ROOT
 %{_mandir}/man8/ceph.8*
 %{_mandir}/man8/rados.8*
 %{_mandir}/man8/rbd.8*
+%{_mandir}/man8/rbdmap.8*
 %{_mandir}/man8/rbd-replay.8*
 %{_mandir}/man8/rbd-replay-many.8*
 %{_mandir}/man8/rbd-replay-prep.8*
@@ -1017,19 +1024,24 @@ rm -rf $RPM_BUILD_ROOT
 %attr(750,ceph,ceph) %dir %{_localstatedir}/lib/ceph/
 
 %pre common
-CEPH_GROUP_ID=""
-CEPH_USER_ID=""
+CEPH_GROUP_ID=167
+CEPH_USER_ID=167
 %if 0%{?rhel} || 0%{?fedora}
-CEPH_GROUP_ID="-g 167"
-CEPH_USER_ID="-u 167"
-%endif
-%if 0%{?rhel} || 0%{?fedora}
-%{_sbindir}/groupadd ceph $CEPH_GROUP_ID -o -r 2>/dev/null || :
-%{_sbindir}/useradd ceph $CEPH_USER_ID -o -r -g ceph -s /sbin/nologin -c "Ceph daemons" -d %{_localstatedir}/lib/ceph 2> /dev/null || :
+%{_sbindir}/groupadd ceph -g $CEPH_GROUP_ID -o -r 2>/dev/null || :
+%{_sbindir}/useradd ceph -u $CEPH_USER_ID -o -r -g ceph -s /sbin/nologin -c "Ceph daemons" -d %{_localstatedir}/lib/ceph 2>/dev/null || :
 %endif
 %if 0%{?suse_version}
-getent group ceph >/dev/null || groupadd -r ceph
-getent passwd ceph >/dev/null || useradd -r -g ceph -d %{_localstatedir}/lib/ceph -s /sbin/nologin -c "Ceph daemons" ceph
+# Request the fixed ID only if no existing group/user already holds it.
+if ! getent group ceph >/dev/null ; then
+    CEPH_GROUP_ID_OPTION=""
+    getent group $CEPH_GROUP_ID >/dev/null || CEPH_GROUP_ID_OPTION="-g $CEPH_GROUP_ID"
+    groupadd ceph $CEPH_GROUP_ID_OPTION -r 2>/dev/null || :
+fi
+if ! getent passwd ceph >/dev/null ; then
+    CEPH_USER_ID_OPTION=""
+    getent passwd $CEPH_USER_ID >/dev/null || CEPH_USER_ID_OPTION="-u $CEPH_USER_ID"
+    useradd ceph $CEPH_USER_ID_OPTION -r -g ceph -s /sbin/nologin -c "Ceph daemons" -d %{_localstatedir}/lib/ceph 2>/dev/null || :
+fi
 %endif
 exit 0
 
@@ -1182,6 +1192,9 @@ fi
 %{_mandir}/man8/ceph-clsinfo.8*
 %{_mandir}/man8/ceph-disk.8*
 %{_mandir}/man8/ceph-osd.8*
+%if 0%{?rhel} && ! 0%{?centos}
+/etc/cron.hourly/subman
+%endif
 %if 0%{?_with_systemd}
 %{_unitdir}/ceph-osd@.service
 %{_unitdir}/ceph-osd.target
@@ -1220,7 +1233,7 @@ fi
 %files -n librados2
 %defattr(-,root,root,-)
 %{_libdir}/librados.so.*
-%if 0%{?_with_lttng}
+%if %{with lttng}
 %{_libdir}/librados_tp.so.*
 %endif
 
@@ -1244,7 +1257,7 @@ fi
 %{_includedir}/rados/rados_types.hpp
 %{_includedir}/rados/memory.h
 %{_libdir}/librados.so
-%if 0%{?_with_lttng}
+%if %{with lttng}
 %{_libdir}/librados_tp.so
 %endif
 %{_bindir}/librados-config
@@ -1279,7 +1292,7 @@ fi
 %files -n librbd1
 %defattr(-,root,root,-)
 %{_libdir}/librbd.so.*
-%if 0%{?_with_lttng}
+%if %{with lttng}
 %{_libdir}/librbd_tp.so.*
 %endif
 
@@ -1299,7 +1312,7 @@ ln -sf %{_libdir}/librbd.so.1 /usr/lib64/qemu/librbd.so.1
 %{_includedir}/rbd/librbd.hpp
 %{_includedir}/rbd/features.h
 %{_libdir}/librbd.so
-%if 0%{?_with_lttng}
+%if %{with lttng}
 %{_libdir}/librbd_tp.so
 %endif
 
diff --git a/src/test/centos-7/ceph.spec.in b/src/test/centos-7/ceph.spec.in
index b52d7e2..3a5a6f7 100644
--- a/src/test/centos-7/ceph.spec.in
+++ b/src/test/centos-7/ceph.spec.in
@@ -27,6 +27,10 @@
 %bcond_with selinux
 %endif
 
+# LTTng-UST enabled on Fedora, RHEL 6+, and SLES 12
+%if 0%{?fedora} || 0%{?rhel} >= 6 || 0%{?suse_version} == 1315
+%bcond_without lttng
+%endif
 
 %if (0%{?el5} || (0%{?rhel_version} >= 500 && 0%{?rhel_version} <= 600))
 %{!?python_sitelib: %global python_sitelib %(%{__python} -c "from distutils.sysconfig import get_python_lib; print(get_python_lib())")}
@@ -62,11 +66,6 @@ restorecon -R /var/log/radosgw > /dev/null 2>&1;
 %{!?tmpfiles_create: %global tmpfiles_create systemd-tmpfiles --create}
 %endif
 
-# LTTng-UST enabled on Fedora, RHEL 6, and SLES 12
-%if 0%{?fedora} || 0%{?rhel} == 6 || 0%{?suse_version} == 1315
-%global _with_lttng 1
-%endif
-
 # unify libexec for all targets
 %global _libexecdir %{_exec_prefix}/lib
 
@@ -186,7 +185,7 @@ BuildRequires:  boost-random
 BuildRequires:	python-argparse
 %endif
 # lttng and babeltrace for rbd-replay-prep
-%if 0%{?_with_lttng}
+%if %{with lttng}
 %if 0%{?fedora} || 0%{?rhel}
 BuildRequires:	lttng-ust-devel
 BuildRequires:	libbabeltrace-devel
@@ -685,6 +684,9 @@ export RPM_OPT_FLAGS=`echo $RPM_OPT_FLAGS | sed -e 's/i386/i486/'`
                 --libexecdir=%{_libexecdir} \
 		--localstatedir=/var \
 		--sysconfdir=/etc \
+%if 0%{?rhel} && ! 0%{?centos}
+                --enable-subman \
+%endif
 %if 0%{?_with_systemd}
 		--with-systemdsystemunitdir=%_unitdir \
 %endif
@@ -702,6 +704,10 @@ export RPM_OPT_FLAGS=`echo $RPM_OPT_FLAGS | sed -e 's/i386/i486/'`
 %endif
 		--with-librocksdb-static=check \
 		--with-radosgw \
+%if %{without lttng}
+		--without-lttng \
+		--without-babeltrace \
+%endif
 		$CEPH_EXTRA_CONFIGURE_ARGS \
 		%{?_with_ocf} \
 		%{?_with_tcmalloc} \
@@ -858,7 +864,7 @@ rm -rf $RPM_BUILD_ROOT
 %{_libdir}/ceph/erasure-code/libec_*.so*
 %dir %{_libdir}/ceph/compressor
 %{_libdir}/ceph/compressor/libceph_*.so*
-%if 0%{?_with_lttng}
+%if %{with lttng}
 %{_libdir}/libos_tp.so*
 %{_libdir}/libosd_tp.so*
 %endif
@@ -977,7 +983,7 @@ rm -rf $RPM_BUILD_ROOT
 %{_bindir}/rbd-replay
 %{_bindir}/rbd-replay-many
 %{_bindir}/rbdmap
-%if 0%{?_with_lttng}
+%if %{with lttng}
 %{_bindir}/rbd-replay-prep
 %endif
 %{_bindir}/ceph-post-file
@@ -994,6 +1000,7 @@ rm -rf $RPM_BUILD_ROOT
 %{_mandir}/man8/ceph.8*
 %{_mandir}/man8/rados.8*
 %{_mandir}/man8/rbd.8*
+%{_mandir}/man8/rbdmap.8*
 %{_mandir}/man8/rbd-replay.8*
 %{_mandir}/man8/rbd-replay-many.8*
 %{_mandir}/man8/rbd-replay-prep.8*
@@ -1017,19 +1024,24 @@ rm -rf $RPM_BUILD_ROOT
 %attr(750,ceph,ceph) %dir %{_localstatedir}/lib/ceph/
 
 %pre common
-CEPH_GROUP_ID=""
-CEPH_USER_ID=""
+CEPH_GROUP_ID=167
+CEPH_USER_ID=167
 %if 0%{?rhel} || 0%{?fedora}
-CEPH_GROUP_ID="-g 167"
-CEPH_USER_ID="-u 167"
-%endif
-%if 0%{?rhel} || 0%{?fedora}
-%{_sbindir}/groupadd ceph $CEPH_GROUP_ID -o -r 2>/dev/null || :
-%{_sbindir}/useradd ceph $CEPH_USER_ID -o -r -g ceph -s /sbin/nologin -c "Ceph daemons" -d %{_localstatedir}/lib/ceph 2> /dev/null || :
+%{_sbindir}/groupadd ceph -g $CEPH_GROUP_ID -o -r 2>/dev/null || :
+%{_sbindir}/useradd ceph -u $CEPH_USER_ID -o -r -g ceph -s /sbin/nologin -c "Ceph daemons" -d %{_localstatedir}/lib/ceph 2>/dev/null || :
 %endif
 %if 0%{?suse_version}
-getent group ceph >/dev/null || groupadd -r ceph
-getent passwd ceph >/dev/null || useradd -r -g ceph -d %{_localstatedir}/lib/ceph -s /sbin/nologin -c "Ceph daemons" ceph
+# Request the fixed ID only if no existing group/user already holds it.
+if ! getent group ceph >/dev/null ; then
+    CEPH_GROUP_ID_OPTION=""
+    getent group $CEPH_GROUP_ID >/dev/null || CEPH_GROUP_ID_OPTION="-g $CEPH_GROUP_ID"
+    groupadd ceph $CEPH_GROUP_ID_OPTION -r 2>/dev/null || :
+fi
+if ! getent passwd ceph >/dev/null ; then
+    CEPH_USER_ID_OPTION=""
+    getent passwd $CEPH_USER_ID >/dev/null || CEPH_USER_ID_OPTION="-u $CEPH_USER_ID"
+    useradd ceph $CEPH_USER_ID_OPTION -r -g ceph -s /sbin/nologin -c "Ceph daemons" -d %{_localstatedir}/lib/ceph 2>/dev/null || :
+fi
 %endif
 exit 0
 
@@ -1182,6 +1192,9 @@ fi
 %{_mandir}/man8/ceph-clsinfo.8*
 %{_mandir}/man8/ceph-disk.8*
 %{_mandir}/man8/ceph-osd.8*
+%if 0%{?rhel} && ! 0%{?centos}
+/etc/cron.hourly/subman
+%endif
 %if 0%{?_with_systemd}
 %{_unitdir}/ceph-osd@.service
 %{_unitdir}/ceph-osd.target
@@ -1220,7 +1233,7 @@ fi
 %files -n librados2
 %defattr(-,root,root,-)
 %{_libdir}/librados.so.*
-%if 0%{?_with_lttng}
+%if %{with lttng}
 %{_libdir}/librados_tp.so.*
 %endif
 
@@ -1244,7 +1257,7 @@ fi
 %{_includedir}/rados/rados_types.hpp
 %{_includedir}/rados/memory.h
 %{_libdir}/librados.so
-%if 0%{?_with_lttng}
+%if %{with lttng}
 %{_libdir}/librados_tp.so
 %endif
 %{_bindir}/librados-config
@@ -1279,7 +1292,7 @@ fi
 %files -n librbd1
 %defattr(-,root,root,-)
 %{_libdir}/librbd.so.*
-%if 0%{?_with_lttng}
+%if %{with lttng}
 %{_libdir}/librbd_tp.so.*
 %endif
 
@@ -1299,7 +1312,7 @@ ln -sf %{_libdir}/librbd.so.1 /usr/lib64/qemu/librbd.so.1
 %{_includedir}/rbd/librbd.hpp
 %{_includedir}/rbd/features.h
 %{_libdir}/librbd.so
-%if 0%{?_with_lttng}
+%if %{with lttng}
 %{_libdir}/librbd_tp.so
 %endif
 
diff --git a/src/test/ceph_objectstore_tool.py b/src/test/ceph_objectstore_tool.py
index 20a5d30..81af00a 100755
--- a/src/test/ceph_objectstore_tool.py
+++ b/src/test/ceph_objectstore_tool.py
@@ -44,9 +44,10 @@ def wait_for_health():
     print "Wait for health_ok...",
     tries = 0
     while call("./ceph health 2> /dev/null | grep -v 'HEALTH_OK\|HEALTH_WARN' > /dev/null", shell=True) == 0:
-        if ++tries == 30:
+        tries += 1
+        if tries == 150:
             raise Exception("Time exceeded to go to health")
-        time.sleep(5)
+        time.sleep(1)
     print "DONE"
 
 
@@ -400,6 +401,9 @@ def check_data(DATADIR, TMPFILE, OSDDIR, SPLIT_NAME):
             logging.error("Can't find imported object {name}".format(name=file))
             ERRORS += 1
         for obj_loc in obj_locs:
+            # For btrfs skip snap_* dirs
+            if re.search("/snap_[0-9]*/", obj_loc) is not None:
+                continue
             repcount += 1
             cmd = "diff -q {src} {obj_loc}".format(src=path, obj_loc=obj_loc)
             logging.debug(cmd)
@@ -1796,16 +1800,12 @@ def main(argv):
         vstart(new=False)
         wait_for_health()
 
-        time.sleep(20)
-
         cmd = "./ceph osd pool set {pool} pg_num 2".format(pool=SPLIT_POOL)
         logging.debug(cmd)
         ret = call(cmd, shell=True, stdout=nullfd, stderr=nullfd)
         time.sleep(5)
         wait_for_health()
 
-        time.sleep(15)
-
         kill_daemons()
 
         # Now 2 PGs, poolid.0 and poolid.1
diff --git a/src/test/cli/crushtool/check-invalid-map.t b/src/test/cli/crushtool/check-invalid-map.t
new file mode 100644
index 0000000..d4b6b06
--- /dev/null
+++ b/src/test/cli/crushtool/check-invalid-map.t
@@ -0,0 +1,3 @@
+  $ crushtool -d /etc/hosts
+  crushtool: unable to decode /etc/hosts
+  [1]
diff --git a/src/test/cli/radosgw-admin/help.t b/src/test/cli/radosgw-admin/help.t
index 587317a..5867f00 100644
--- a/src/test/cli/radosgw-admin/help.t
+++ b/src/test/cli/radosgw-admin/help.t
@@ -60,6 +60,7 @@
     zonegroup-map get          show zonegroup-map
     zonegroup-map set          set zonegroup-map (requires infile)
     zone create                create a new zone
+    zone delete                delete a zone
     zone get                   show zone cluster params
     zone modify                set/clear zone master status
     zone set                   set zone cluster params (requires infile)
@@ -102,7 +103,7 @@
     replicalog get             get replica metadata log entry
     replicalog update          update replica metadata log entry
     replicalog delete          delete replica metadata log entry
-    orphans find               init and run search for leaked rados objects
+    orphans find               init and run search for leaked rados objects (use job-id, pool)
     orphans finish             clean up search for leaked rados objects
   options:
      --tenant=<tenant>         tenant name
@@ -135,7 +136,6 @@
                                  replica datalog get/delete
      --metadata-key=<key>      key to retrieve metadata from with metadata get
      --remote=<remote>         remote to pull period
-     --parent=<id>             parent period id
      --period=<id>             period id
      --epoch=<number>          period epoch
      --commit                  commit the period during 'period update'
@@ -148,8 +148,11 @@
      --realm-new-name=<realm new name> realm new name
      --rgw-zonegroup=<zonegroup>   zonegroup name
      --rgw-zone=<zone>         zone in which radosgw is running
+     --zone-id=<zone id>       zone id
      --zone-new-name=<zone>    zone new name
+     --source-zone             specify the source zone (for data sync)
      --default                 set entity (realm, zonegroup, zone) as default
+     --read-only               set zone as read-only (when adding to zonegroup)
      --endpoints=<list>        zone endpoints
      --fix                     besides checking bucket index, will also fix it
      --check-objects           bucket check: rebuilds bucket index according to
@@ -188,6 +191,8 @@
   Orphans search options:
      --pool                    data pool to scan for leaked rados objects in
      --num-shards              num of shards to use for keeping the temporary scan info
+     --job-id                  set the job id (for orphans find)
+     --max-concurrent-ios      maximum concurrent ios for orphans find (default: 32)
   
     --conf/-c FILE    read configuration from the given configuration file
     --id/-i ID        set ID portion of my name
@@ -198,3 +203,5 @@
     --version         show version and quit
   
   [1]
+
+
diff --git a/src/test/cli/rbd/help.t b/src/test/cli/rbd/help.t
index e68f186..f68a486 100644
--- a/src/test/cli/rbd/help.t
+++ b/src/test/cli/rbd/help.t
@@ -42,7 +42,8 @@
       lock remove (lock rm)       Release a lock on an image.
       map                         Map image to a block device using the kernel.
       merge-diff                  Merge two diff exports together.
-      mirror image demote         Demote an image to secondary for RBD mirroring.
+      mirror image demote         Demote an image to non-primary for RBD
+                                  mirroring.
       mirror image disable        Disable RBD mirroring for an image.
       mirror image enable         Enable RBD mirroring for an image.
       mirror image promote        Promote an image to primary for RBD mirroring.
@@ -755,7 +756,7 @@
   usage: rbd mirror image demote [--pool <pool>] [--image <image>] 
                                  <image-spec> 
   
-  Demote an image to secondary for RBD mirroring.
+  Demote an image to non-primary for RBD mirroring.
   
   Positional arguments
     <image-spec>         image specification
diff --git a/src/test/cls_journal/test_cls_journal.cc b/src/test/cls_journal/test_cls_journal.cc
index 9c6000d..2e11236 100644
--- a/src/test/cls_journal/test_cls_journal.cc
+++ b/src/test/cls_journal/test_cls_journal.cc
@@ -197,6 +197,7 @@ TEST_F(TestClsJournal, GetClient) {
   ASSERT_EQ(0, _rados.ioctx_create(_pool_name.c_str(), ioctx));
 
   std::string oid = get_temp_image_name();
+  ASSERT_EQ(0, client::create(ioctx, oid, 2, 4, ioctx.get_id()));
 
   Client client;
   ASSERT_EQ(-ENOENT, client::get_client(ioctx, oid, "id", &client));
@@ -215,6 +216,7 @@ TEST_F(TestClsJournal, ClientRegister) {
   ASSERT_EQ(0, _rados.ioctx_create(_pool_name.c_str(), ioctx));
 
   std::string oid = get_temp_image_name();
+  ASSERT_EQ(0, client::create(ioctx, oid, 2, 4, ioctx.get_id()));
 
   ASSERT_EQ(0, client::client_register(ioctx, oid, "id1", bufferlist()));
 
@@ -230,6 +232,7 @@ TEST_F(TestClsJournal, ClientRegisterDuplicate) {
   ASSERT_EQ(0, _rados.ioctx_create(_pool_name.c_str(), ioctx));
 
   std::string oid = get_temp_image_name();
+  ASSERT_EQ(0, client::create(ioctx, oid, 2, 4, ioctx.get_id()));
 
   ASSERT_EQ(0, client::client_register(ioctx, oid, "id1", bufferlist()));
   ASSERT_EQ(-EEXIST, client::client_register(ioctx, oid, "id1", bufferlist()));
@@ -240,6 +243,7 @@ TEST_F(TestClsJournal, ClientUpdateData) {
   ASSERT_EQ(0, _rados.ioctx_create(_pool_name.c_str(), ioctx));
 
   std::string oid = get_temp_image_name();
+  ASSERT_EQ(0, client::create(ioctx, oid, 2, 4, ioctx.get_id()));
 
   ASSERT_EQ(-ENOENT, client::client_update_data(ioctx, oid, "id1",
                                                 bufferlist()));
@@ -261,6 +265,7 @@ TEST_F(TestClsJournal, ClientUpdateState) {
   ASSERT_EQ(0, _rados.ioctx_create(_pool_name.c_str(), ioctx));
 
   std::string oid = get_temp_image_name();
+  ASSERT_EQ(0, client::create(ioctx, oid, 2, 4, ioctx.get_id()));
 
   ASSERT_EQ(-ENOENT, client::client_update_state(ioctx, oid, "id1",
                                                  CLIENT_STATE_DISCONNECTED));
@@ -285,6 +290,7 @@ TEST_F(TestClsJournal, ClientUnregister) {
   ASSERT_EQ(0, _rados.ioctx_create(_pool_name.c_str(), ioctx));
 
   std::string oid = get_temp_image_name();
+  ASSERT_EQ(0, client::create(ioctx, oid, 2, 4, ioctx.get_id()));
 
   ASSERT_EQ(0, client::client_register(ioctx, oid, "id1", bufferlist()));
   ASSERT_EQ(0, client::client_unregister(ioctx, oid, "id1"));
@@ -295,6 +301,7 @@ TEST_F(TestClsJournal, ClientUnregisterDNE) {
   ASSERT_EQ(0, _rados.ioctx_create(_pool_name.c_str(), ioctx));
 
   std::string oid = get_temp_image_name();
+  ASSERT_EQ(0, client::create(ioctx, oid, 2, 4, ioctx.get_id()));
 
   ASSERT_EQ(0, client::client_register(ioctx, oid, "id1", bufferlist()));
   ASSERT_EQ(0, client::client_unregister(ioctx, oid, "id1"));
diff --git a/src/test/cls_rbd/test_cls_rbd.cc b/src/test/cls_rbd/test_cls_rbd.cc
index cd8283e..64f5b54 100644
--- a/src/test/cls_rbd/test_cls_rbd.cc
+++ b/src/test/cls_rbd/test_cls_rbd.cc
@@ -1373,8 +1373,8 @@ TEST_F(TestClsRbd, mirror_image) {
   ASSERT_EQ(0, _rados.ioctx_create(_pool_name.c_str(), ioctx));
   ioctx.remove(RBD_MIRRORING);
 
-  vector<string> image_ids;
-  ASSERT_EQ(-ENOENT, mirror_image_list(&ioctx, &image_ids));
+  std::map<std::string, std::string> mirror_image_ids;
+  ASSERT_EQ(-ENOENT, mirror_image_list(&ioctx, "", 0, &mirror_image_ids));
 
   cls::rbd::MirrorImage image1("uuid1", cls::rbd::MIRROR_IMAGE_STATE_ENABLED);
   cls::rbd::MirrorImage image2("uuid2", cls::rbd::MIRROR_IMAGE_STATE_DISABLING);
@@ -1382,10 +1382,14 @@ TEST_F(TestClsRbd, mirror_image) {
 
   ASSERT_EQ(0, mirror_image_set(&ioctx, "image_id1", image1));
   ASSERT_EQ(0, mirror_image_set(&ioctx, "image_id2", image2));
-  ASSERT_EQ(-EEXIST, mirror_image_set(&ioctx, "image_id1", image2));
-  ASSERT_EQ(-EEXIST, mirror_image_set(&ioctx, "image_id2", image3));
+  ASSERT_EQ(-EINVAL, mirror_image_set(&ioctx, "image_id1", image2));
+  ASSERT_EQ(-EEXIST, mirror_image_set(&ioctx, "image_id3", image2));
   ASSERT_EQ(0, mirror_image_set(&ioctx, "image_id3", image3));
 
+  std::string image_id;
+  ASSERT_EQ(0, mirror_image_get_image_id(&ioctx, "uuid2", &image_id));
+  ASSERT_EQ("image_id2", image_id);
+
   cls::rbd::MirrorImage read_image;
   ASSERT_EQ(0, mirror_image_get(&ioctx, "image_id1", &read_image));
   ASSERT_EQ(read_image, image1);
@@ -1394,17 +1398,22 @@ TEST_F(TestClsRbd, mirror_image) {
   ASSERT_EQ(0, mirror_image_get(&ioctx, "image_id3", &read_image));
   ASSERT_EQ(read_image, image3);
 
-  ASSERT_EQ(0, mirror_image_list(&ioctx, &image_ids));
-  vector<string> expected_image_ids = {
-    {"image_id1"}, {"image_id2"}, {"image_id3"}};
-  ASSERT_EQ(expected_image_ids, image_ids);
+  ASSERT_EQ(0, mirror_image_list(&ioctx, "", 1, &mirror_image_ids));
+  std::map<std::string, std::string> expected_mirror_image_ids = {
+    {"image_id1", "uuid1"}};
+  ASSERT_EQ(expected_mirror_image_ids, mirror_image_ids);
+
+  ASSERT_EQ(0, mirror_image_list(&ioctx, "image_id1", 2, &mirror_image_ids));
+  expected_mirror_image_ids = {{"image_id2", "uuid2"}, {"image_id3", "uuid3"}};
+  ASSERT_EQ(expected_mirror_image_ids, mirror_image_ids);
 
   ASSERT_EQ(0, mirror_image_remove(&ioctx, "image_id2"));
+  ASSERT_EQ(-ENOENT, mirror_image_get_image_id(&ioctx, "uuid2", &image_id));
   ASSERT_EQ(-EBUSY, mirror_image_remove(&ioctx, "image_id1"));
 
-  ASSERT_EQ(0, mirror_image_list(&ioctx, &image_ids));
-  expected_image_ids = {{"image_id1"}, {"image_id3"}};
-  ASSERT_EQ(expected_image_ids, image_ids);
+  ASSERT_EQ(0, mirror_image_list(&ioctx, "", 3, &mirror_image_ids));
+  expected_mirror_image_ids = {{"image_id1", "uuid1"}, {"image_id3", "uuid3"}};
+  ASSERT_EQ(expected_mirror_image_ids, mirror_image_ids);
 
   image1.state = cls::rbd::MIRROR_IMAGE_STATE_DISABLING;
   image3.state = cls::rbd::MIRROR_IMAGE_STATE_DISABLING;
@@ -1415,7 +1424,7 @@ TEST_F(TestClsRbd, mirror_image) {
   ASSERT_EQ(0, mirror_image_remove(&ioctx, "image_id1"));
   ASSERT_EQ(0, mirror_image_remove(&ioctx, "image_id3"));
 
-  ASSERT_EQ(0, mirror_image_list(&ioctx, &image_ids));
-  expected_image_ids = {};
-  ASSERT_EQ(expected_image_ids, image_ids);
+  ASSERT_EQ(0, mirror_image_list(&ioctx, "", 3, &mirror_image_ids));
+  expected_mirror_image_ids = {};
+  ASSERT_EQ(expected_mirror_image_ids, mirror_image_ids);
 }
diff --git a/src/test/encoding/check-generated.sh b/src/test/encoding/check-generated.sh
index 7c33e54..ee55fab 100755
--- a/src/test/encoding/check-generated.sh
+++ b/src/test/encoding/check-generated.sh
@@ -1,5 +1,7 @@
 #!/bin/sh -e
 
+source ../qa/workunits/ceph-helpers.sh
+
 dir=$1
 
 set -e
@@ -17,23 +19,35 @@ for type in `./ceph-dencoder list_types`; do
     num=`./ceph-dencoder type $type count_tests`
     echo "$num $type"
     for n in `seq 1 1 $num 2>/dev/null`; do
-	if ! ./ceph-dencoder type $type select_test $n encode decode; then
+	safe_type=$type
+	# BitVector<2> needs some escaping to avoid bash issues with <>
+	if [ "$type" = "BitVector<2>" ]; then
+	    safe_type="BitVector\<2\>"
+	fi
+
+	pids=""
+	run_in_background pids bash -c "./ceph-dencoder type $safe_type select_test $n dump_json > $tmp1"
+	run_in_background pids bash -c "./ceph-dencoder type $safe_type select_test $n encode decode dump_json > $tmp2"
+	run_in_background pids bash -c "./ceph-dencoder type $safe_type select_test $n copy dump_json > $tmp3"
+	run_in_background pids bash -c "./ceph-dencoder type $safe_type select_test $n copy_ctor dump_json > $tmp4"
+	wait_background pids
+
+	if [ $? -ne 0 ]; then
 	    echo "**** $type test $n encode+decode check failed ****"
 	    echo "   ceph-dencoder type $type select_test $n encode decode"
 	    failed=$(($failed + 3))
 	    continue
 	fi
 
-	./ceph-dencoder type $type select_test $n dump_json > $tmp1
-	./ceph-dencoder type $type select_test $n encode decode dump_json > $tmp2
-	./ceph-dencoder type $type select_test $n copy dump_json > $tmp3
-	./ceph-dencoder type $type select_test $n copy_ctor dump_json > $tmp4
-
 	# nondeterministic classes may dump nondeterministically.  compare
 	# the sorted json output.  this is a weaker test, but is better
 	# than nothing.
-	if ! ./ceph-dencoder type $type is_deterministic
-	then
+	deterministic=0
+	if ./ceph-dencoder type $type is_deterministic; then
+	    deterministic=1
+	fi
+
+	if [ $deterministic -eq 0 ]; then
 	    echo "  sorting json output for nondeterministic object"
 	    for f in $tmp1 $tmp2 $tmp3 $tmp4; do
 		sort $f | sed 's/,$//' > $f.new
@@ -65,10 +79,11 @@ for type in `./ceph-dencoder list_types`; do
 	    failed=$(($failed + 1))
 	fi
 
-	if ./ceph-dencoder type $type is_deterministic
-	then
-	    ./ceph-dencoder type $type select_test $n encode export $tmp1
-	    ./ceph-dencoder type $type select_test $n encode decode encode export $tmp2
+	if [ $deterministic -ne 0 ]; then
+	    run_in_background pids bash -c "./ceph-dencoder type $safe_type select_test $n encode export $tmp1"
+	    run_in_background pids bash -c "./ceph-dencoder type $safe_type select_test $n encode decode encode export $tmp2"
+	    wait_background pids
+
 	    if ! cmp $tmp1 $tmp2; then
 		echo "**** $type test $n binary reencode check failed ****"
 		echo "   ./ceph-dencoder type $type select_test $n encode export $tmp1"
@@ -78,7 +93,6 @@ for type in `./ceph-dencoder list_types`; do
 	    fi
 	fi
 
-
 	numtests=$(($numtests + 3))
     done
 done
diff --git a/src/test/encoding/readable.sh b/src/test/encoding/readable.sh
index 2116f45..42cacb4 100755
--- a/src/test/encoding/readable.sh
+++ b/src/test/encoding/readable.sh
@@ -4,23 +4,25 @@ dir=../ceph-object-corpus
 
 set -e
 
-tmp1=`mktemp /tmp/typ-XXXXXXXXX`
-tmp2=`mktemp /tmp/typ-XXXXXXXXX`
-
 failed=0
 numtests=0
+pids=""
 
 myversion=`./ceph-dencoder version`
+DEBUG=0
+WAITALL_DELAY=.1
+debug() { if [ "$DEBUG" -gt 0 ]; then echo "DEBUG: $*" >&2; fi }
 
-for arversion in `ls -v $dir/archive`; do
-  vdir="$dir/archive/$arversion"
-  #echo $vdir
+test_object() {
+    local type=$1
+    local output_file=$2
+    local failed=0
+    local numtests=0
 
-  if [ ! -d "$vdir/objects" ]; then
-    continue;
-  fi
+    tmp1=`mktemp /tmp/typ-XXXXXXXXX`
+    tmp2=`mktemp /tmp/typ-XXXXXXXXX`
 
-  for type in `ls $vdir/objects`; do
+    rm -f $output_file
     if ./ceph-dencoder type $type 2>/dev/null; then
       #echo "type $type";
       echo "        $vdir/objects/$type"
@@ -84,15 +86,21 @@ for arversion in `ls -v $dir/archive`; do
           continue
         fi;
 
+        ./ceph-dencoder type $type import $vdir/objects/$type/$f decode dump_json > $tmp1 &
+        pid1="$!"
+        ./ceph-dencoder type $type import $vdir/objects/$type/$f decode encode decode dump_json > $tmp2 &
+        pid2="$!"
         #echo "\t$vdir/$type/$f"
-        if ! ./ceph-dencoder type $type import $vdir/objects/$type/$f decode dump_json > $tmp1; then
+        if ! wait $pid1; then
           echo "**** failed to decode $vdir/objects/$type/$f ****"
           failed=$(($failed + 1))
+          rm -f $tmp1 $tmp2
           continue      
         fi
-        if ! ./ceph-dencoder type $type import $vdir/objects/$type/$f decode encode decode dump_json > $tmp2; then
+        if ! wait $pid2; then
           echo "**** failed to decode+encode+decode $vdir/objects/$type/$f ****"
           failed=$(($failed + 1))
+          rm -f $tmp1 $tmp2
           continue
         fi
 
@@ -114,15 +122,86 @@ for arversion in `ls -v $dir/archive`; do
           failed=$(($failed + 1))
         fi
         numtests=$(($numtests + 1))
+        echo "failed=$failed" > $output_file
+        echo "numtests=$numtests" >> $output_file
       done
     else
       echo "skipping unrecognized type $type"
     fi
+
+    rm -f $tmp1 $tmp2
+}
+
+waitall() { # PID...
+   ## Wait for children to exit and indicate whether all exited with 0 status.
+   local errors=0
+   while :; do
+     debug "Processes remaining: $*"
+     for pid in "$@"; do
+       shift
+       if kill -0 "$pid" 2>/dev/null; then
+         debug "$pid is still alive."
+         set -- "$@" "$pid"
+       elif wait "$pid"; then
+         debug "$pid exited with zero exit status."
+       else
+         debug "$pid exited with non-zero exit status."
+         errors=$(($errors + 1))
+       fi
+     done
+     (("$#" > 0)) || break
+     sleep ${WAITALL_DELAY:-1}
+    done
+   [ $errors -eq 0 ]
+}
+
+######
+# MAIN
+######
+
+# Use $MAX_PARALLEL_JOBS jobs if defined, otherwise the number of logical
+# processors
+max_parallel_jobs=${MAX_PARALLEL_JOBS:-$(nproc)}
+
+for arversion in `ls -v $dir/archive`; do
+  vdir="$dir/archive/$arversion"
+  #echo $vdir
+
+  if [ ! -d "$vdir/objects" ]; then
+    continue;
+  fi
+
+  output_file=`mktemp /tmp/typ-XXXXXXXXX`
+  running_jobs=0
+  for type in `ls $vdir/objects`; do
+    test_object $type $output_file.$running_jobs &
+    pids="$pids $!"
+    running_jobs=$(($running_jobs + 1))
+
+    # Once we have spawned enough jobs, wait for them to complete.
+    # Every spawned job has almost the same execution time, so
+    # it's not a big deal if they do not end at the same time.
+    if [ "$running_jobs" -eq "$max_parallel_jobs" ]; then
+        waitall $pids
+        pids=""
+        # Read the output of the jobs to compute failed & numtests.
+        # Tests are run in parallel, but the sums must be computed sequentially
+        # to avoid races between the background jobs.
+        while [ "$running_jobs" -ge 0 ]; do
+            if [ -f $output_file.$running_jobs ]; then
+                read_failed=$(grep "^failed=" $output_file.$running_jobs | cut -d "=" -f 2)
+                read_numtests=$(grep "^numtests=" $output_file.$running_jobs | cut -d "=" -f 2)
+                rm -f $output_file.$running_jobs
+                failed=$(($failed + $read_failed))
+                numtests=$(($numtests + $read_numtests))
+            fi
+            running_jobs=$(($running_jobs - 1))
+        done
+        running_jobs=0
+    fi
   done
 done
 
-rm -f $tmp1 $tmp2
-
 if [ $failed -gt 0 ]; then
   echo "FAILED $failed / $numtests tests."
   exit 1
diff --git a/src/test/encoding/types.h b/src/test/encoding/types.h
index 382c0a3..11dfc50 100644
--- a/src/test/encoding/types.h
+++ b/src/test/encoding/types.h
@@ -245,6 +245,8 @@ TYPE_FEATUREFUL(EUpdate)
 TYPE(librbd::journal::EventEntry)
 TYPE(librbd::journal::ClientData)
 TYPE(librbd::journal::TagData)
+#include "librbd/mirroring_watcher/Types.h"
+TYPE(librbd::mirroring_watcher::NotifyMessage)
 #include "librbd/WatchNotifyTypes.h"
 TYPE(librbd::watch_notify::NotifyMessage)
 TYPE(librbd::watch_notify::ResponseMessage)
diff --git a/src/test/erasure-code/TestErasureCodePlugin.cc b/src/test/erasure-code/TestErasureCodePlugin.cc
index 4691ff6..5b0518e 100644
--- a/src/test/erasure-code/TestErasureCodePlugin.cc
+++ b/src/test/erasure-code/TestErasureCodePlugin.cc
@@ -30,13 +30,21 @@ protected:
 
   class Thread_factory : public Thread {
   public:
+    static void cleanup(void *arg) {
+      ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance();
+      if (instance.lock.is_locked())
+        instance.lock.Unlock();
+    }
+
     virtual void *entry() {
       ErasureCodeProfile profile;
       ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance();
       ErasureCodeInterfaceRef erasure_code;
+      pthread_cleanup_push(cleanup, NULL);
       instance.factory("hangs",
 		       g_conf->erasure_code_dir,
 		       profile, &erasure_code, &cerr);
+      pthread_cleanup_pop(0);
       return NULL;
     }
   };
diff --git a/src/test/erasure-code/test-erasure-code.sh b/src/test/erasure-code/test-erasure-code.sh
index 1328766..a8661f3 100755
--- a/src/test/erasure-code/test-erasure-code.sh
+++ b/src/test/erasure-code/test-erasure-code.sh
@@ -25,7 +25,7 @@ function run() {
     export CEPH_MON="127.0.0.1:7101" # git grep '\<7101\>' : there must be only one
     export CEPH_ARGS
     CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
-    CEPH_ARGS+="--mon-host=$CEPH_MON "
+    CEPH_ARGS+="--mon-host=$CEPH_MON --mon-osd-prime-pg-temp=false"
 
     setup $dir || return 1
     run_mon $dir a || return 1
diff --git a/src/test/fedora-21/ceph.spec.in b/src/test/fedora-21/ceph.spec.in
index b52d7e2..3a5a6f7 100644
--- a/src/test/fedora-21/ceph.spec.in
+++ b/src/test/fedora-21/ceph.spec.in
@@ -27,6 +27,10 @@
 %bcond_with selinux
 %endif
 
+# LTTng-UST enabled on Fedora, RHEL 6+, and SLES 12
+%if 0%{?fedora} || 0%{?rhel} >= 6 || 0%{?suse_version} == 1315
+%bcond_without lttng
+%endif
 
 %if (0%{?el5} || (0%{?rhel_version} >= 500 && 0%{?rhel_version} <= 600))
 %{!?python_sitelib: %global python_sitelib %(%{__python} -c "from distutils.sysconfig import get_python_lib; print(get_python_lib())")}
@@ -62,11 +66,6 @@ restorecon -R /var/log/radosgw > /dev/null 2>&1;
 %{!?tmpfiles_create: %global tmpfiles_create systemd-tmpfiles --create}
 %endif
 
-# LTTng-UST enabled on Fedora, RHEL 6, and SLES 12
-%if 0%{?fedora} || 0%{?rhel} == 6 || 0%{?suse_version} == 1315
-%global _with_lttng 1
-%endif
-
 # unify libexec for all targets
 %global _libexecdir %{_exec_prefix}/lib
 
@@ -186,7 +185,7 @@ BuildRequires:  boost-random
 BuildRequires:	python-argparse
 %endif
 # lttng and babeltrace for rbd-replay-prep
-%if 0%{?_with_lttng}
+%if %{with lttng}
 %if 0%{?fedora} || 0%{?rhel}
 BuildRequires:	lttng-ust-devel
 BuildRequires:	libbabeltrace-devel
@@ -685,6 +684,9 @@ export RPM_OPT_FLAGS=`echo $RPM_OPT_FLAGS | sed -e 's/i386/i486/'`
                 --libexecdir=%{_libexecdir} \
 		--localstatedir=/var \
 		--sysconfdir=/etc \
+%if 0%{?rhel} && ! 0%{?centos}
+                --enable-subman \
+%endif
 %if 0%{?_with_systemd}
 		--with-systemdsystemunitdir=%_unitdir \
 %endif
@@ -702,6 +704,10 @@ export RPM_OPT_FLAGS=`echo $RPM_OPT_FLAGS | sed -e 's/i386/i486/'`
 %endif
 		--with-librocksdb-static=check \
 		--with-radosgw \
+%if %{without lttng}
+		--without-lttng \
+		--without-babeltrace \
+%endif
 		$CEPH_EXTRA_CONFIGURE_ARGS \
 		%{?_with_ocf} \
 		%{?_with_tcmalloc} \
@@ -858,7 +864,7 @@ rm -rf $RPM_BUILD_ROOT
 %{_libdir}/ceph/erasure-code/libec_*.so*
 %dir %{_libdir}/ceph/compressor
 %{_libdir}/ceph/compressor/libceph_*.so*
-%if 0%{?_with_lttng}
+%if %{with lttng}
 %{_libdir}/libos_tp.so*
 %{_libdir}/libosd_tp.so*
 %endif
@@ -977,7 +983,7 @@ rm -rf $RPM_BUILD_ROOT
 %{_bindir}/rbd-replay
 %{_bindir}/rbd-replay-many
 %{_bindir}/rbdmap
-%if 0%{?_with_lttng}
+%if %{with lttng}
 %{_bindir}/rbd-replay-prep
 %endif
 %{_bindir}/ceph-post-file
@@ -994,6 +1000,7 @@ rm -rf $RPM_BUILD_ROOT
 %{_mandir}/man8/ceph.8*
 %{_mandir}/man8/rados.8*
 %{_mandir}/man8/rbd.8*
+%{_mandir}/man8/rbdmap.8*
 %{_mandir}/man8/rbd-replay.8*
 %{_mandir}/man8/rbd-replay-many.8*
 %{_mandir}/man8/rbd-replay-prep.8*
@@ -1017,19 +1024,22 @@ rm -rf $RPM_BUILD_ROOT
 %attr(750,ceph,ceph) %dir %{_localstatedir}/lib/ceph/
 
 %pre common
-CEPH_GROUP_ID=""
-CEPH_USER_ID=""
+CEPH_GROUP_ID=167
+CEPH_USER_ID=167
 %if 0%{?rhel} || 0%{?fedora}
-CEPH_GROUP_ID="-g 167"
-CEPH_USER_ID="-u 167"
-%endif
-%if 0%{?rhel} || 0%{?fedora}
-%{_sbindir}/groupadd ceph $CEPH_GROUP_ID -o -r 2>/dev/null || :
-%{_sbindir}/useradd ceph $CEPH_USER_ID -o -r -g ceph -s /sbin/nologin -c "Ceph daemons" -d %{_localstatedir}/lib/ceph 2> /dev/null || :
+%{_sbindir}/groupadd ceph -g $CEPH_GROUP_ID -o -r 2>/dev/null || :
+%{_sbindir}/useradd ceph -u $CEPH_USER_ID -o -r -g ceph -s /sbin/nologin -c "Ceph daemons" -d %{_localstatedir}/lib/ceph 2>/dev/null || :
 %endif
 %if 0%{?suse_version}
-getent group ceph >/dev/null || groupadd -r ceph
-getent passwd ceph >/dev/null || useradd -r -g ceph -d %{_localstatedir}/lib/ceph -s /sbin/nologin -c "Ceph daemons" ceph
+if ! getent group ceph >/dev/null ; then
+    CEPH_GROUP_ID_OPTION=""
+    getent group $CEPH_GROUP_ID >/dev/null || CEPH_GROUP_ID_OPTION="-g $CEPH_GROUP_ID"
+    groupadd ceph $CEPH_GROUP_ID_OPTION -r 2>/dev/null || :
+fi
+if ! getent passwd ceph >/dev/null ; then
+    CEPH_USER_ID_OPTION=""
+    getent passwd $CEPH_USER_ID >/dev/null || CEPH_USER_ID_OPTION="-u $CEPH_USER_ID"
+    useradd ceph $CEPH_USER_ID_OPTION -r -g ceph -s /sbin/nologin -c "Ceph daemons" -d %{_localstatedir}/lib/ceph 2>/dev/null || :
 %endif
 exit 0
 
@@ -1182,6 +1192,9 @@ fi
 %{_mandir}/man8/ceph-clsinfo.8*
 %{_mandir}/man8/ceph-disk.8*
 %{_mandir}/man8/ceph-osd.8*
+%if 0%{?rhel} && ! 0%{?centos}
+/etc/cron.hourly/subman
+%endif
 %if 0%{?_with_systemd}
 %{_unitdir}/ceph-osd at .service
 %{_unitdir}/ceph-osd.target
@@ -1220,7 +1233,7 @@ fi
 %files -n librados2
 %defattr(-,root,root,-)
 %{_libdir}/librados.so.*
-%if 0%{?_with_lttng}
+%if %{with lttng}
 %{_libdir}/librados_tp.so.*
 %endif
 
@@ -1244,7 +1257,7 @@ fi
 %{_includedir}/rados/rados_types.hpp
 %{_includedir}/rados/memory.h
 %{_libdir}/librados.so
-%if 0%{?_with_lttng}
+%if %{with lttng}
 %{_libdir}/librados_tp.so
 %endif
 %{_bindir}/librados-config
@@ -1279,7 +1292,7 @@ fi
 %files -n librbd1
 %defattr(-,root,root,-)
 %{_libdir}/librbd.so.*
-%if 0%{?_with_lttng}
+%if %{with lttng}
 %{_libdir}/librbd_tp.so.*
 %endif
 
@@ -1299,7 +1312,7 @@ ln -sf %{_libdir}/librbd.so.1 /usr/lib64/qemu/librbd.so.1
 %{_includedir}/rbd/librbd.hpp
 %{_includedir}/rbd/features.h
 %{_libdir}/librbd.so
-%if 0%{?_with_lttng}
+%if %{with lttng}
 %{_libdir}/librbd_tp.so
 %endif
 
diff --git a/src/test/librados/list.cc b/src/test/librados/list.cc
index b7f0253..60b97b5 100644
--- a/src/test/librados/list.cc
+++ b/src/test/librados/list.cc
@@ -5,6 +5,10 @@
 #include "include/stringify.h"
 #include "test/librados/test.h"
 #include "test/librados/TestCase.h"
+#include "global/global_context.h"
+#include "global/global_init.h"
+#include "common/ceph_argparse.h"
+#include "common/common_init.h"
 
 #include "include/types.h"
 #include "common/hobject.h"
@@ -956,3 +960,18 @@ TEST_F(LibRadosListPP, EnumerateObjectsFilterPP) {
 
 #pragma GCC diagnostic pop
 #pragma GCC diagnostic warning "-Wpragmas"
+
+int main(int argc, char **argv)
+{
+  ::testing::InitGoogleTest(&argc, argv);
+
+  vector<const char*> args;
+  argv_to_vec(argc, (const char **)argv, args);
+  env_to_vec(args);
+  cout << args << std::endl;
+
+  global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, CODE_ENVIRONMENT_UTILITY, 0);
+  common_init_finish(g_ceph_context);
+
+  return RUN_ALL_TESTS();
+}
diff --git a/src/test/librados/misc.cc b/src/test/librados/misc.cc
index 9cc7f22..ed19e00 100644
--- a/src/test/librados/misc.cc
+++ b/src/test/librados/misc.cc
@@ -60,6 +60,9 @@ TEST(LibRadosMiscConnectFailure, ConnectFailure) {
   ASSERT_EQ(0, rados_conf_parse_env(cluster, NULL));
 
   ASSERT_EQ(0, rados_conf_set(cluster, "client_mount_timeout", "0.000000001"));
+  ASSERT_EQ(0, rados_conf_set(cluster, "debug_monc", "20"));
+  ASSERT_EQ(0, rados_conf_set(cluster, "debug_ms", "1"));
+  ASSERT_EQ(0, rados_conf_set(cluster, "log_to_stderr", "true"));
 
   ASSERT_EQ(-ENOTCONN, rados_monitor_log(cluster, "error",
                                          test_rados_log_cb, NULL));
diff --git a/src/test/librados/pool.cc b/src/test/librados/pool.cc
index c314720..2c01ee9 100644
--- a/src/test/librados/pool.cc
+++ b/src/test/librados/pool.cc
@@ -130,7 +130,8 @@ TEST(LibRadosPools, PoolGetBaseTier) {
   ASSERT_EQ(0, rados_mon_command(cluster, (const char **)cmd, 1, "", 0, NULL, 0, NULL, 0));
 
   cmdstr = "{\"prefix\": \"osd tier cache-mode\", \"pool\": \"" +
-     tier_pool_name + "\", \"mode\":\"readonly\"}";
+     tier_pool_name + "\", \"mode\":\"readonly\", \"sure\": " +
+    "\"--yes-i-really-mean-it\"}";
   cmd[0] = (char *)cmdstr.c_str();
   ASSERT_EQ(0, rados_mon_command(cluster, (const char **)cmd, 1, "", 0, NULL, 0, NULL, 0));
 
diff --git a/src/test/librados/tier.cc b/src/test/librados/tier.cc
index 38bace6..3f9a5b3 100755
--- a/src/test/librados/tier.cc
+++ b/src/test/librados/tier.cc
@@ -619,10 +619,25 @@ TEST_F(LibRadosTwoPoolsPP, Whiteout) {
   cluster.wait_for_latest_osdmap();
 
   // create some whiteouts, verify they behave
-  ASSERT_EQ(0, ioctx.remove("foo"));
+  {
+    ObjectWriteOperation op;
+    op.assert_exists();
+    op.remove();
+    ASSERT_EQ(0, ioctx.operate("foo", &op));
+  }
 
-  ASSERT_EQ(-ENOENT, ioctx.remove("bar"));
-  ASSERT_EQ(-ENOENT, ioctx.remove("bar"));
+  {
+    ObjectWriteOperation op;
+    op.assert_exists();
+    op.remove();
+    ASSERT_EQ(-ENOENT, ioctx.operate("bar", &op));
+  }
+  {
+    ObjectWriteOperation op;
+    op.assert_exists();
+    op.remove();
+    ASSERT_EQ(-ENOENT, ioctx.operate("bar", &op));
+  }
 
   // verify the whiteouts are there in the cache tier
   {
@@ -3181,10 +3196,25 @@ TEST_F(LibRadosTwoPoolsECPP, Whiteout) {
   cluster.wait_for_latest_osdmap();
 
   // create some whiteouts, verify they behave
-  ASSERT_EQ(0, ioctx.remove("foo"));
+  {
+    ObjectWriteOperation op;
+    op.assert_exists();
+    op.remove();
+    ASSERT_EQ(0, ioctx.operate("foo", &op));
+  }
 
-  ASSERT_EQ(-ENOENT, ioctx.remove("bar"));
-  ASSERT_EQ(-ENOENT, ioctx.remove("bar"));
+  {
+    ObjectWriteOperation op;
+    op.assert_exists();
+    op.remove();
+    ASSERT_EQ(-ENOENT, ioctx.operate("bar", &op));
+  }
+  {
+    ObjectWriteOperation op;
+    op.assert_exists();
+    op.remove();
+    ASSERT_EQ(-ENOENT, ioctx.operate("bar", &op));
+  }
 
   // verify the whiteouts are there in the cache tier
   {
@@ -5324,6 +5354,7 @@ int main(int argc, char **argv)
 
   vector<const char*> args;
   argv_to_vec(argc, (const char **)argv, args);
+  env_to_vec(args),
 
   global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, CODE_ENVIRONMENT_UTILITY, 0);
   common_init_finish(g_ceph_context);
diff --git a/src/test/librados_test_stub/LibradosTestStub.cc b/src/test/librados_test_stub/LibradosTestStub.cc
index ad6b395..7176ba4 100644
--- a/src/test/librados_test_stub/LibradosTestStub.cc
+++ b/src/test/librados_test_stub/LibradosTestStub.cc
@@ -1118,6 +1118,11 @@ int cls_cxx_map_set_vals(cls_method_context_t hctx,
 
 int cls_cxx_read(cls_method_context_t hctx, int ofs, int len,
                  bufferlist *outbl) {
+  return cls_cxx_read2(hctx, ofs, len, outbl, 0);
+}
+
+int cls_cxx_read2(cls_method_context_t hctx, int ofs, int len,
+                  bufferlist *outbl, uint32_t op_flags) {
   librados::TestClassHandler::MethodContext *ctx =
     reinterpret_cast<librados::TestClassHandler::MethodContext*>(hctx);
   return ctx->io_ctx_impl->read(ctx->oid, len, ofs, outbl);
@@ -1138,6 +1143,11 @@ int cls_cxx_stat(cls_method_context_t hctx, uint64_t *size, time_t *mtime) {
 
 int cls_cxx_write(cls_method_context_t hctx, int ofs, int len,
                   bufferlist *inbl) {
+  return cls_cxx_write2(hctx, ofs, len, inbl, 0);
+}
+
+int cls_cxx_write2(cls_method_context_t hctx, int ofs, int len,
+                   bufferlist *inbl, uint32_t op_flags) {
   librados::TestClassHandler::MethodContext *ctx =
     reinterpret_cast<librados::TestClassHandler::MethodContext*>(hctx);
   return ctx->io_ctx_impl->write(ctx->oid, *inbl, len, ofs, ctx->snapc);
diff --git a/src/test/librados_test_stub/MockTestMemIoCtxImpl.h b/src/test/librados_test_stub/MockTestMemIoCtxImpl.h
index 40ccc8f..b15b241 100644
--- a/src/test/librados_test_stub/MockTestMemIoCtxImpl.h
+++ b/src/test/librados_test_stub/MockTestMemIoCtxImpl.h
@@ -34,6 +34,18 @@ public:
     return io_ctx_impl;
   }
 
+  MOCK_METHOD4(aio_watch, int(const std::string& o, AioCompletionImpl *c,
+                              uint64_t *handle, librados::WatchCtx2 *ctx));
+  int do_aio_watch(const std::string& o, AioCompletionImpl *c,
+                   uint64_t *handle, librados::WatchCtx2 *ctx) {
+    return TestMemIoCtxImpl::aio_watch(o, c, handle, ctx);
+  }
+
+  MOCK_METHOD2(aio_unwatch, int(uint64_t handle, AioCompletionImpl *c));
+  int do_aio_unwatch(uint64_t handle, AioCompletionImpl *c) {
+    return TestMemIoCtxImpl::aio_unwatch(handle, c);
+  }
+
   MOCK_METHOD7(exec, int(const std::string& oid,
                          TestClassHandler *handler,
                          const char *cls,
@@ -60,6 +72,13 @@ public:
     return TestMemIoCtxImpl::list_watchers(o, out_watchers);
   }
 
+  MOCK_METHOD4(notify, int(const std::string& o, bufferlist& bl,
+                           uint64_t timeout_ms, bufferlist *pbl));
+  int do_notify(const std::string& o, bufferlist& bl,
+                uint64_t timeout_ms, bufferlist *pbl) {
+    return TestMemIoCtxImpl::notify(o, bl, timeout_ms, pbl);
+  }
+
   MOCK_METHOD4(read, int(const std::string& oid,
                          size_t len,
                          uint64_t off,
@@ -116,9 +135,12 @@ public:
   void default_to_parent() {
     using namespace ::testing;
 
+    ON_CALL(*this, aio_watch(_, _, _, _)).WillByDefault(Invoke(this, &MockTestMemIoCtxImpl::do_aio_watch));
+    ON_CALL(*this, aio_unwatch(_, _)).WillByDefault(Invoke(this, &MockTestMemIoCtxImpl::do_aio_unwatch));
     ON_CALL(*this, exec(_, _, _, _, _, _, _)).WillByDefault(Invoke(this, &MockTestMemIoCtxImpl::do_exec));
     ON_CALL(*this, list_snaps(_, _)).WillByDefault(Invoke(this, &MockTestMemIoCtxImpl::do_list_snaps));
     ON_CALL(*this, list_watchers(_, _)).WillByDefault(Invoke(this, &MockTestMemIoCtxImpl::do_list_watchers));
+    ON_CALL(*this, notify(_, _, _, _)).WillByDefault(Invoke(this, &MockTestMemIoCtxImpl::do_notify));
     ON_CALL(*this, read(_, _, _, _)).WillByDefault(Invoke(this, &MockTestMemIoCtxImpl::do_read));
     ON_CALL(*this, remove(_, _)).WillByDefault(Invoke(this, &MockTestMemIoCtxImpl::do_remove));
     ON_CALL(*this, selfmanaged_snap_create(_)).WillByDefault(Invoke(this, &MockTestMemIoCtxImpl::do_selfmanaged_snap_create));
diff --git a/src/test/librbd/exclusive_lock/test_mock_AcquireRequest.cc b/src/test/librbd/exclusive_lock/test_mock_AcquireRequest.cc
index 9d67982..c7c9428 100644
--- a/src/test/librbd/exclusive_lock/test_mock_AcquireRequest.cc
+++ b/src/test/librbd/exclusive_lock/test_mock_AcquireRequest.cc
@@ -5,6 +5,7 @@
 #include "test/librbd/test_support.h"
 #include "test/librbd/mock/MockImageCtx.h"
 #include "test/librbd/mock/MockJournal.h"
+#include "test/librbd/mock/MockJournalPolicy.h"
 #include "test/librbd/mock/MockObjectMap.h"
 #include "test/librados_test_stub/MockTestMemIoCtxImpl.h"
 #include "test/librados_test_stub/MockTestMemRadosClient.h"
@@ -87,14 +88,17 @@ public:
                   .WillOnce(CompleteContext(0, mock_image_ctx.image_ctx->op_work_queue));
   }
 
-  void expect_is_journal_tag_owner(MockJournal &mock_journal, bool owner) {
-    EXPECT_CALL(mock_journal, is_tag_owner()).WillOnce(Return(owner));
+  void expect_get_journal_policy(MockImageCtx &mock_image_ctx,
+                                 MockJournalPolicy &mock_journal_policy) {
+    EXPECT_CALL(mock_image_ctx, get_journal_policy())
+                  .WillOnce(Return(&mock_journal_policy));
   }
 
   void expect_allocate_journal_tag(MockImageCtx &mock_image_ctx,
-                                   MockJournal &mock_journal, int r) {
-    EXPECT_CALL(mock_journal, allocate_tag("", _))
-                  .WillOnce(WithArg<1>(CompleteContext(r, mock_image_ctx.image_ctx->op_work_queue)));
+                                   MockJournalPolicy &mock_journal_policy,
+                                   int r) {
+    EXPECT_CALL(mock_journal_policy, allocate_tag_on_lock(_))
+                  .WillOnce(CompleteContext(r, mock_image_ctx.image_ctx->op_work_queue));
   }
 
   void expect_get_lock_info(MockImageCtx &mock_image_ctx, int r,
@@ -185,11 +189,12 @@ TEST_F(TestMockExclusiveLockAcquireRequest, Success) {
   expect_open_object_map(mock_image_ctx, mock_object_map);
 
   MockJournal mock_journal;
+  MockJournalPolicy mock_journal_policy;
   expect_test_features(mock_image_ctx, RBD_FEATURE_JOURNALING, true);
   expect_create_journal(mock_image_ctx, &mock_journal);
   expect_open_journal(mock_image_ctx, mock_journal, 0);
-  expect_is_journal_tag_owner(mock_journal, true);
-  expect_allocate_journal_tag(mock_image_ctx, mock_journal, 0);
+  expect_get_journal_policy(mock_image_ctx, mock_journal_policy);
+  expect_allocate_journal_tag(mock_image_ctx, mock_journal_policy, 0);
 
   C_SaferCond acquire_ctx;
   C_SaferCond ctx;
@@ -247,11 +252,12 @@ TEST_F(TestMockExclusiveLockAcquireRequest, SuccessObjectMapDisabled) {
   expect_test_features(mock_image_ctx, RBD_FEATURE_OBJECT_MAP, false);
 
   MockJournal mock_journal;
+  MockJournalPolicy mock_journal_policy;
   expect_test_features(mock_image_ctx, RBD_FEATURE_JOURNALING, true);
   expect_create_journal(mock_image_ctx, &mock_journal);
   expect_open_journal(mock_image_ctx, mock_journal, 0);
-  expect_is_journal_tag_owner(mock_journal, true);
-  expect_allocate_journal_tag(mock_image_ctx, mock_journal, 0);
+  expect_get_journal_policy(mock_image_ctx, mock_journal_policy);
+  expect_allocate_journal_tag(mock_image_ctx, mock_journal_policy, 0);
 
   C_SaferCond acquire_ctx;
   C_SaferCond ctx;
@@ -297,41 +303,6 @@ TEST_F(TestMockExclusiveLockAcquireRequest, JournalError) {
   ASSERT_EQ(-EINVAL, ctx.wait());
 }
 
-TEST_F(TestMockExclusiveLockAcquireRequest, NotJournalTagOwner) {
-  REQUIRE_FEATURE(RBD_FEATURE_EXCLUSIVE_LOCK);
-
-  librbd::ImageCtx *ictx;
-  ASSERT_EQ(0, open_image(m_image_name, &ictx));
-
-  MockImageCtx mock_image_ctx(*ictx);
-  expect_op_work_queue(mock_image_ctx);
-
-  InSequence seq;
-  expect_flush_notifies(mock_image_ctx);
-  expect_lock(mock_image_ctx, 0);
-
-  MockObjectMap *mock_object_map = new MockObjectMap();
-  expect_test_features(mock_image_ctx, RBD_FEATURE_OBJECT_MAP, true);
-  expect_create_object_map(mock_image_ctx, mock_object_map);
-  expect_open_object_map(mock_image_ctx, *mock_object_map);
-
-  MockJournal *mock_journal = new MockJournal();
-  expect_test_features(mock_image_ctx, RBD_FEATURE_JOURNALING, true);
-  expect_create_journal(mock_image_ctx, mock_journal);
-  expect_open_journal(mock_image_ctx, *mock_journal, 0);
-  expect_is_journal_tag_owner(*mock_journal, false);
-  expect_close_journal(mock_image_ctx, *mock_journal);
-  expect_close_object_map(mock_image_ctx, *mock_object_map);
-
-  C_SaferCond acquire_ctx;
-  C_SaferCond ctx;
-  MockAcquireRequest *req = MockAcquireRequest::create(mock_image_ctx,
-                                                       TEST_COOKIE,
-                                                       &acquire_ctx, &ctx);
-  req->send();
-  ASSERT_EQ(-EPERM, ctx.wait());
-}
-
 TEST_F(TestMockExclusiveLockAcquireRequest, AllocateJournalTagError) {
   REQUIRE_FEATURE(RBD_FEATURE_EXCLUSIVE_LOCK);
 
@@ -351,11 +322,12 @@ TEST_F(TestMockExclusiveLockAcquireRequest, AllocateJournalTagError) {
   expect_open_object_map(mock_image_ctx, *mock_object_map);
 
   MockJournal *mock_journal = new MockJournal();
+  MockJournalPolicy mock_journal_policy;
   expect_test_features(mock_image_ctx, RBD_FEATURE_JOURNALING, true);
   expect_create_journal(mock_image_ctx, mock_journal);
   expect_open_journal(mock_image_ctx, *mock_journal, 0);
-  expect_is_journal_tag_owner(*mock_journal, true);
-  expect_allocate_journal_tag(mock_image_ctx, *mock_journal, -ESTALE);
+  expect_get_journal_policy(mock_image_ctx, mock_journal_policy);
+  expect_allocate_journal_tag(mock_image_ctx, mock_journal_policy, -EPERM);
   expect_close_journal(mock_image_ctx, *mock_journal);
   expect_close_object_map(mock_image_ctx, *mock_object_map);
 
@@ -365,7 +337,7 @@ TEST_F(TestMockExclusiveLockAcquireRequest, AllocateJournalTagError) {
                                                        TEST_COOKIE,
                                                        &acquire_ctx, &ctx);
   req->send();
-  ASSERT_EQ(-ESTALE, ctx.wait());
+  ASSERT_EQ(-EPERM, ctx.wait());
 }
 
 TEST_F(TestMockExclusiveLockAcquireRequest, LockBusy) {
diff --git a/src/test/librbd/mock/MockImageCtx.cc b/src/test/librbd/mock/MockImageCtx.cc
new file mode 100644
index 0000000..b207eab
--- /dev/null
+++ b/src/test/librbd/mock/MockImageCtx.cc
@@ -0,0 +1,10 @@
+// -*- mode:C; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "test/librbd/mock/MockImageCtx.h"
+
+namespace librbd {
+
+MockImageCtx* MockImageCtx::s_instance = nullptr;
+
+} // namespace librbd
diff --git a/src/test/librbd/mock/MockImageCtx.h b/src/test/librbd/mock/MockImageCtx.h
index 60d5fbb..170e9a1 100644
--- a/src/test/librbd/mock/MockImageCtx.h
+++ b/src/test/librbd/mock/MockImageCtx.h
@@ -4,6 +4,7 @@
 #ifndef CEPH_TEST_LIBRBD_MOCK_IMAGE_CTX_H
 #define CEPH_TEST_LIBRBD_MOCK_IMAGE_CTX_H
 
+#include "include/rados/librados.hpp"
 #include "test/librbd/mock/MockAioImageRequestWQ.h"
 #include "test/librbd/mock/MockContextWQ.h"
 #include "test/librbd/mock/MockExclusiveLock.h"
@@ -17,6 +18,7 @@
 #include "common/WorkQueue.h"
 #include "librbd/ImageCtx.h"
 #include "gmock/gmock.h"
+#include <string>
 
 namespace librbd {
 
@@ -25,6 +27,15 @@ template <typename> class ResizeRequest;
 }
 
 struct MockImageCtx {
+  static MockImageCtx *s_instance;
+  static MockImageCtx *create(const std::string &image_name,
+                              const std::string &image_id,
+                              const char *snap, librados::IoCtx& p,
+                              bool read_only) {
+    assert(s_instance != nullptr);
+    return s_instance;
+  }
+
   MockImageCtx(librbd::ImageCtx &image_ctx)
     : image_ctx(&image_ctx),
       cct(image_ctx.cct),
@@ -55,6 +66,7 @@ struct MockImageCtx {
       object_prefix(image_ctx.object_prefix),
       header_oid(image_ctx.header_oid),
       id(image_ctx.id),
+      name(image_ctx.name),
       parent_md(image_ctx.parent_md),
       layout(image_ctx.layout),
       aio_work_queue(new MockAioImageRequestWQ()),
@@ -148,6 +160,8 @@ struct MockImageCtx {
   MOCK_METHOD0(notify_update, void());
   MOCK_METHOD1(notify_update, void(Context *));
 
+  MOCK_CONST_METHOD0(get_journal_policy, journal::Policy*());
+
   ImageCtx *image_ctx;
   CephContext *cct;
 
@@ -188,6 +202,7 @@ struct MockImageCtx {
   std::string object_prefix;
   std::string header_oid;
   std::string id;
+  std::string name;
   parent_info parent_md;
 
   file_layout_t layout;
diff --git a/src/test/librbd/mock/MockImageState.h b/src/test/librbd/mock/MockImageState.h
index 6f75ecd..8f5f206 100644
--- a/src/test/librbd/mock/MockImageState.h
+++ b/src/test/librbd/mock/MockImageState.h
@@ -13,6 +13,9 @@ namespace librbd {
 struct MockImageState {
   MOCK_CONST_METHOD0(is_refresh_required, bool());
   MOCK_METHOD1(refresh, void(Context*));
+
+  MOCK_METHOD0(close, int());
+  MOCK_METHOD1(close, void(Context*));
 };
 
 } // namespace librbd
diff --git a/src/test/librbd/mock/MockJournal.h b/src/test/librbd/mock/MockJournal.h
index 1393399..a4637a4 100644
--- a/src/test/librbd/mock/MockJournal.h
+++ b/src/test/librbd/mock/MockJournal.h
@@ -17,7 +17,12 @@ struct MockJournal {
   MOCK_METHOD1(wait_for_journal_ready, void(Context *));
 
   MOCK_CONST_METHOD0(is_tag_owner, bool());
-  MOCK_METHOD2(allocate_tag, void(const std::string &, Context *));
+  MOCK_METHOD6(allocate_tag, void(const std::string &mirror_uuid,
+                                  const std::string &predecessor_mirror_uuid,
+                                  bool predecessor_commit_valid,
+                                  uint64_t predecessor_tag_tid,
+                                  uint64_t predecessor_entry_tid,
+                                  Context *on_finish));
 
   MOCK_METHOD1(open, void(Context *));
   MOCK_METHOD1(close, void(Context *));
diff --git a/src/test/librbd/mock/MockJournalPolicy.h b/src/test/librbd/mock/MockJournalPolicy.h
new file mode 100644
index 0000000..e7debfa
--- /dev/null
+++ b/src/test/librbd/mock/MockJournalPolicy.h
@@ -0,0 +1,21 @@
+// -*- mode:C; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_TEST_LIBRBD_MOCK_JOURNAL_POLICY_H
+#define CEPH_TEST_LIBRBD_MOCK_JOURNAL_POLICY_H
+
+#include "librbd/journal/Policy.h"
+#include "gmock/gmock.h"
+
+namespace librbd {
+
+struct MockJournalPolicy : public journal::Policy {
+
+  MOCK_METHOD1(allocate_tag_on_lock, void(Context*));
+  MOCK_METHOD1(cancel_external_replay, void(Context*));
+
+};
+
+} // namespace librbd
+
+#endif // CEPH_TEST_LIBRBD_MOCK_JOURNAL_POLICY_H
diff --git a/src/test/librbd/mock/MockOperations.h b/src/test/librbd/mock/MockOperations.h
index 6150125..c92578a 100644
--- a/src/test/librbd/mock/MockOperations.h
+++ b/src/test/librbd/mock/MockOperations.h
@@ -5,6 +5,7 @@
 #define CEPH_TEST_LIBRBD_MOCK_OPERATIONS_H
 
 #include "include/int_types.h"
+#include "include/rbd/librbd.hpp"
 #include "gmock/gmock.h"
 
 class Context;
diff --git a/src/test/librbd/test_MirroringWatcher.cc b/src/test/librbd/test_MirroringWatcher.cc
new file mode 100644
index 0000000..1508499
--- /dev/null
+++ b/src/test/librbd/test_MirroringWatcher.cc
@@ -0,0 +1,100 @@
+// -*- mode:C; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "test/librbd/test_fixture.h"
+#include "test/librbd/test_support.h"
+#include "include/rbd_types.h"
+#include "librbd/MirroringWatcher.h"
+#include "gtest/gtest.h"
+#include "gmock/gmock.h"
+#include <list>
+
+void register_test_mirroring_watcher() {
+}
+
+namespace librbd {
+
+namespace {
+
+struct MockMirroringWatcher : public MirroringWatcher<> {
+  std::string oid;
+
+  MockMirroringWatcher(ImageCtx &image_ctx)
+    : MirroringWatcher<>(image_ctx.md_ctx, image_ctx.op_work_queue) {
+  }
+
+  MOCK_METHOD2(handle_mode_updated, void(cls::rbd::MirrorMode, Context*));
+  MOCK_METHOD4(handle_image_updated, void(cls::rbd::MirrorImageState,
+                                          const std::string &,
+                                          const std::string &,
+                                          Context*));
+};
+
+} // anonymous namespace
+
+using ::testing::_;
+using ::testing::Invoke;
+using ::testing::StrEq;
+using ::testing::WithArg;
+
+class TestMirroringWatcher : public TestFixture {
+public:
+  virtual void SetUp() {
+    TestFixture::SetUp();
+
+    bufferlist bl;
+    ASSERT_EQ(0, m_ioctx.write_full(RBD_MIRRORING, bl));
+
+    librbd::ImageCtx *ictx;
+    ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+    m_image_watcher = new MockMirroringWatcher(*ictx);
+    C_SaferCond ctx;
+    m_image_watcher->register_watch(&ctx);
+    if (ctx.wait() != 0) {
+      delete m_image_watcher;
+      m_image_watcher = nullptr;
+      FAIL();
+    }
+  }
+
+  virtual void TearDown() {
+    if (m_image_watcher != nullptr) {
+      C_SaferCond ctx;
+      m_image_watcher->unregister_watch(&ctx);
+      ASSERT_EQ(0, ctx.wait());
+      delete m_image_watcher;
+    }
+
+    TestFixture::TearDown();
+  }
+
+  MockMirroringWatcher *m_image_watcher = nullptr;
+};
+
+TEST_F(TestMirroringWatcher, ModeUpdated) {
+  EXPECT_CALL(*m_image_watcher, handle_mode_updated(cls::rbd::MIRROR_MODE_DISABLED, _))
+    .WillRepeatedly(WithArg<1>(Invoke([](Context *on_finish) {
+        on_finish->complete(0);
+      })));
+
+  ASSERT_EQ(0, MockMirroringWatcher::notify_mode_updated(m_ioctx, cls::rbd::MIRROR_MODE_DISABLED));
+
+}
+
+TEST_F(TestMirroringWatcher, ImageStatusUpdated) {
+  EXPECT_CALL(*m_image_watcher,
+              handle_image_updated(cls::rbd::MIRROR_IMAGE_STATE_ENABLED,
+                                   StrEq("image id"), StrEq("global image id"),
+                                   _))
+    .WillRepeatedly(WithArg<3>(Invoke([](Context *on_finish) {
+        on_finish->complete(0);
+      })));
+
+  ASSERT_EQ(0, MockMirroringWatcher::notify_image_updated(m_ioctx,
+                                                          cls::rbd::MIRROR_IMAGE_STATE_ENABLED,
+                                                          "image id",
+                                                          "global image id"));
+}
+
+} // namespace librbd
diff --git a/src/test/librbd/test_main.cc b/src/test/librbd/test_main.cc
index c3d7002..4ae9f43 100644
--- a/src/test/librbd/test_main.cc
+++ b/src/test/librbd/test_main.cc
@@ -16,6 +16,7 @@ extern void register_test_journal_entries();
 extern void register_test_journal_replay();
 extern void register_test_object_map();
 extern void register_test_mirroring();
+extern void register_test_mirroring_watcher();
 #endif // TEST_LIBRBD_INTERNALS
 
 int main(int argc, char **argv)
@@ -28,6 +29,7 @@ int main(int argc, char **argv)
   register_test_journal_replay();
   register_test_object_map();
   register_test_mirroring();
+  register_test_mirroring_watcher();
 #endif // TEST_LIBRBD_INTERNALS
 
   ::testing::InitGoogleTest(&argc, argv);
diff --git a/src/test/librbd/test_mirroring.cc b/src/test/librbd/test_mirroring.cc
index 6ba7eac..e7ccf3f 100644
--- a/src/test/librbd/test_mirroring.cc
+++ b/src/test/librbd/test_mirroring.cc
@@ -42,9 +42,14 @@ public:
     TestFixture::TearDown();
   }
 
+  virtual void SetUp() {
+    ASSERT_EQ(0, _rados.ioctx_create(_pool_name.c_str(), m_ioctx));
+  }
+
   std::string image_name = "mirrorimg1";
 
-  void check_mirror_image_enable(uint64_t features,
+  void check_mirror_image_enable(rbd_mirror_mode_t mirror_mode,
+                                 uint64_t features,
                                  int expected_r,
                                  rbd_mirror_image_state_t mirror_state) {
 
@@ -55,6 +60,8 @@ public:
     librbd::Image image;
     ASSERT_EQ(0, m_rbd.open(m_ioctx, image, image_name.c_str()));
 
+    ASSERT_EQ(0, m_rbd.mirror_mode_set(m_ioctx, mirror_mode));
+
     ASSERT_EQ(expected_r, image.mirror_image_enable());
 
     librbd::mirror_image_info_t mirror_image;
@@ -63,6 +70,32 @@ public:
 
     ASSERT_EQ(0, image.close());
     ASSERT_EQ(0, m_rbd.remove(m_ioctx, image_name.c_str()));
+    ASSERT_EQ(0, m_rbd.mirror_mode_set(m_ioctx, RBD_MIRROR_MODE_DISABLED));
+  }
+
+  void check_mirror_image_disable(rbd_mirror_mode_t mirror_mode,
+                                  uint64_t features,
+                                  int expected_r,
+                                  rbd_mirror_image_state_t mirror_state) {
+
+    ASSERT_EQ(0, m_rbd.mirror_mode_set(m_ioctx, RBD_MIRROR_MODE_POOL));
+
+    int order = 20;
+    ASSERT_EQ(0, m_rbd.create2(m_ioctx, image_name.c_str(), 4096, features, &order));
+    librbd::Image image;
+    ASSERT_EQ(0, m_rbd.open(m_ioctx, image, image_name.c_str()));
+
+    ASSERT_EQ(0, m_rbd.mirror_mode_set(m_ioctx, mirror_mode));
+
+    ASSERT_EQ(expected_r, image.mirror_image_disable(false));
+
+    librbd::mirror_image_info_t mirror_image;
+    ASSERT_EQ(0, image.mirror_image_get_info(&mirror_image, sizeof(mirror_image)));
+    ASSERT_EQ(mirror_state, mirror_image.state);
+
+    ASSERT_EQ(0, image.close());
+    ASSERT_EQ(0, m_rbd.remove(m_ioctx, image_name.c_str()));
+    ASSERT_EQ(0, m_rbd.mirror_mode_set(m_ioctx, RBD_MIRROR_MODE_DISABLED));
   }
 
   void check_mirroring_on_create(uint64_t features,
@@ -82,6 +115,7 @@ public:
 
     ASSERT_EQ(0, image.close());
     ASSERT_EQ(0, m_rbd.remove(m_ioctx, image_name.c_str()));
+    ASSERT_EQ(0, m_rbd.mirror_mode_set(m_ioctx, RBD_MIRROR_MODE_DISABLED));
   }
 
   void check_mirroring_on_update_features(uint64_t init_features,
@@ -106,25 +140,133 @@ public:
     librbd::mirror_image_info_t mirror_image;
     ASSERT_EQ(0, image.mirror_image_get_info(&mirror_image, sizeof(mirror_image)));
     ASSERT_EQ(mirror_state, mirror_image.state);
+
     ASSERT_EQ(0, image.close());
     ASSERT_EQ(0, m_rbd.remove(m_ioctx, image_name.c_str()));
+    ASSERT_EQ(0, m_rbd.mirror_mode_set(m_ioctx, RBD_MIRROR_MODE_DISABLED));
+  }
+
+  void setup_images_with_mirror_mode(rbd_mirror_mode_t mirror_mode,
+                                        std::vector<uint64_t>& features_vec) {
+
+    ASSERT_EQ(0, m_rbd.mirror_mode_set(m_ioctx, mirror_mode));
+
+    int id = 1;
+    int order = 20;
+    for (const auto& features : features_vec) {
+      std::stringstream img_name("img_");
+      img_name << id++;
+      std::string img_name_str = img_name.str();
+      ASSERT_EQ(0, m_rbd.create2(m_ioctx, img_name_str.c_str(), 2048, features, &order));
+    }
+  }
+
+  void check_mirroring_on_mirror_mode_set(rbd_mirror_mode_t mirror_mode,
+                            std::vector<rbd_mirror_image_state_t>& states_vec) {
+
+    ASSERT_EQ(0, m_rbd.mirror_mode_set(m_ioctx, mirror_mode));
+
+    std::vector< std::tuple<std::string, rbd_mirror_image_state_t> > images;
+    int id = 1;
+    for (const auto& mirror_state : states_vec) {
+      std::stringstream img_name("img_");
+      img_name << id++;
+      std::string img_name_str = img_name.str();
+      librbd::Image image;
+      ASSERT_EQ(0, m_rbd.open(m_ioctx, image, img_name_str.c_str()));
+      images.push_back(std::make_tuple(img_name_str, mirror_state));
+
+      librbd::mirror_image_info_t mirror_image;
+      ASSERT_EQ(0, image.mirror_image_get_info(&mirror_image, sizeof(mirror_image)));
+      ASSERT_EQ(mirror_state, mirror_image.state);
+
+      ASSERT_EQ(0, image.close());
+      ASSERT_EQ(0, m_rbd.remove(m_ioctx, img_name_str.c_str()));
+    }
+  }
+
+  void check_remove_image(rbd_mirror_mode_t mirror_mode, uint64_t features,
+                          bool enable_mirroring) {
+
+    ASSERT_EQ(0, m_rbd.mirror_mode_set(m_ioctx, mirror_mode));
+
+    int order = 20;
+    ASSERT_EQ(0, m_rbd.create2(m_ioctx, image_name.c_str(), 4096, features,
+              &order));
+    librbd::Image image;
+    ASSERT_EQ(0, m_rbd.open(m_ioctx, image, image_name.c_str()));
+
+    if (enable_mirroring) {
+      ASSERT_EQ(0, image.mirror_image_enable());
+    }
+
+    image.close();
+    ASSERT_EQ(0, m_rbd.remove(m_ioctx, image_name.c_str()));
+    ASSERT_EQ(0, m_rbd.mirror_mode_set(m_ioctx, RBD_MIRROR_MODE_DISABLED));
   }
 
 };
 
-TEST_F(TestMirroring, EnableImageMirror) {
+TEST_F(TestMirroring, EnableImageMirror_In_MirrorModeImage) {
   uint64_t features = 0;
   features |= RBD_FEATURE_OBJECT_MAP;
   features |= RBD_FEATURE_EXCLUSIVE_LOCK;
   features |= RBD_FEATURE_JOURNALING;
-  check_mirror_image_enable(features, 0, RBD_MIRROR_IMAGE_ENABLED);
+  check_mirror_image_enable(RBD_MIRROR_MODE_IMAGE, features, 0,
+      RBD_MIRROR_IMAGE_ENABLED);
+}
+
+TEST_F(TestMirroring, EnableImageMirror_In_MirrorModePool) {
+  uint64_t features = 0;
+  features |= RBD_FEATURE_OBJECT_MAP;
+  features |= RBD_FEATURE_EXCLUSIVE_LOCK;
+  features |= RBD_FEATURE_JOURNALING;
+  check_mirror_image_enable(RBD_MIRROR_MODE_POOL, features, -EINVAL,
+      RBD_MIRROR_IMAGE_ENABLED);
+}
+
+TEST_F(TestMirroring, EnableImageMirror_In_MirrorModeDisabled) {
+  uint64_t features = 0;
+  features |= RBD_FEATURE_OBJECT_MAP;
+  features |= RBD_FEATURE_EXCLUSIVE_LOCK;
+  features |= RBD_FEATURE_JOURNALING;
+  check_mirror_image_enable(RBD_MIRROR_MODE_DISABLED, features, -EINVAL,
+      RBD_MIRROR_IMAGE_DISABLED);
+}
+
+TEST_F(TestMirroring, DisableImageMirror_In_MirrorModeImage) {
+  uint64_t features = 0;
+  features |= RBD_FEATURE_OBJECT_MAP;
+  features |= RBD_FEATURE_EXCLUSIVE_LOCK;
+  features |= RBD_FEATURE_JOURNALING;
+  check_mirror_image_disable(RBD_MIRROR_MODE_IMAGE, features, 0,
+      RBD_MIRROR_IMAGE_DISABLED);
+}
+
+TEST_F(TestMirroring, DisableImageMirror_In_MirrorModePool) {
+  uint64_t features = 0;
+  features |= RBD_FEATURE_OBJECT_MAP;
+  features |= RBD_FEATURE_EXCLUSIVE_LOCK;
+  features |= RBD_FEATURE_JOURNALING;
+  check_mirror_image_disable(RBD_MIRROR_MODE_POOL, features, -EINVAL,
+      RBD_MIRROR_IMAGE_ENABLED);
+}
+
+TEST_F(TestMirroring, DisableImageMirror_In_MirrorModeDisabled) {
+  uint64_t features = 0;
+  features |= RBD_FEATURE_OBJECT_MAP;
+  features |= RBD_FEATURE_EXCLUSIVE_LOCK;
+  features |= RBD_FEATURE_JOURNALING;
+  check_mirror_image_disable(RBD_MIRROR_MODE_DISABLED, features, -EINVAL,
+      RBD_MIRROR_IMAGE_DISABLED);
 }
 
 TEST_F(TestMirroring, EnableImageMirror_WithoutJournaling) {
   uint64_t features = 0;
   features |= RBD_FEATURE_OBJECT_MAP;
   features |= RBD_FEATURE_EXCLUSIVE_LOCK;
-  check_mirror_image_enable(features, -EINVAL, RBD_MIRROR_IMAGE_DISABLED);
+  check_mirror_image_enable(RBD_MIRROR_MODE_DISABLED, features, -EINVAL,
+      RBD_MIRROR_IMAGE_DISABLED);
 }
 
 TEST_F(TestMirroring, CreateImage_In_MirrorModeDisabled) {
@@ -217,3 +359,104 @@ TEST_F(TestMirroring, DisableJournaling_In_MirrorModeImage) {
                       RBD_MIRROR_MODE_IMAGE, RBD_MIRROR_IMAGE_ENABLED);
 }
 
+TEST_F(TestMirroring, MirrorModeSet_DisabledMode_To_PoolMode) {
+  std::vector<uint64_t> features_vec;
+  features_vec.push_back(RBD_FEATURE_EXCLUSIVE_LOCK);
+  features_vec.push_back(RBD_FEATURE_EXCLUSIVE_LOCK | RBD_FEATURE_JOURNALING);
+
+  setup_images_with_mirror_mode(RBD_MIRROR_MODE_DISABLED, features_vec);
+
+  std::vector<rbd_mirror_image_state_t> states_vec;
+  states_vec.push_back(RBD_MIRROR_IMAGE_DISABLED);
+  states_vec.push_back(RBD_MIRROR_IMAGE_ENABLED);
+  check_mirroring_on_mirror_mode_set(RBD_MIRROR_MODE_POOL, states_vec);
+}
+
+TEST_F(TestMirroring, MirrorModeSet_PoolMode_To_DisabledMode) {
+  std::vector<uint64_t> features_vec;
+  features_vec.push_back(RBD_FEATURE_EXCLUSIVE_LOCK);
+  features_vec.push_back(RBD_FEATURE_EXCLUSIVE_LOCK | RBD_FEATURE_JOURNALING);
+
+  setup_images_with_mirror_mode(RBD_MIRROR_MODE_POOL, features_vec);
+
+  std::vector<rbd_mirror_image_state_t> states_vec;
+  states_vec.push_back(RBD_MIRROR_IMAGE_DISABLED);
+  states_vec.push_back(RBD_MIRROR_IMAGE_DISABLED);
+  check_mirroring_on_mirror_mode_set(RBD_MIRROR_MODE_DISABLED, states_vec);
+}
+
+TEST_F(TestMirroring, MirrorModeSet_DisabledMode_To_ImageMode) {
+  std::vector<uint64_t> features_vec;
+  features_vec.push_back(RBD_FEATURE_EXCLUSIVE_LOCK);
+  features_vec.push_back(RBD_FEATURE_EXCLUSIVE_LOCK | RBD_FEATURE_JOURNALING);
+
+  setup_images_with_mirror_mode(RBD_MIRROR_MODE_DISABLED, features_vec);
+
+  std::vector<rbd_mirror_image_state_t> states_vec;
+  states_vec.push_back(RBD_MIRROR_IMAGE_DISABLED);
+  states_vec.push_back(RBD_MIRROR_IMAGE_DISABLED);
+  check_mirroring_on_mirror_mode_set(RBD_MIRROR_MODE_IMAGE, states_vec);
+}
+
+
+TEST_F(TestMirroring, MirrorModeSet_PoolMode_To_ImageMode) {
+  std::vector<uint64_t> features_vec;
+  features_vec.push_back(RBD_FEATURE_EXCLUSIVE_LOCK);
+  features_vec.push_back(RBD_FEATURE_EXCLUSIVE_LOCK | RBD_FEATURE_JOURNALING);
+
+  setup_images_with_mirror_mode(RBD_MIRROR_MODE_POOL, features_vec);
+
+  std::vector<rbd_mirror_image_state_t> states_vec;
+  states_vec.push_back(RBD_MIRROR_IMAGE_DISABLED);
+  states_vec.push_back(RBD_MIRROR_IMAGE_ENABLED);
+  check_mirroring_on_mirror_mode_set(RBD_MIRROR_MODE_IMAGE, states_vec);
+}
+
+TEST_F(TestMirroring, MirrorModeSet_ImageMode_To_PoolMode) {
+  std::vector<uint64_t> features_vec;
+  features_vec.push_back(RBD_FEATURE_EXCLUSIVE_LOCK);
+  features_vec.push_back(RBD_FEATURE_EXCLUSIVE_LOCK | RBD_FEATURE_JOURNALING);
+
+  setup_images_with_mirror_mode(RBD_MIRROR_MODE_IMAGE, features_vec);
+
+  std::vector<rbd_mirror_image_state_t> states_vec;
+  states_vec.push_back(RBD_MIRROR_IMAGE_DISABLED);
+  states_vec.push_back(RBD_MIRROR_IMAGE_ENABLED);
+  check_mirroring_on_mirror_mode_set(RBD_MIRROR_MODE_POOL, states_vec);
+}
+
+TEST_F(TestMirroring, MirrorModeSet_ImageMode_To_DisabledMode) {
+  std::vector<uint64_t> features_vec;
+  features_vec.push_back(RBD_FEATURE_EXCLUSIVE_LOCK);
+  features_vec.push_back(RBD_FEATURE_EXCLUSIVE_LOCK | RBD_FEATURE_JOURNALING);
+
+  setup_images_with_mirror_mode(RBD_MIRROR_MODE_POOL, features_vec);
+
+  ASSERT_EQ(0, m_rbd.mirror_mode_set(m_ioctx, RBD_MIRROR_MODE_IMAGE));
+  ASSERT_EQ(-EINVAL, m_rbd.mirror_mode_set(m_ioctx, RBD_MIRROR_MODE_DISABLED));
+  ASSERT_EQ(0, m_rbd.mirror_mode_set(m_ioctx, RBD_MIRROR_MODE_POOL));
+
+  std::vector<rbd_mirror_image_state_t> states_vec;
+  states_vec.push_back(RBD_MIRROR_IMAGE_DISABLED);
+  states_vec.push_back(RBD_MIRROR_IMAGE_DISABLED);
+  check_mirroring_on_mirror_mode_set(RBD_MIRROR_MODE_DISABLED, states_vec);
+}
+
+TEST_F(TestMirroring, RemoveImage_With_MirrorImageEnabled) {
+  check_remove_image(RBD_MIRROR_MODE_IMAGE,
+                     RBD_FEATURE_EXCLUSIVE_LOCK | RBD_FEATURE_JOURNALING,
+                     true);
+}
+
+TEST_F(TestMirroring, RemoveImage_With_MirrorImageDisabled) {
+  check_remove_image(RBD_MIRROR_MODE_IMAGE,
+                     RBD_FEATURE_EXCLUSIVE_LOCK | RBD_FEATURE_JOURNALING,
+                     false);
+}
+
+TEST_F(TestMirroring, RemoveImage_With_ImageWithoutJournal) {
+  check_remove_image(RBD_MIRROR_MODE_IMAGE,
+                     RBD_FEATURE_EXCLUSIVE_LOCK,
+                     false);
+}
+
diff --git a/src/test/librbd/test_mock_Journal.cc b/src/test/librbd/test_mock_Journal.cc
index 050a4d0..881ac16 100644
--- a/src/test/librbd/test_mock_Journal.cc
+++ b/src/test/librbd/test_mock_Journal.cc
@@ -12,6 +12,7 @@
 #include "librbd/Utils.h"
 #include "librbd/journal/Replay.h"
 #include "librbd/journal/Types.h"
+#include "librbd/journal/TypeTraits.h"
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
 #include <functional>
diff --git a/src/test/librbd/test_mock_ObjectWatcher.cc b/src/test/librbd/test_mock_ObjectWatcher.cc
new file mode 100644
index 0000000..f626614
--- /dev/null
+++ b/src/test/librbd/test_mock_ObjectWatcher.cc
@@ -0,0 +1,405 @@
+// -*- mode:C; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "test/librbd/test_mock_fixture.h"
+#include "test/librbd/test_support.h"
+#include "test/librbd/mock/MockImageCtx.h"
+#include "test/librados_test_stub/MockTestMemIoCtxImpl.h"
+#include "test/librados_test_stub/MockTestMemRadosClient.h"
+#include "common/Cond.h"
+#include "common/Mutex.h"
+#include "librados/AioCompletionImpl.h"
+#include "librbd/ObjectWatcher.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include <list>
+
+namespace librbd {
+
+namespace {
+
+struct MockObjectWatcher : public ObjectWatcher<MockImageCtx> {
+  std::string oid;  // object name reported by get_oid()
+  // Builds the base watcher from the mock image's md_ctx and op_work_queue.
+  MockObjectWatcher(MockImageCtx &mock_image_ctx, const std::string &oid)
+    : ObjectWatcher<MockImageCtx>(mock_image_ctx.md_ctx,
+                                  mock_image_ctx.op_work_queue),
+      oid(oid) {
+  }
+  // Returns the object name supplied at construction.
+  virtual std::string get_oid() const override {
+    return oid;
+  }
+  // Notifications are ignored by this test double (empty body).
+  virtual void handle_notify(uint64_t notify_id, uint64_t handle,
+                             bufferlist &bl) {
+  }
+};
+
+} // anonymous namespace
+
+} // namespace librbd
+
+// template definitions
+#include "librbd/ObjectWatcher.cc"
+template class librbd::ObjectWatcher<librbd::MockImageCtx>;
+
+namespace librbd {
+
+using ::testing::_;
+using ::testing::DoDefault;
+using ::testing::Invoke;
+using ::testing::InSequence;
+using ::testing::Return;
+using ::testing::SaveArg;
+using ::testing::WithArg;
+
+class TestMockObjectWatcher : public TestMockFixture {  // fixture: watch/unwatch expectations plus a call counter
+public:
+  TestMockObjectWatcher() : m_lock("TestMockObjectWatcher::m_lock") {
+  }
+  // Picks a temporary name for m_oid and writes an empty object under it.
+  virtual void SetUp() {
+    TestMockFixture::SetUp();
+
+    m_oid = get_temp_image_name();
+
+    bufferlist bl;
+    ASSERT_EQ(0, m_ioctx.write_full(m_oid, bl));
+  }
+  // Expects one aio_watch on m_oid; runs `action` (if set) first, bumps the counter last.
+  void expect_aio_watch(MockImageCtx &mock_image_ctx, int r,
+                        const std::function<void()> &action = std::function<void()>()) {
+    librados::MockTestMemIoCtxImpl &mock_io_ctx(get_mock_io_ctx(
+      mock_image_ctx.md_ctx));
+    librados::MockTestMemRadosClient *mock_rados_client(
+      mock_io_ctx.get_mock_rados_client());
+
+    auto &expect = EXPECT_CALL(mock_io_ctx, aio_watch(m_oid, _, _, _));
+    if (r < 0) {  // error case: no watch is established, the completion is finished with r
+      expect.WillOnce(DoAll(WithArg<1>(Invoke([this, mock_rados_client, r, action](librados::AioCompletionImpl *c) {
+                                if (action) {
+                                  action();
+                                }
+
+                                c->get();
+                                mock_rados_client->finish_aio_completion(c, r);
+                                notify_watch();
+                              })),
+                            Return(0)));
+    } else {  // success case: remember the WatchCtx2 in m_watch_ctx and forward to do_aio_watch
+      expect.WillOnce(DoAll(SaveArg<3>(&m_watch_ctx),
+                            Invoke([this, &mock_io_ctx, action](const std::string& o,
+                                                                librados::AioCompletionImpl *c,
+                                                                uint64_t *handle,
+                                                                librados::WatchCtx2 *ctx) {
+                                if (action) {
+                                  action();
+                                }
+
+                                mock_io_ctx.do_aio_watch(o, c, handle, ctx);
+                                notify_watch();
+                              }),
+                            Return(0)));
+    }
+  }
+  // Expects one aio_unwatch (any handle); runs `action` (if set) first, bumps the counter last.
+  void expect_aio_unwatch(MockImageCtx &mock_image_ctx, int r,
+                          const std::function<void()> &action = std::function<void()>()) {
+    librados::MockTestMemIoCtxImpl &mock_io_ctx(get_mock_io_ctx(
+      mock_image_ctx.md_ctx));
+
+    auto &expect = EXPECT_CALL(mock_io_ctx, aio_unwatch(_, _));
+    if (r < 0) {  // error case: still unwatch via a dummy completion, then finish the caller's with r
+      expect.WillOnce(DoAll(Invoke([this, &mock_io_ctx, r, action](uint64_t handle,
+                                                                   librados::AioCompletionImpl *c) {
+                                if (action) {
+                                  action();
+                                }
+
+                                librados::AioCompletionImpl *dummy_c = new librados::AioCompletionImpl();
+                                mock_io_ctx.do_aio_unwatch(handle, dummy_c);
+                                ASSERT_EQ(0, dummy_c->wait_for_complete());
+                                dummy_c->release();
+
+                                c->get();
+                                mock_io_ctx.get_mock_rados_client()->finish_aio_completion(c, r);
+                                notify_watch();
+                              }),
+                            Return(0)));
+    } else {  // success case: forward to do_aio_unwatch with the caller's completion
+      expect.WillOnce(DoAll(Invoke([this, &mock_io_ctx, action](uint64_t handle,
+                                                                librados::AioCompletionImpl *c) {
+                                if (action) {
+                                  action();
+                                }
+
+                                mock_io_ctx.do_aio_unwatch(handle, c);
+                                notify_watch();
+                              }),
+                            Return(0)));
+    }
+  }
+
+  std::string m_oid;  // name of the object created in SetUp()
+  librados::WatchCtx2 *m_watch_ctx = nullptr;  // context captured by the last successful aio_watch expectation
+  // Counts one handled watch/unwatch call and wakes any wait_for_watch() caller.
+  void notify_watch() {
+    Mutex::Locker locker(m_lock);
+    ++m_watch_count;
+    m_cond.Signal();
+  }
+  // Blocks until m_watch_count >= count; false if WaitInterval returns non-zero.
+  bool wait_for_watch(MockImageCtx &mock_image_ctx, size_t count) {
+    Mutex::Locker locker(m_lock);
+    while (m_watch_count < count) {
+      if (m_cond.WaitInterval(mock_image_ctx.cct, m_lock,
+                              utime_t(10, 0)) != 0) {  // presumably a 10 second timeout -- confirm utime_t(sec, nsec)
+        return false;
+      }
+    }
+    return true;
+  }
+
+  Mutex m_lock;  // guards m_watch_count
+  Cond m_cond;  // signalled by notify_watch()
+  size_t m_watch_count = 0;  // number of mocked watch/unwatch calls handled so far
+};
+
+TEST_F(TestMockObjectWatcher, Success) {  // register then unregister, both returning 0
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+  MockImageCtx mock_image_ctx(*ictx);
+  MockObjectWatcher mock_image_watcher(mock_image_ctx, m_oid);
+  // Expected librados calls, in order: one watch, one unwatch.
+  InSequence seq;
+  expect_aio_watch(mock_image_ctx, 0);
+  expect_aio_unwatch(mock_image_ctx, 0);
+
+  C_SaferCond register_ctx;
+  mock_image_watcher.register_watch(&register_ctx);
+  ASSERT_EQ(0, register_ctx.wait());
+
+  C_SaferCond unregister_ctx;
+  mock_image_watcher.unregister_watch(&unregister_ctx);
+  ASSERT_EQ(0, unregister_ctx.wait());
+}
+
+TEST_F(TestMockObjectWatcher, RegisterError) {  // a failing aio_watch is reported to the register context
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+  MockImageCtx mock_image_ctx(*ictx);
+  MockObjectWatcher mock_image_watcher(mock_image_ctx, m_oid);
+  // Only the failing watch is expected; no unwatch expectation is set.
+  InSequence seq;
+  expect_aio_watch(mock_image_ctx, -EINVAL);
+
+  C_SaferCond register_ctx;
+  mock_image_watcher.register_watch(&register_ctx);
+  ASSERT_EQ(-EINVAL, register_ctx.wait());
+}
+
+TEST_F(TestMockObjectWatcher, UnregisterError) {  // a failing aio_unwatch is reported to the unregister context
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+  MockImageCtx mock_image_ctx(*ictx);
+  MockObjectWatcher mock_image_watcher(mock_image_ctx, m_oid);
+  // Expected librados calls, in order: successful watch, unwatch completing with -EINVAL.
+  InSequence seq;
+  expect_aio_watch(mock_image_ctx, 0);
+  expect_aio_unwatch(mock_image_ctx, -EINVAL);
+
+  C_SaferCond register_ctx;
+  mock_image_watcher.register_watch(&register_ctx);
+  ASSERT_EQ(0, register_ctx.wait());
+
+  C_SaferCond unregister_ctx;
+  mock_image_watcher.unregister_watch(&unregister_ctx);
+  ASSERT_EQ(-EINVAL, unregister_ctx.wait());
+}
+
+TEST_F(TestMockObjectWatcher, Reregister) {  // a watch error triggers unwatch + re-watch
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+  MockImageCtx mock_image_ctx(*ictx);
+  MockObjectWatcher mock_image_watcher(mock_image_ctx, m_oid);
+
+  expect_op_work_queue(mock_image_ctx);
+  // In order: initial watch, recovery unwatch/watch, unwatch for the final unregister.
+  InSequence seq;
+  expect_aio_watch(mock_image_ctx, 0);
+  expect_aio_unwatch(mock_image_ctx, 0);
+  expect_aio_watch(mock_image_ctx, 0);
+  expect_aio_unwatch(mock_image_ctx, 0);
+
+  C_SaferCond register_ctx;
+  mock_image_watcher.register_watch(&register_ctx);
+  ASSERT_EQ(0, register_ctx.wait());
+  // gtest assertion (not assert()): a missing context fails this test only.
+  ASSERT_TRUE(m_watch_ctx != nullptr);
+  m_watch_ctx->handle_error(0, -ESHUTDOWN);
+
+  // wait for recovery unwatch/watch
+  ASSERT_TRUE(wait_for_watch(mock_image_ctx, 3));
+
+  C_SaferCond unregister_ctx;
+  mock_image_watcher.unregister_watch(&unregister_ctx);
+  ASSERT_EQ(0, unregister_ctx.wait());
+}
+
+TEST_F(TestMockObjectWatcher, ReregisterUnwatchError) {  // recovery unwatch fails, re-watch still happens
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+  MockImageCtx mock_image_ctx(*ictx);
+  MockObjectWatcher mock_image_watcher(mock_image_ctx, m_oid);
+
+  expect_op_work_queue(mock_image_ctx);
+  // In order: initial watch, recovery unwatch (-EINVAL), re-watch, final unwatch.
+  InSequence seq;
+  expect_aio_watch(mock_image_ctx, 0);
+  expect_aio_unwatch(mock_image_ctx, -EINVAL);
+  expect_aio_watch(mock_image_ctx, 0);
+  expect_aio_unwatch(mock_image_ctx, 0);
+
+  C_SaferCond register_ctx;
+  mock_image_watcher.register_watch(&register_ctx);
+  ASSERT_EQ(0, register_ctx.wait());
+  // gtest assertion (not assert()): a missing context fails this test only.
+  ASSERT_TRUE(m_watch_ctx != nullptr);
+  m_watch_ctx->handle_error(0, -ESHUTDOWN);
+
+  // wait for recovery unwatch/watch
+  ASSERT_TRUE(wait_for_watch(mock_image_ctx, 3));
+
+  C_SaferCond unregister_ctx;
+  mock_image_watcher.unregister_watch(&unregister_ctx);
+  ASSERT_EQ(0, unregister_ctx.wait());
+}
+
+TEST_F(TestMockObjectWatcher, ReregisterWatchError) {  // recovery re-watch fails once, then succeeds
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+  MockImageCtx mock_image_ctx(*ictx);
+  MockObjectWatcher mock_image_watcher(mock_image_ctx, m_oid);
+
+  expect_op_work_queue(mock_image_ctx);
+  // In order: initial watch, recovery unwatch, failed re-watch, re-watch, final unwatch.
+  InSequence seq;
+  expect_aio_watch(mock_image_ctx, 0);
+  expect_aio_unwatch(mock_image_ctx, 0);
+  expect_aio_watch(mock_image_ctx, -ESHUTDOWN);
+  expect_aio_watch(mock_image_ctx, 0);
+  expect_aio_unwatch(mock_image_ctx, 0);
+
+  C_SaferCond register_ctx;
+  mock_image_watcher.register_watch(&register_ctx);
+  ASSERT_EQ(0, register_ctx.wait());
+  // gtest assertion (not assert()): a missing context fails this test only.
+  ASSERT_TRUE(m_watch_ctx != nullptr);
+  m_watch_ctx->handle_error(0, -ESHUTDOWN);
+
+  // wait for recovery unwatch/watch
+  ASSERT_TRUE(wait_for_watch(mock_image_ctx, 4));
+
+  C_SaferCond unregister_ctx;
+  mock_image_watcher.unregister_watch(&unregister_ctx);
+  ASSERT_EQ(0, unregister_ctx.wait());
+}
+
+TEST_F(TestMockObjectWatcher, ReregisterUnwatchPendingUnregister) {  // unregister arrives during the recovery unwatch
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+  MockImageCtx mock_image_ctx(*ictx);
+  MockObjectWatcher mock_image_watcher(mock_image_ctx, m_oid);
+
+  expect_op_work_queue(mock_image_ctx);
+  // In order: initial watch, then a single unwatch; no re-watch is expected.
+  InSequence seq;
+  expect_aio_watch(mock_image_ctx, 0);
+
+  // inject an unregister
+  C_SaferCond unregister_ctx;
+  expect_aio_unwatch(mock_image_ctx, 0, [&mock_image_watcher, &unregister_ctx]() {
+      mock_image_watcher.unregister_watch(&unregister_ctx);
+    });
+
+  C_SaferCond register_ctx;
+  mock_image_watcher.register_watch(&register_ctx);
+  ASSERT_EQ(0, register_ctx.wait());
+  // gtest assertion (not assert()): a missing context fails this test only.
+  ASSERT_TRUE(m_watch_ctx != nullptr);
+  m_watch_ctx->handle_error(0, -ESHUTDOWN);
+
+  ASSERT_EQ(0, unregister_ctx.wait());
+}
+
+TEST_F(TestMockObjectWatcher, ReregisterWatchPendingUnregister) {  // unregister arrives during a failing re-watch
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+  MockImageCtx mock_image_ctx(*ictx);
+  MockObjectWatcher mock_image_watcher(mock_image_ctx, m_oid);
+
+  expect_op_work_queue(mock_image_ctx);
+  // In order: initial watch, recovery unwatch, re-watch failing with -ESHUTDOWN.
+  InSequence seq;
+  expect_aio_watch(mock_image_ctx, 0);
+  expect_aio_unwatch(mock_image_ctx, 0);
+
+  // inject an unregister
+  C_SaferCond unregister_ctx;
+  expect_aio_watch(mock_image_ctx, -ESHUTDOWN,
+                   [&mock_image_watcher, &unregister_ctx]() {
+      mock_image_watcher.unregister_watch(&unregister_ctx);
+    });
+
+  C_SaferCond register_ctx;
+  mock_image_watcher.register_watch(&register_ctx);
+  ASSERT_EQ(0, register_ctx.wait());
+  // gtest assertion (not assert()): a missing context fails this test only.
+  ASSERT_TRUE(m_watch_ctx != nullptr);
+  m_watch_ctx->handle_error(0, -ESHUTDOWN);
+
+  ASSERT_EQ(0, unregister_ctx.wait());
+}
+
+TEST_F(TestMockObjectWatcher, ReregisterPendingUnregister) {  // unregister arrives during a successful re-watch
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+  MockImageCtx mock_image_ctx(*ictx);
+  MockObjectWatcher mock_image_watcher(mock_image_ctx, m_oid);
+
+  expect_op_work_queue(mock_image_ctx);
+  // In order: initial watch, recovery unwatch, re-watch, then one more unwatch.
+  InSequence seq;
+  expect_aio_watch(mock_image_ctx, 0);
+  expect_aio_unwatch(mock_image_ctx, 0);
+
+  // inject an unregister
+  C_SaferCond unregister_ctx;
+  expect_aio_watch(mock_image_ctx, 0,
+                   [&mock_image_watcher, &unregister_ctx]() {
+      mock_image_watcher.unregister_watch(&unregister_ctx);
+    });
+
+  expect_aio_unwatch(mock_image_ctx, 0);
+
+  C_SaferCond register_ctx;
+  mock_image_watcher.register_watch(&register_ctx);
+  ASSERT_EQ(0, register_ctx.wait());
+  // gtest assertion (not assert()): a missing context fails this test only.
+  ASSERT_TRUE(m_watch_ctx != nullptr);
+  m_watch_ctx->handle_error(0, -ESHUTDOWN);
+
+  ASSERT_EQ(0, unregister_ctx.wait());
+}
+
+} // namespace librbd
diff --git a/src/test/librgw_file_aw.cc b/src/test/librgw_file_aw.cc
index 2ae73b8..2c047a3 100644
--- a/src/test/librgw_file_aw.cc
+++ b/src/test/librgw_file_aw.cc
@@ -32,11 +32,15 @@
 
 namespace {
   librgw_t rgw = nullptr;
-  string uid("testuser");
+  string userid("testuser");
   string access_key("");
   string secret_key("");
   struct rgw_fs *fs = nullptr;
 
+  uint32_t owner_uid = 867;
+  uint32_t owner_gid = 5309;
+  uint32_t create_mask = RGW_SETATTR_UID | RGW_SETATTR_GID | RGW_SETATTR_MODE;
+
   bool do_create = false;
   bool do_delete = false;
   bool do_verify = false;
@@ -169,8 +173,8 @@ TEST(LibRGW, INIT) {
 }
 
 TEST(LibRGW, MOUNT) {
-  int ret = rgw_mount(rgw, uid.c_str(), access_key.c_str(), secret_key.c_str(),
-		      &fs, RGW_MOUNT_FLAG_NONE);
+  int ret = rgw_mount(rgw, userid.c_str(), access_key.c_str(),
+		      secret_key.c_str(), &fs, RGW_MOUNT_FLAG_NONE);
   ASSERT_EQ(ret, 0);
   ASSERT_NE(fs, nullptr);
 }
@@ -179,8 +183,13 @@ TEST(LibRGW, CREATE_BUCKET) {
   if (do_create) {
     struct stat st;
     struct rgw_file_handle *fh;
-    int ret = rgw_mkdir(fs, fs->root_fh, bucket_name.c_str(), 755, &st, &fh,
-			RGW_MKDIR_FLAG_NONE);
+
+    st.st_uid = owner_uid;
+    st.st_gid = owner_gid;
+    st.st_mode = 755;
+
+    int ret = rgw_mkdir(fs, fs->root_fh, bucket_name.c_str(), &st, create_mask,
+			&fh, RGW_MKDIR_FLAG_NONE);
     ASSERT_EQ(ret, 0);
   }
 }
@@ -314,12 +323,18 @@ int main(int argc, char *argv[])
     } else if (ceph_argparse_witharg(args, arg_iter, &val, "--secret",
 				     (char*) nullptr)) {
       secret_key = val;
-    } else if (ceph_argparse_witharg(args, arg_iter, &val, "--uid",
+    } else if (ceph_argparse_witharg(args, arg_iter, &val, "--userid",
 				     (char*) nullptr)) {
-      uid = val;
+      userid = val;
     } else if (ceph_argparse_witharg(args, arg_iter, &val, "--bn",
 				     (char*) nullptr)) {
       bucket_name = val;
+    } else if (ceph_argparse_witharg(args, arg_iter, &val, "--uid",
+				     (char*) nullptr)) {
+      owner_uid = std::stoi(val);
+    } else if (ceph_argparse_witharg(args, arg_iter, &val, "--gid",
+				     (char*) nullptr)) {
+      owner_gid = std::stoi(val);
     } else if (ceph_argparse_flag(args, arg_iter, "--verify",
 					    (char*) nullptr)) {
       do_verify = true;
diff --git a/src/test/librgw_file_cd.cc b/src/test/librgw_file_cd.cc
index 4ec671f..0dd2a4d 100644
--- a/src/test/librgw_file_cd.cc
+++ b/src/test/librgw_file_cd.cc
@@ -28,11 +28,15 @@
 
 namespace {
   librgw_t rgw = nullptr;
-  string uid("testuser");
+  string userid("testuser");
   string access_key("");
   string secret_key("");
   struct rgw_fs *fs = nullptr;
 
+  uint32_t owner_uid = 867;
+  uint32_t owner_gid = 5309;
+  uint32_t create_mask = RGW_SETATTR_UID | RGW_SETATTR_GID | RGW_SETATTR_MODE;
+
   bool do_create = false;
   bool do_delete = false;
   bool do_multi = false;
@@ -53,8 +57,8 @@ TEST(LibRGW, INIT) {
 }
 
 TEST(LibRGW, MOUNT) {
-  int ret = rgw_mount(rgw, uid.c_str(), access_key.c_str(), secret_key.c_str(),
-		      &fs, RGW_MOUNT_FLAG_NONE);
+  int ret = rgw_mount(rgw, userid.c_str(), access_key.c_str(),
+		      secret_key.c_str(), &fs, RGW_MOUNT_FLAG_NONE);
   ASSERT_EQ(ret, 0);
   ASSERT_NE(fs, nullptr);
 }
@@ -63,8 +67,13 @@ TEST(LibRGW, CREATE_BUCKET) {
   if (do_create) {
     struct stat st;
     struct rgw_file_handle *fh;
-    int ret = rgw_mkdir(fs, fs->root_fh, bucket_name.c_str(), 755, &st, &fh,
-			RGW_MKDIR_FLAG_NONE);
+
+    st.st_uid = owner_uid;
+    st.st_gid = owner_gid;
+    st.st_mode = 755;
+
+    int ret = rgw_mkdir(fs, fs->root_fh, bucket_name.c_str(), &st, create_mask,
+			&fh, RGW_MKDIR_FLAG_NONE);
     ASSERT_EQ(ret, 0);
   }
 }
@@ -82,10 +91,15 @@ TEST(LibRGW, CREATE_BUCKET_MULTI) {
     int ret;
     struct stat st;
     struct rgw_file_handle *fh;
+
+    st.st_uid = owner_uid;
+    st.st_gid = owner_gid;
+    st.st_mode = 755;
+
     for (int ix = 0; ix < multi_cnt; ++ix) {
       string bn = bucket_name;
       bn += to_string(ix);
-      ret = rgw_mkdir(fs, fs->root_fh, bn.c_str(), 755, &st, &fh,
+      ret = rgw_mkdir(fs, fs->root_fh, bn.c_str(), &st, create_mask, &fh,
 		      RGW_MKDIR_FLAG_NONE);
       ASSERT_EQ(ret, 0);
       std::cout << "created: " << bn << std::endl;
@@ -147,12 +161,18 @@ int main(int argc, char *argv[])
     } else if (ceph_argparse_witharg(args, arg_iter, &val, "--secret",
 				     (char*) nullptr)) {
       secret_key = val;
-    } else if (ceph_argparse_witharg(args, arg_iter, &val, "--uid",
+    } else if (ceph_argparse_witharg(args, arg_iter, &val, "--userid",
 				     (char*) nullptr)) {
-      uid = val;
+      userid = val;
     } else if (ceph_argparse_witharg(args, arg_iter, &val, "--bn",
 				     (char*) nullptr)) {
       bucket_name = val;
+    } else if (ceph_argparse_witharg(args, arg_iter, &val, "--uid",
+				     (char*) nullptr)) {
+      owner_uid = std::stoi(val);
+    } else if (ceph_argparse_witharg(args, arg_iter, &val, "--gid",
+				     (char*) nullptr)) {
+      owner_gid = std::stoi(val);
     } else if (ceph_argparse_flag(args, arg_iter, "--create",
 					    (char*) nullptr)) {
       do_create = true;
diff --git a/src/test/librgw_file_nfsns.cc b/src/test/librgw_file_nfsns.cc
index 49f0ff5..b838cb1 100644
--- a/src/test/librgw_file_nfsns.cc
+++ b/src/test/librgw_file_nfsns.cc
@@ -38,12 +38,16 @@ namespace {
   using std::string;
 
   librgw_t rgw_h = nullptr;
-  string uid("testuser");
+  string userid("testuser");
   string access_key("");
   string secret_key("");
   struct rgw_fs *fs = nullptr;
   CephContext* cct = nullptr;
 
+  uint32_t owner_uid = 867;
+  uint32_t owner_gid = 5309;
+  uint32_t create_mask = RGW_SETATTR_UID | RGW_SETATTR_GID | RGW_SETATTR_MODE;
+
   string bucket_name("nfsroot");
   string dirs1_bucket_name("bdirs1");
   string readf_name("toyland");
@@ -175,7 +179,7 @@ TEST(LibRGW, INIT) {
 }
 
 TEST(LibRGW, MOUNT) {
-  int ret = rgw_mount(rgw_h, uid.c_str(), access_key.c_str(),
+  int ret = rgw_mount(rgw_h, userid.c_str(), access_key.c_str(),
 		      secret_key.c_str(), &fs, RGW_MOUNT_FLAG_NONE);
   ASSERT_EQ(ret, 0);
   ASSERT_NE(fs, nullptr);
@@ -191,8 +195,13 @@ TEST(LibRGW, SETUP_HIER1)
     if (! bucket_fh) {
       if (do_create) {
 	struct stat st;
-	int rc = rgw_mkdir(fs, fs->root_fh, bucket_name.c_str(), 755, &st,
-			   &bucket_fh, RGW_MKDIR_FLAG_NONE);
+
+	st.st_uid = owner_uid;
+	st.st_gid = owner_gid;
+	st.st_mode = 755;
+
+	int rc = rgw_mkdir(fs, fs->root_fh, bucket_name.c_str(), &st,
+			  create_mask, &bucket_fh, RGW_MKDIR_FLAG_NONE);
 	ASSERT_EQ(rc, 0);
       }
     }
@@ -239,6 +248,10 @@ TEST(LibRGW, SETUP_DIRS1) {
     int rc;
     struct stat st;
 
+    st.st_uid = owner_uid;
+    st.st_gid = owner_gid;
+    st.st_mode = 755;
+
     dirs1_b.parent_fh = fs->root_fh;
 
     (void) rgw_lookup(fs, dirs1_b.parent_fh, dirs1_bucket_name.c_str(),
@@ -246,8 +259,8 @@ TEST(LibRGW, SETUP_DIRS1) {
 
     if (! dirs1_b.fh) {
       if (do_create) {
-	rc = rgw_mkdir(fs, dirs1_b.parent_fh, dirs1_b.name.c_str(), 755, &st,
-		       &dirs1_b.fh, RGW_MKDIR_FLAG_NONE);
+	rc = rgw_mkdir(fs, dirs1_b.parent_fh, dirs1_b.name.c_str(), &st,
+		      create_mask, &dirs1_b.fh, RGW_MKDIR_FLAG_NONE);
 	ASSERT_EQ(rc, 0);
       }
     }
@@ -266,7 +279,7 @@ TEST(LibRGW, SETUP_DIRS1) {
 			RGW_LOOKUP_FLAG_NONE);
       if (! dir.fh) {
 	if (do_create) {
-	  rc = rgw_mkdir(fs, dir.parent_fh, dir.name.c_str(), 755, &st,
+	  rc = rgw_mkdir(fs, dir.parent_fh, dir.name.c_str(), &st, create_mask,
 			 &dir.fh, RGW_MKDIR_FLAG_NONE);
 	  ASSERT_EQ(rc, 0);
 	}
@@ -289,8 +302,8 @@ TEST(LibRGW, SETUP_DIRS1) {
 
 	if (! sdir.fh) {
 	  if (do_create) {
-	    rc = rgw_mkdir(fs, sdir.parent_fh, sdir.name.c_str(), 755,
-			   &st, &sdir.fh, RGW_MKDIR_FLAG_NONE);
+	    rc = rgw_mkdir(fs, sdir.parent_fh, sdir.name.c_str(), &st,
+			  create_mask, &sdir.fh, RGW_MKDIR_FLAG_NONE);
 	    ASSERT_EQ(rc, 0);
 	  }
 	}
@@ -352,6 +365,11 @@ TEST(LibRGW, RGW_CREATE_DIRS1) {
     if (do_create) {
       int rc;
       struct stat st;
+
+      st.st_uid = owner_uid;
+      st.st_gid = owner_gid;
+      st.st_mode = 644;
+
       for (auto& dirs_rec : dirs_vec) {
 	/* create 1 more file in each sdir */
 	obj_rec& dir = get<0>(dirs_rec);
@@ -360,8 +378,8 @@ TEST(LibRGW, RGW_CREATE_DIRS1) {
 	(void) rgw_lookup(fs, sf.parent_fh, sf.name.c_str(), &sf.fh,
 			  RGW_LOOKUP_FLAG_NONE);
 	if (! sf.fh) {
-	  rc = rgw_create(fs, sf.parent_fh, sf.name.c_str(), 644, &st, &sf.fh,
-			  RGW_CREATE_FLAG_NONE);
+	  rc = rgw_create(fs, sf.parent_fh, sf.name.c_str(), &st, create_mask,
+			  &sf.fh, RGW_CREATE_FLAG_NONE);
 	  ASSERT_EQ(rc, 0);
 	}
 	sf.sync();
@@ -377,6 +395,11 @@ TEST(LibRGW, RGW_SETUP_RENAME1) {
     int rc;
     struct stat st;
     obj_vec ovec;
+
+    st.st_uid = owner_uid;
+    st.st_gid = owner_gid;
+    st.st_mode = 755;
+
     for (int b_ix : {0, 1}) {
       std::string bname{"brename_" + to_string(b_ix)};
       obj_rec brec{bname, nullptr, nullptr, nullptr};
@@ -385,13 +408,16 @@ TEST(LibRGW, RGW_SETUP_RENAME1) {
       if (! brec.fh) {
 	if (do_create) {
 	  struct stat st;
-	  int rc = rgw_mkdir(fs, fs->root_fh, brec.name.c_str(), 755, &st,
-			     &brec.fh, RGW_MKDIR_FLAG_NONE);
+	  int rc = rgw_mkdir(fs, fs->root_fh, brec.name.c_str(), &st,
+			    create_mask, &brec.fh, RGW_MKDIR_FLAG_NONE);
 	  ASSERT_EQ(rc, 0);
 	}
       }
       ASSERT_NE(brec.fh, nullptr);
       brec.sync();
+
+      st.st_mode = 644; /* file mask */
+
       for (int f_ix : {0, 1}) {
 	std::string rfname{"rfile_"};
 	rfname += to_string(f_ix);
@@ -399,8 +425,8 @@ TEST(LibRGW, RGW_SETUP_RENAME1) {
 	(void) rgw_lookup(fs, rf.parent_fh, rf.name.c_str(), &rf.fh,
 			  RGW_LOOKUP_FLAG_NONE);
 	if (! rf.fh) {
-	  rc = rgw_create(fs, rf.parent_fh, rf.name.c_str(), 644, &st, &rf.fh,
-			  RGW_CREATE_FLAG_NONE);
+	  rc = rgw_create(fs, rf.parent_fh, rf.name.c_str(), &st, create_mask,
+			  &rf.fh, RGW_CREATE_FLAG_NONE);
 	  ASSERT_EQ(rc, 0);
 	}
 	rf.sync();
@@ -796,9 +822,13 @@ TEST(LibRGW, MARKER1_SETUP_BUCKET) {
     struct stat st;
     int ret;
 
+    st.st_uid = owner_uid;
+    st.st_gid = owner_gid;
+    st.st_mode = 755;
+
     if (do_create) {
-      ret = rgw_mkdir(fs, bucket_fh, marker_dir.c_str(), 755, &st, &marker_fh,
-		      RGW_MKDIR_FLAG_NONE);
+      ret = rgw_mkdir(fs, bucket_fh, marker_dir.c_str(), &st, create_mask,
+		      &marker_fh, RGW_MKDIR_FLAG_NONE);
     } else {
       ret = rgw_lookup(fs, bucket_fh, marker_dir.c_str(), &marker_fh,
 		       RGW_LOOKUP_FLAG_NONE);
@@ -966,12 +996,18 @@ int main(int argc, char *argv[])
     } else if (ceph_argparse_witharg(args, arg_iter, &val, "--secret",
 				     (char*) nullptr)) {
       secret_key = val;
-    } else if (ceph_argparse_witharg(args, arg_iter, &val, "--uid",
+    } else if (ceph_argparse_witharg(args, arg_iter, &val, "--userid",
 				     (char*) nullptr)) {
-      uid = val;
+      userid = val;
     } else if (ceph_argparse_witharg(args, arg_iter, &val, "--bn",
 				     (char*) nullptr)) {
       bucket_name = val;
+    } else if (ceph_argparse_witharg(args, arg_iter, &val, "--uid",
+				     (char*) nullptr)) {
+      owner_uid = std::stoi(val);
+    } else if (ceph_argparse_witharg(args, arg_iter, &val, "--gid",
+				     (char*) nullptr)) {
+      owner_gid = std::stoi(val);
     } else if (ceph_argparse_flag(args, arg_iter, "--hier1",
 					    (char*) nullptr)) {
       do_hier1 = true;
diff --git a/src/test/mon/test_pool_quota.sh b/src/test/mon/test_pool_quota.sh
new file mode 100755
index 0000000..fd3879d
--- /dev/null
+++ b/src/test/mon/test_pool_quota.sh
@@ -0,0 +1,61 @@
+#!/bin/bash 
+
+#
+# Generic pool quota test
+#
+
+# Includes
+source ../qa/workunits/ceph-helpers.sh
+
+function run() {  # test driver: set up CEPH_ARGS, then run the selected TEST_* functions
+    local dir=$1  # first argument; handed to every test function below
+    shift
+    # Append --fsid (fresh uuidgen), --auth-supported=none and --mon-host to the exported CEPH_ARGS.
+    export CEPH_MON="127.0.0.1:17108"
+    export CEPH_ARGS
+    CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
+    CEPH_ARGS+="--mon-host=$CEPH_MON "
+    # Remaining arguments name the tests; default is every TEST_* function listed by `set`.
+    local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
+    for func in $funcs ; do
+        $func $dir || return 1  # stop at the first failing test
+    done
+}
+
+function TEST_pool_quota() {
+    # Verify that `ceph df detail` shows N/A quotas for a fresh pool and the
+    # configured values after `ceph osd pool set-quota`. $1: test directory.
+    local dir=$1
+    setup "$dir" || return 1
+    run_mon "$dir" a || return 1
+    run_osd "$dir" 0 || return 1
+    run_osd "$dir" 1 || return 1
+    run_osd "$dir" 2 || return 1
+
+    local poolname=testquoa
+    ceph osd pool create "$poolname" 20 || return 1
+    # Fields 4 and 5 of the pool's row are checked as object/byte quota.
+    local objects bytes
+    objects=$(ceph df detail | grep -w "$poolname" | awk '{print $4}')
+    bytes=$(ceph df detail | grep -w "$poolname" | awk '{print $5}')
+    echo "$objects"
+    echo "$bytes"
+    # Quoted: an empty value must fail the comparison; unquoted it made `[`
+    # error out (status 2), so the check was skipped and the test passed.
+    if [ "$objects" != 'N/A' ] || [ "$bytes" != 'N/A' ]; then
+        return 1
+    fi
+
+    ceph osd pool set-quota "$poolname" max_objects 1000 || return 1
+    ceph osd pool set-quota "$poolname" max_bytes 1024 || return 1
+    objects=$(ceph df detail | grep -w "$poolname" | awk '{print $4}')
+    bytes=$(ceph df detail | grep -w "$poolname" | awk '{print $5}')
+    if [ "$objects" != '1000' ] || [ "$bytes" != '1024' ]; then
+        return 1
+    fi
+
+    ceph osd pool delete "$poolname" "$poolname" --yes-i-really-really-mean-it
+    teardown "$dir" || return 1
+}
+
+main testpoolquota
diff --git a/src/test/msgr/perf_msgr_client.cc b/src/test/msgr/perf_msgr_client.cc
index fa21b49..1cb3db2 100644
--- a/src/test/msgr/perf_msgr_client.cc
+++ b/src/test/msgr/perf_msgr_client.cc
@@ -196,6 +196,7 @@ int main(int argc, char **argv)
   cerr << "       message data bytes " << len << std::endl;
   MessengerClient client(g_ceph_context->_conf->ms_type, args[0], think_time);
   client.ready(concurrent, numjobs, ios, len);
+  Cycles::init();
   uint64_t start = Cycles::rdtsc();
   client.start();
   uint64_t stop = Cycles::rdtsc();
diff --git a/src/test/msgr/perf_msgr_server.cc b/src/test/msgr/perf_msgr_server.cc
index 3eae9a5..dde679c 100644
--- a/src/test/msgr/perf_msgr_server.cc
+++ b/src/test/msgr/perf_msgr_server.cc
@@ -25,7 +25,6 @@ using namespace std;
 #include "include/atomic.h"
 #include "common/ceph_argparse.h"
 #include "common/debug.h"
-#include "common/Cycles.h"
 #include "global/global_init.h"
 #include "msg/Messenger.h"
 #include "messages/MOSDOp.h"
diff --git a/src/test/objectstore/ObjectStoreTransactionBenchmark.cc b/src/test/objectstore/ObjectStoreTransactionBenchmark.cc
index 7c0dc09..46fd25d 100644
--- a/src/test/objectstore/ObjectStoreTransactionBenchmark.cc
+++ b/src/test/objectstore/ObjectStoreTransactionBenchmark.cc
@@ -249,6 +249,7 @@ int main(int argc, char **argv)
   global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, CODE_ENVIRONMENT_UTILITY, 0);
   common_init_finish(g_ceph_context);
   g_ceph_context->_conf->apply_changes(NULL);
+  Cycles::init();
 
   cerr << "args: " << args << std::endl;
   if (args.size() < 1) {
diff --git a/src/test/objectstore/store_test.cc b/src/test/objectstore/store_test.cc
index e1f01cd..9a5de2e 100644
--- a/src/test/objectstore/store_test.cc
+++ b/src/test/objectstore/store_test.cc
@@ -37,6 +37,20 @@ typedef boost::mt11213b gen_type;
 
 #if GTEST_HAS_PARAM_TEST
 
+template <typename T>  // T: pointer-like handle to the store (dereferenced with ->)
+int apply_transaction(
+  T &store,
+  ObjectStore::Sequencer *osr,
+  ObjectStore::Transaction &&t) {
+  if (rand() % 2) {  // coin flip between the two submission paths; presumably to exercise append() -- confirm
+    ObjectStore::Transaction t2;
+    t2.append(t);
+    return store->apply_transaction(osr, std::move(t2));  // submit t2, built via append(t)
+  } else {
+    return store->apply_transaction(osr, std::move(t));  // submit t as given
+  }
+}
+
 class StoreTest : public ::testing::TestWithParam<const char*> {
 public:
   boost::scoped_ptr<ObjectStore> store;
@@ -129,7 +143,7 @@ TEST_P(StoreTest, SimpleRemount) {
     ObjectStore::Transaction t;
     t.create_collection(cid, 0);
     t.write(cid, hoid, 0, bl.length(), bl);
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
   store->umount();
@@ -138,7 +152,7 @@ TEST_P(StoreTest, SimpleRemount) {
   {
     ObjectStore::Transaction t;
     t.write(cid, hoid2, 0, bl.length(), bl);
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
   {
@@ -147,7 +161,7 @@ TEST_P(StoreTest, SimpleRemount) {
     t.remove(cid, hoid2);
     t.remove_collection(cid);
     cerr << "remove collection" << std::endl;
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
   store->umount();
@@ -156,7 +170,7 @@ TEST_P(StoreTest, SimpleRemount) {
   {
     ObjectStore::Transaction t;
     t.create_collection(cid, 0);
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
     bool exists = store->exists(cid, hoid);
     ASSERT_TRUE(!exists);
@@ -165,7 +179,7 @@ TEST_P(StoreTest, SimpleRemount) {
     ObjectStore::Transaction t;
     t.remove_collection(cid);
     cerr << "remove collection" << std::endl;
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
 }
@@ -184,7 +198,7 @@ TEST_P(StoreTest, IORemount) {
       ghobject_t hoid(hobject_t(sobject_t("Object " + stringify(n), CEPH_NOSNAP)));
       t.write(cid, hoid, 0, bl.length(), bl);
     }
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
   // overwrites
@@ -194,7 +208,7 @@ TEST_P(StoreTest, IORemount) {
       ObjectStore::Transaction t;
       ghobject_t hoid(hobject_t(sobject_t("Object " + stringify(n), CEPH_NOSNAP)));
       t.write(cid, hoid, 1, bl.length(), bl);
-      r = store->apply_transaction(&osr, std::move(t));
+      r = apply_transaction(store, &osr, std::move(t));
       ASSERT_EQ(r, 0);
     }
   }
@@ -208,7 +222,7 @@ TEST_P(StoreTest, IORemount) {
       t.remove(cid, hoid);
     }
     t.remove_collection(cid);
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
 }
@@ -223,7 +237,7 @@ TEST_P(StoreTest, FiemapEmpty) {
     t.create_collection(cid, 0);
     t.touch(cid, oid);
     t.truncate(cid, oid, 100000);
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
   {
@@ -241,7 +255,7 @@ TEST_P(StoreTest, FiemapEmpty) {
     t.remove(cid, oid);
     t.remove_collection(cid);
     cerr << "remove collection" << std::endl;
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
 }
@@ -260,7 +274,7 @@ TEST_P(StoreTest, FiemapHoles) {
     t.write(cid, oid, 0, 3, bl);
     t.write(cid, oid, 1048576, 3, bl);
     t.write(cid, oid, 4194304, 3, bl);
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
   {
@@ -284,7 +298,7 @@ TEST_P(StoreTest, FiemapHoles) {
     t.remove(cid, oid);
     t.remove_collection(cid);
     cerr << "remove collection" << std::endl;
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
 }
@@ -297,28 +311,28 @@ TEST_P(StoreTest, SimpleMetaColTest) {
     ObjectStore::Transaction t;
     t.create_collection(cid, 0);
     cerr << "create collection" << std::endl;
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
   {
     ObjectStore::Transaction t;
     t.remove_collection(cid);
     cerr << "remove collection" << std::endl;
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
   {
     ObjectStore::Transaction t;
     t.create_collection(cid, 0);
     cerr << "add collection" << std::endl;
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
   {
     ObjectStore::Transaction t;
     t.remove_collection(cid);
     cerr << "remove collection" << std::endl;
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
 }
@@ -331,28 +345,28 @@ TEST_P(StoreTest, SimplePGColTest) {
     ObjectStore::Transaction t;
     t.create_collection(cid, 4);
     cerr << "create collection" << std::endl;
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
   {
     ObjectStore::Transaction t;
     t.remove_collection(cid);
     cerr << "remove collection" << std::endl;
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
   {
     ObjectStore::Transaction t;
     t.create_collection(cid, 4);
     cerr << "add collection" << std::endl;
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
   {
     ObjectStore::Transaction t;
     t.remove_collection(cid);
     cerr << "remove collection" << std::endl;
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
 }
@@ -390,7 +404,7 @@ TEST_P(StoreTest, SimpleColPreHashTest) {
     ::encode(expected_num_objs, hint);
     t.collection_hint(cid, ObjectStore::Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS, hint);
     cerr << "collection hint" << std::endl;
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
   {
@@ -398,7 +412,7 @@ TEST_P(StoreTest, SimpleColPreHashTest) {
     ObjectStore::Transaction t;
     t.remove_collection(cid);
     cerr << "remove collection" << std::endl;
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
   // Revert the config change so that it does not affect the split/merge tests
@@ -423,7 +437,7 @@ TEST_P(StoreTest, SimpleObjectTest) {
     ObjectStore::Transaction t;
     t.create_collection(cid, 0);
     cerr << "Creating collection " << cid << std::endl;
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
   {
@@ -433,7 +447,7 @@ TEST_P(StoreTest, SimpleObjectTest) {
     ObjectStore::Transaction t;
     t.touch(cid, hoid);
     cerr << "Creating object " << hoid << std::endl;
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
 
     exists = store->exists(cid, hoid);
@@ -444,7 +458,7 @@ TEST_P(StoreTest, SimpleObjectTest) {
     t.remove(cid, hoid);
     t.touch(cid, hoid);
     cerr << "Remove then create" << std::endl;
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
   {
@@ -455,7 +469,7 @@ TEST_P(StoreTest, SimpleObjectTest) {
     t.remove(cid, hoid);
     t.write(cid, hoid, 0, 5, bl);
     cerr << "Remove then create" << std::endl;
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
 
     bufferlist in;
@@ -471,7 +485,7 @@ TEST_P(StoreTest, SimpleObjectTest) {
     exp.append(bl);
     t.write(cid, hoid, 5, 5, bl);
     cerr << "Append" << std::endl;
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
 
     bufferlist in;
@@ -486,7 +500,7 @@ TEST_P(StoreTest, SimpleObjectTest) {
     exp = bl;
     t.write(cid, hoid, 0, 10, bl);
     cerr << "Full overwrite" << std::endl;
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
 
     bufferlist in;
@@ -500,7 +514,7 @@ TEST_P(StoreTest, SimpleObjectTest) {
     bl.append("abcde");
     t.write(cid, hoid, 3, 5, bl);
     cerr << "Partial overwrite" << std::endl;
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
 
     bufferlist in, exp;
@@ -516,7 +530,7 @@ TEST_P(StoreTest, SimpleObjectTest) {
     bl.append("abcde01234012340123401234abcde01234012340123401234abcde01234012340123401234abcde01234012340123401234");
     t.write(cid, hoid, 0, bl.length(), bl);
     cerr << "larger overwrite" << std::endl;
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
 
     bufferlist in;
@@ -541,7 +555,7 @@ TEST_P(StoreTest, SimpleObjectTest) {
     t.remove(cid, hoid);
     t.remove_collection(cid);
     cerr << "Cleaning" << std::endl;
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
 }
@@ -556,7 +570,7 @@ TEST_P(StoreTest, ManySmallWrite) {
     ObjectStore::Transaction t;
     t.create_collection(cid, 0);
     cerr << "Creating collection " << cid << std::endl;
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
   bufferlist bl;
@@ -566,13 +580,13 @@ TEST_P(StoreTest, ManySmallWrite) {
   for (int i=0; i<100; ++i) {
     ObjectStore::Transaction t;
     t.write(cid, a, i*4096, 4096, bl, 0);
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
   for (int i=0; i<100; ++i) {
     ObjectStore::Transaction t;
     t.write(cid, b, (rand() % 1024)*4096, 4096, bl, 0);
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
   {
@@ -581,7 +595,7 @@ TEST_P(StoreTest, ManySmallWrite) {
     t.remove(cid, b);
     t.remove_collection(cid);
     cerr << "Cleaning" << std::endl;
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
 }
@@ -595,14 +609,14 @@ TEST_P(StoreTest, SmallSkipFront) {
     ObjectStore::Transaction t;
     t.create_collection(cid, 0);
     cerr << "Creating collection " << cid << std::endl;
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
   {
     ObjectStore::Transaction t;
     t.touch(cid, a);
     t.truncate(cid, a, 3000);
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
   {
@@ -612,7 +626,7 @@ TEST_P(StoreTest, SmallSkipFront) {
     bl.append(bp);
     ObjectStore::Transaction t;
     t.write(cid, a, 4096, 4096, bl);
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
   {
@@ -628,6 +642,163 @@ TEST_P(StoreTest, SmallSkipFront) {
     t.remove(cid, a);
     t.remove_collection(cid);
     cerr << "Cleaning" << std::endl;
+    r = apply_transaction(store, &osr, std::move(t));
+    ASSERT_EQ(r, 0);
+  }
+}
+
+TEST_P(StoreTest, AppendWalVsTailCache) {
+  ObjectStore::Sequencer osr("test");
+  int r;
+  coll_t cid;
+  ghobject_t a(hobject_t(sobject_t("fooo", CEPH_NOSNAP)));
+  {
+    ObjectStore::Transaction t;
+    t.create_collection(cid, 0);
+    cerr << "Creating collection " << cid << std::endl;
+    r = store->apply_transaction(&osr, std::move(t));
+    ASSERT_EQ(r, 0);
+  }
+  unsigned min_alloc = g_conf->bluestore_min_alloc_size;
+  g_conf->set_val("bluestore_inject_wal_apply_delay", "1.0");
+  g_ceph_context->_conf->apply_changes(NULL);
+  unsigned size = min_alloc / 3;
+  bufferptr bpa(size);
+  memset(bpa.c_str(), 1, bpa.length());
+  bufferlist bla;
+  bla.append(bpa);
+  {
+    ObjectStore::Transaction t;
+    t.write(cid, a, 0, bla.length(), bla, 0);
+    r = store->apply_transaction(&osr, std::move(t));
+    ASSERT_EQ(r, 0);
+  }
+
+  // force cached tail to clear ...
+  {
+    store->umount();
+    int r = store->mount();
+    ASSERT_EQ(0, r);
+  }
+
+  bufferptr bpb(size);
+  memset(bpb.c_str(), 2, bpb.length());
+  bufferlist blb;
+  blb.append(bpb);
+  {
+    ObjectStore::Transaction t;
+    t.write(cid, a, bla.length(), blb.length(), blb, 0);
+    r = store->apply_transaction(&osr, std::move(t));
+    ASSERT_EQ(r, 0);
+  }
+  bufferptr bpc(size);
+  memset(bpc.c_str(), 3, bpc.length());
+  bufferlist blc;
+  blc.append(bpc);
+  {
+    ObjectStore::Transaction t;
+    t.write(cid, a, bla.length() + blb.length(), blc.length(), blc, 0);
+    r = store->apply_transaction(&osr, std::move(t));
+    ASSERT_EQ(r, 0);
+  }
+  bufferlist final;
+  final.append(bla);
+  final.append(blb);
+  final.append(blc);
+  bufferlist actual;
+  {
+    ASSERT_EQ((int)final.length(),
+	      store->read(cid, a, 0, final.length(), actual));
+    ASSERT_TRUE(final.contents_equal(actual));
+  }
+  {
+    ObjectStore::Transaction t;
+    t.remove(cid, a);
+    t.remove_collection(cid);
+    cerr << "Cleaning" << std::endl;
+    r = store->apply_transaction(&osr, std::move(t));
+    ASSERT_EQ(r, 0);
+  }
+  g_conf->set_val("bluestore_inject_wal_apply_delay", "0");
+  g_ceph_context->_conf->apply_changes(NULL);
+}
+
+TEST_P(StoreTest, AppendZeroTrailingSharedBlock) {
+  ObjectStore::Sequencer osr("test");
+  int r;
+  coll_t cid;
+  ghobject_t a(hobject_t(sobject_t("fooo", CEPH_NOSNAP)));
+  ghobject_t b = a;
+  b.hobj.snap = 1;
+  {
+    ObjectStore::Transaction t;
+    t.create_collection(cid, 0);
+    cerr << "Creating collection " << cid << std::endl;
+    r = store->apply_transaction(&osr, std::move(t));
+    ASSERT_EQ(r, 0);
+  }
+  unsigned min_alloc = g_conf->bluestore_min_alloc_size;
+  unsigned size = min_alloc / 3;
+  bufferptr bpa(size);
+  memset(bpa.c_str(), 1, bpa.length());
+  bufferlist bla;
+  bla.append(bpa);
+  // make sure there is some trailing gunk in the last block
+  {
+    bufferlist bt;
+    bt.append(bla);
+    bt.append("BADBADBADBAD");
+    ObjectStore::Transaction t;
+    t.write(cid, a, 0, bt.length(), bt, 0);
+    r = store->apply_transaction(&osr, std::move(t));
+    ASSERT_EQ(r, 0);
+  }
+  {
+    ObjectStore::Transaction t;
+    t.truncate(cid, a, size);
+    r = store->apply_transaction(&osr, std::move(t));
+    ASSERT_EQ(r, 0);
+  }
+
+  // clone
+  {
+    ObjectStore::Transaction t;
+    t.clone(cid, a, b);
+    r = store->apply_transaction(&osr, std::move(t));
+    ASSERT_EQ(r, 0);
+  }
+
+  // append with implicit zeroing
+  bufferptr bpb(size);
+  memset(bpb.c_str(), 2, bpb.length());
+  bufferlist blb;
+  blb.append(bpb);
+  {
+    ObjectStore::Transaction t;
+    t.write(cid, a, min_alloc * 3, blb.length(), blb, 0);
+    r = store->apply_transaction(&osr, std::move(t));
+    ASSERT_EQ(r, 0);
+  }
+  bufferlist final;
+  final.append(bla);
+  bufferlist zeros;
+  zeros.append_zero(min_alloc * 3 - size);
+  final.append(zeros);
+  final.append(blb);
+  bufferlist actual;
+  {
+    ASSERT_EQ((int)final.length(),
+	      store->read(cid, a, 0, final.length(), actual));
+    final.hexdump(cout);
+    actual.hexdump(cout);
+    ASSERT_TRUE(final.contents_equal(actual));
+  }
+  {
+    ObjectStore::Transaction t;
+    t.remove(cid, a);
+    t.remove(cid, b);
+    t.remove_collection(cid);
+    cerr << "Cleaning" << std::endl;
     r = store->apply_transaction(&osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
@@ -642,7 +813,7 @@ TEST_P(StoreTest, SmallSequentialUnaligned) {
     ObjectStore::Transaction t;
     t.create_collection(cid, 0);
     cerr << "Creating collection " << cid << std::endl;
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
   bufferlist bl;
@@ -653,7 +824,7 @@ TEST_P(StoreTest, SmallSequentialUnaligned) {
   for (int i=0; i<1000; ++i) {
     ObjectStore::Transaction t;
     t.write(cid, a, i*len, len, bl, 0);
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
   {
@@ -661,7 +832,7 @@ TEST_P(StoreTest, SmallSequentialUnaligned) {
     t.remove(cid, a);
     t.remove_collection(cid);
     cerr << "Cleaning" << std::endl;
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
 }
@@ -676,7 +847,7 @@ TEST_P(StoreTest, ManyBigWrite) {
     ObjectStore::Transaction t;
     t.create_collection(cid, 0);
     cerr << "Creating collection " << cid << std::endl;
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
   bufferlist bl;
@@ -686,28 +857,28 @@ TEST_P(StoreTest, ManyBigWrite) {
   for (int i=0; i<10; ++i) {
     ObjectStore::Transaction t;
     t.write(cid, a, i*4*1048586, 4*1048576, bl, 0);
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
   // aligned
   for (int i=0; i<10; ++i) {
     ObjectStore::Transaction t;
     t.write(cid, b, (rand() % 256)*4*1048576, 4*1048576, bl, 0);
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
   // unaligned
   for (int i=0; i<10; ++i) {
     ObjectStore::Transaction t;
     t.write(cid, b, (rand() % (256*4096))*1024, 4*1048576, bl, 0);
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
   // do some zeros
   for (int i=0; i<10; ++i) {
     ObjectStore::Transaction t;
     t.zero(cid, b, (rand() % (256*4096))*1024, 16*1048576);
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
   {
@@ -716,7 +887,7 @@ TEST_P(StoreTest, ManyBigWrite) {
     t.remove(cid, b);
     t.remove_collection(cid);
     cerr << "Cleaning" << std::endl;
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
 }
@@ -730,7 +901,7 @@ TEST_P(StoreTest, MiscFragmentTests) {
     ObjectStore::Transaction t;
     t.create_collection(cid, 0);
     cerr << "Creating collection " << cid << std::endl;
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
   bufferlist bl;
@@ -740,13 +911,13 @@ TEST_P(StoreTest, MiscFragmentTests) {
   {
     ObjectStore::Transaction t;
     t.write(cid, a, 0, 524288, bl, 0);
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
   {
     ObjectStore::Transaction t;
     t.write(cid, a, 1048576, 524288, bl, 0);
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
   {
@@ -759,7 +930,7 @@ TEST_P(StoreTest, MiscFragmentTests) {
   {
     ObjectStore::Transaction t;
     t.write(cid, a, 1048576 - 4096, 524288, bl, 0);
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
   {
@@ -767,7 +938,7 @@ TEST_P(StoreTest, MiscFragmentTests) {
     t.remove(cid, a);
     t.remove_collection(cid);
     cerr << "Cleaning" << std::endl;
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
 
@@ -792,7 +963,7 @@ TEST_P(StoreTest, SimpleAttrTest) {
   {
     ObjectStore::Transaction t;
     t.create_collection(cid, 0);
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
   {
@@ -809,7 +980,7 @@ TEST_P(StoreTest, SimpleAttrTest) {
     t.touch(cid, hoid);
     t.setattr(cid, hoid, "foo", val);
     t.setattr(cid, hoid, "bar", val2);
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
   {
@@ -836,7 +1007,7 @@ TEST_P(StoreTest, SimpleAttrTest) {
     ObjectStore::Transaction t;
     t.remove(cid, hoid);
     t.remove_collection(cid);
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
 }
@@ -849,7 +1020,7 @@ TEST_P(StoreTest, SimpleListTest) {
     ObjectStore::Transaction t;
     t.create_collection(cid, 0);
     cerr << "Creating collection " << cid << std::endl;
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
   set<ghobject_t, ghobject_t::BitwiseComparator> all;
@@ -865,7 +1036,7 @@ TEST_P(StoreTest, SimpleListTest) {
       t.touch(cid, hoid);
       cerr << "Creating object " << hoid << std::endl;
     }
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
   for (int bitwise=0; bitwise<2; ++bitwise) {
@@ -904,7 +1075,7 @@ TEST_P(StoreTest, SimpleListTest) {
       t.remove(cid, *p);
     t.remove_collection(cid);
     cerr << "Cleaning" << std::endl;
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
 }
@@ -946,7 +1117,7 @@ TEST_P(StoreTest, MultipoolListTest) {
     ObjectStore::Transaction t;
     t.create_collection(cid, 0);
     cerr << "Creating collection " << cid << std::endl;
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
   set<ghobject_t, ghobject_t::BitwiseComparator> all, saw;
@@ -964,7 +1135,7 @@ TEST_P(StoreTest, MultipoolListTest) {
       t.touch(cid, hoid);
       cerr << "Creating object " << hoid << std::endl;
     }
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
   {
@@ -991,7 +1162,7 @@ TEST_P(StoreTest, MultipoolListTest) {
       t.remove(cid, *p);
     t.remove_collection(cid);
     cerr << "Cleaning" << std::endl;
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
 }
@@ -1004,7 +1175,7 @@ TEST_P(StoreTest, SimpleCloneTest) {
     ObjectStore::Transaction t;
     t.create_collection(cid, 0);
     cerr << "Creating collection " << cid << std::endl;
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
   ghobject_t hoid(hobject_t(sobject_t("Object 1", CEPH_NOSNAP),
@@ -1022,7 +1193,7 @@ TEST_P(StoreTest, SimpleCloneTest) {
     t.write(cid, hoid, 0, small.length(), small);
     t.write(cid, hoid, 10, small.length(), small);
     cerr << "Creating object and set attr " << hoid << std::endl;
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
   ghobject_t hoid2(hobject_t(sobject_t("Object 2", CEPH_NOSNAP),
@@ -1036,7 +1207,7 @@ TEST_P(StoreTest, SimpleCloneTest) {
     t.setattr(cid, hoid, "attr1", large);
     t.setattr(cid, hoid, "attr2", small);
     cerr << "Clone object and rm attr" << std::endl;
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
 
     r = store->read(cid, hoid, 10, 5, newdata);
@@ -1071,7 +1242,7 @@ TEST_P(StoreTest, SimpleCloneTest) {
     ObjectStore::Transaction t;
     t.remove(cid, hoid);
     t.remove(cid, hoid2);
-    ASSERT_EQ(0u, store->apply_transaction(&osr, std::move(t)));
+    ASSERT_EQ(0, apply_transaction(store, &osr, std::move(t)));
   }
   {
     bufferlist final;
@@ -1089,7 +1260,7 @@ TEST_P(StoreTest, SimpleCloneTest) {
     al.append(a);
     final.append(a);
     t.write(cid, hoid, pl.length(), a.length(), al);
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
     bufferlist rl;
     ASSERT_EQ((int)final.length(),
@@ -1100,7 +1271,7 @@ TEST_P(StoreTest, SimpleCloneTest) {
     ObjectStore::Transaction t;
     t.remove(cid, hoid);
     t.remove(cid, hoid2);
-    ASSERT_EQ(0u, store->apply_transaction(&osr, std::move(t)));
+    ASSERT_EQ(0, apply_transaction(store, &osr, std::move(t)));
   }
   {
     bufferlist final;
@@ -1121,7 +1292,7 @@ TEST_P(StoreTest, SimpleCloneTest) {
     al.append(a);
     final.append(a);
     t.write(cid, hoid, pl.length() + z.length(), a.length(), al);
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
     bufferlist rl;
     ASSERT_EQ((int)final.length(),
@@ -1132,7 +1303,7 @@ TEST_P(StoreTest, SimpleCloneTest) {
     ObjectStore::Transaction t;
     t.remove(cid, hoid);
     t.remove(cid, hoid2);
-    ASSERT_EQ(0u, store->apply_transaction(&osr, std::move(t)));
+    ASSERT_EQ(0, apply_transaction(store, &osr, std::move(t)));
   }
   {
     bufferlist final;
@@ -1153,7 +1324,7 @@ TEST_P(StoreTest, SimpleCloneTest) {
     al.append(a);
     final.append(a);
     t.write(cid, hoid, 17000, a.length(), al);
-    ASSERT_EQ(0u, store->apply_transaction(&osr, std::move(t)));
+    ASSERT_EQ(0, apply_transaction(store, &osr, std::move(t)));
     bufferlist rl;
     ASSERT_EQ((int)final.length(),
 	      store->read(cid, hoid, 0, final.length(), rl));
@@ -1167,7 +1338,7 @@ TEST_P(StoreTest, SimpleCloneTest) {
     ObjectStore::Transaction t;
     t.remove(cid, hoid);
     t.remove(cid, hoid2);
-    ASSERT_EQ(0u, store->apply_transaction(&osr, std::move(t)));
+    ASSERT_EQ(0, apply_transaction(store, &osr, std::move(t)));
   }
   {
     bufferptr p(1048576);
@@ -1182,7 +1353,7 @@ TEST_P(StoreTest, SimpleCloneTest) {
     bufferlist al;
     al.append(a);
     t.write(cid, hoid, a.length(), a.length(), al);
-    ASSERT_EQ(0u, store->apply_transaction(&osr, std::move(t)));
+    ASSERT_EQ(0, apply_transaction(store, &osr, std::move(t)));
     bufferlist rl;
     bufferlist final;
     final.substr_of(pl, 0, al.length());
@@ -1202,7 +1373,7 @@ TEST_P(StoreTest, SimpleCloneTest) {
     ObjectStore::Transaction t;
     t.remove(cid, hoid);
     t.remove(cid, hoid2);
-    ASSERT_EQ(0u, store->apply_transaction(&osr, std::move(t)));
+    ASSERT_EQ(0, apply_transaction(store, &osr, std::move(t)));
   }
   {
     bufferptr p(65536);
@@ -1217,7 +1388,7 @@ TEST_P(StoreTest, SimpleCloneTest) {
     bufferlist al;
     al.append(a);
     t.write(cid, hoid, 32768, a.length(), al);
-    ASSERT_EQ(0u, store->apply_transaction(&osr, std::move(t)));
+    ASSERT_EQ(0, apply_transaction(store, &osr, std::move(t)));
     bufferlist rl;
     bufferlist final;
     final.substr_of(pl, 0, 32768);
@@ -1237,7 +1408,7 @@ TEST_P(StoreTest, SimpleCloneTest) {
     ObjectStore::Transaction t;
     t.remove(cid, hoid);
     t.remove(cid, hoid2);
-    ASSERT_EQ(0u, store->apply_transaction(&osr, std::move(t)));
+    ASSERT_EQ(0, apply_transaction(store, &osr, std::move(t)));
   }
   {
     bufferptr p(65536);
@@ -1252,7 +1423,7 @@ TEST_P(StoreTest, SimpleCloneTest) {
     bufferlist al;
     al.append(a);
     t.write(cid, hoid, 33768, a.length(), al);
-    ASSERT_EQ(0u, store->apply_transaction(&osr, std::move(t)));
+    ASSERT_EQ(0, apply_transaction(store, &osr, std::move(t)));
     bufferlist rl;
     bufferlist final;
     final.substr_of(pl, 0, 33768);
@@ -1274,7 +1445,7 @@ TEST_P(StoreTest, SimpleCloneTest) {
     t.remove(cid, hoid2);
     t.remove_collection(cid);
     cerr << "Cleaning" << std::endl;
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
 }
@@ -1287,7 +1458,7 @@ TEST_P(StoreTest, OmapSimple) {
     ObjectStore::Transaction t;
     t.create_collection(cid, 0);
     cerr << "Creating collection " << cid << std::endl;
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
   ghobject_t hoid(hobject_t(sobject_t("omap_obj", CEPH_NOSNAP),
@@ -1305,7 +1476,7 @@ TEST_P(StoreTest, OmapSimple) {
     t.omap_setkeys(cid, hoid, km);
     t.omap_setheader(cid, hoid, header);
     cerr << "Creating object and set omap " << hoid << std::endl;
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
   // get header, keys
@@ -1342,7 +1513,7 @@ TEST_P(StoreTest, OmapSimple) {
     t.remove(cid, hoid);
     t.remove_collection(cid);
     cerr << "Cleaning" << std::endl;
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
 }
@@ -1355,7 +1526,7 @@ TEST_P(StoreTest, OmapCloneTest) {
     ObjectStore::Transaction t;
     t.create_collection(cid, 0);
     cerr << "Creating collection " << cid << std::endl;
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
   ghobject_t hoid(hobject_t(sobject_t("Object 1", CEPH_NOSNAP),
@@ -1373,7 +1544,7 @@ TEST_P(StoreTest, OmapCloneTest) {
     t.omap_setkeys(cid, hoid, km);
     t.omap_setheader(cid, hoid, header);
     cerr << "Creating object and set omap " << hoid << std::endl;
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
   ghobject_t hoid2(hobject_t(sobject_t("Object 2", CEPH_NOSNAP),
@@ -1382,7 +1553,7 @@ TEST_P(StoreTest, OmapCloneTest) {
     ObjectStore::Transaction t;
     t.clone(cid, hoid, hoid2);
     cerr << "Clone object" << std::endl;
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
   {
@@ -1398,7 +1569,7 @@ TEST_P(StoreTest, OmapCloneTest) {
     t.remove(cid, hoid2);
     t.remove_collection(cid);
     cerr << "Cleaning" << std::endl;
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
 }
@@ -1411,7 +1582,7 @@ TEST_P(StoreTest, SimpleCloneRangeTest) {
     ObjectStore::Transaction t;
     t.create_collection(cid, 0);
     cerr << "Creating collection " << cid << std::endl;
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
   ghobject_t hoid(hobject_t(sobject_t("Object 1", CEPH_NOSNAP)));
@@ -1422,7 +1593,7 @@ TEST_P(StoreTest, SimpleCloneRangeTest) {
     ObjectStore::Transaction t;
     t.write(cid, hoid, 10, 5, small);
     cerr << "Creating object and write bl " << hoid << std::endl;
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
   ghobject_t hoid2(hobject_t(sobject_t("Object 2", CEPH_NOSNAP)));
@@ -1431,7 +1602,7 @@ TEST_P(StoreTest, SimpleCloneRangeTest) {
     ObjectStore::Transaction t;
     t.clone_range(cid, hoid, hoid2, 10, 5, 0);
     cerr << "Clone range object" << std::endl;
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
     r = store->read(cid, hoid2, 0, 5, newdata);
     ASSERT_EQ(r, 5);
@@ -1442,7 +1613,7 @@ TEST_P(StoreTest, SimpleCloneRangeTest) {
     t.truncate(cid, hoid, 1024*1024);
     t.clone_range(cid, hoid, hoid2, 0, 1024*1024, 0);
     cerr << "Clone range object" << std::endl;
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
     struct stat stat, stat2;
     r = store->stat(cid, hoid, &stat);
@@ -1456,7 +1627,7 @@ TEST_P(StoreTest, SimpleCloneRangeTest) {
     t.remove(cid, hoid2);
     t.remove_collection(cid);
     cerr << "Cleaning" << std::endl;
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
 }
@@ -1470,7 +1641,7 @@ TEST_P(StoreTest, SimpleObjectLongnameTest) {
     ObjectStore::Transaction t;
     t.create_collection(cid, 0);
     cerr << "Creating collection " << cid << std::endl;
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
   ghobject_t hoid(hobject_t(sobject_t("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaObjectaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa 1", CEPH_NOSNAP)));
@@ -1478,7 +1649,7 @@ TEST_P(StoreTest, SimpleObjectLongnameTest) {
     ObjectStore::Transaction t;
     t.touch(cid, hoid);
     cerr << "Creating object " << hoid << std::endl;
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
   {
@@ -1486,7 +1657,7 @@ TEST_P(StoreTest, SimpleObjectLongnameTest) {
     t.remove(cid, hoid);
     t.remove_collection(cid);
     cerr << "Cleaning" << std::endl;
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
 }
@@ -1509,7 +1680,7 @@ TEST_P(StoreTest, LongnameSplitTest) {
     ObjectStore::Transaction t;
     t.create_collection(cid, 0);
     cerr << "Creating collection " << cid << std::endl;
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
   for (unsigned i = 0; i < 320; ++i) {
@@ -1517,7 +1688,7 @@ TEST_P(StoreTest, LongnameSplitTest) {
     ghobject_t hoid = generate_long_name(i);
     t.touch(cid, hoid);
     cerr << "Creating object " << hoid << std::endl;
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
   }
 
   ghobject_t test_obj = generate_long_name(319);
@@ -1529,7 +1700,7 @@ TEST_P(StoreTest, LongnameSplitTest) {
     t.collection_move_rename(
       cid, test_obj,
       cid, test_obj_2);
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
   }
 
   for (unsigned i = 0; i < 319; ++i) {
@@ -1537,14 +1708,14 @@ TEST_P(StoreTest, LongnameSplitTest) {
     ghobject_t hoid = generate_long_name(i);
     t.remove(cid, hoid);
     cerr << "Removing object " << hoid << std::endl;
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
   }
   {
     ObjectStore::Transaction t;
     t.remove(cid, test_obj_2);
     t.remove_collection(cid);
     cerr << "Cleaning" << std::endl;
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
 
@@ -1561,7 +1732,7 @@ TEST_P(StoreTest, ManyObjectTest) {
   {
     ObjectStore::Transaction t;
     t.create_collection(cid, 0);
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
   for (int i = 0; i < NUM_OBJS; ++i) {
@@ -1574,7 +1745,7 @@ TEST_P(StoreTest, ManyObjectTest) {
     ghobject_t hoid(hobject_t(sobject_t(string(buf) + base, CEPH_NOSNAP)));
     t.touch(cid, hoid);
     created.insert(hoid);
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
 
@@ -1664,14 +1835,14 @@ TEST_P(StoreTest, ManyObjectTest) {
        ++i) {
     ObjectStore::Transaction t;
     t.remove(cid, *i);
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
   cerr << "cleaning up" << std::endl;
   {
     ObjectStore::Transaction t;
     t.remove_collection(cid);
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
 }
@@ -1880,7 +2051,7 @@ public:
   int init() {
     ObjectStore::Transaction t;
     t.create_collection(cid, 0);
-    return store->apply_transaction(osr, std::move(t));
+    return apply_transaction(store, osr, std::move(t));
   }
   void shutdown() {
     while (1) {
@@ -1895,11 +2066,11 @@ public:
 	   p != objects.end(); ++p) {
 	t.remove(cid, *p);
       }
-      store->apply_transaction(osr, std::move(t));
+      apply_transaction(store, osr, std::move(t));
     }
     ObjectStore::Transaction t;
     t.remove_collection(cid);
-    store->apply_transaction(osr, std::move(t));
+    apply_transaction(store, osr, std::move(t));
   }
 
   ghobject_t get_uniform_random_object() {
@@ -2534,7 +2705,7 @@ TEST_P(StoreTest, HashCollisionTest) {
   {
     ObjectStore::Transaction t;
     t.create_collection(cid, 0);
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
   string base = "";
@@ -2553,7 +2724,7 @@ TEST_P(StoreTest, HashCollisionTest) {
     {
       ObjectStore::Transaction t;
       t.touch(cid, hoid);
-      r = store->apply_transaction(&osr, std::move(t));
+      r = apply_transaction(store, &osr, std::move(t));
       ASSERT_EQ(r, 0);
     }
     created.insert(hoid);
@@ -2600,12 +2771,12 @@ TEST_P(StoreTest, HashCollisionTest) {
        ++i) {
     ObjectStore::Transaction t;
     t.remove(cid, *i);
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
   ObjectStore::Transaction t;
   t.remove_collection(cid);
-  r = store->apply_transaction(&osr, std::move(t));
+  r = apply_transaction(store, &osr, std::move(t));
   ASSERT_EQ(r, 0);
 }
 
@@ -2617,7 +2788,7 @@ TEST_P(StoreTest, ScrubTest) {
   {
     ObjectStore::Transaction t;
     t.create_collection(cid, 0);
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
   string base = "aaaaa";
@@ -2634,7 +2805,7 @@ TEST_P(StoreTest, ScrubTest) {
     {
       ObjectStore::Transaction t;
       t.touch(cid, hoid);
-      r = store->apply_transaction(&osr, std::move(t));
+      r = apply_transaction(store, &osr, std::move(t));
       ASSERT_EQ(r, 0);
     }
     created.insert(hoid);
@@ -2650,7 +2821,7 @@ TEST_P(StoreTest, ScrubTest) {
     t.touch(cid, hoid1);
     t.touch(cid, hoid2);
     t.touch(cid, hoid3);
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     created.insert(hoid1);
     created.insert(hoid2);
     created.insert(hoid3);
@@ -2698,12 +2869,12 @@ TEST_P(StoreTest, ScrubTest) {
        ++i) {
     ObjectStore::Transaction t;
     t.remove(cid, *i);
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
   ObjectStore::Transaction t;
   t.remove_collection(cid);
-  r = store->apply_transaction(&osr, std::move(t));
+  r = apply_transaction(store, &osr, std::move(t));
   ASSERT_EQ(r, 0);
 }
 
@@ -2716,7 +2887,7 @@ TEST_P(StoreTest, OMapTest) {
   {
     ObjectStore::Transaction t;
     t.create_collection(cid, 0);
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
 
@@ -2727,7 +2898,7 @@ TEST_P(StoreTest, OMapTest) {
     t.omap_clear(cid, hoid);
     map<string, bufferlist> start_set;
     t.omap_setkeys(cid, hoid, start_set);
-    store->apply_transaction(&osr, std::move(t));
+    apply_transaction(store, &osr, std::move(t));
   }
 
   for (int i = 0; i < 100; i++) {
@@ -2762,7 +2933,7 @@ TEST_P(StoreTest, OMapTest) {
     to_add.insert(pair<string, bufferlist>("key-" + string(buf), bl));
     attrs.insert(pair<string, bufferlist>("key-" + string(buf), bl));
     t.omap_setkeys(cid, hoid, to_add);
-    store->apply_transaction(&osr, std::move(t));
+    apply_transaction(store, &osr, std::move(t));
   }
 
   int i = 0;
@@ -2792,7 +2963,7 @@ TEST_P(StoreTest, OMapTest) {
     set<string> keys_to_remove;
     keys_to_remove.insert(to_remove);
     t.omap_rmkeys(cid, hoid, keys_to_remove);
-    store->apply_transaction(&osr, std::move(t));
+    apply_transaction(store, &osr, std::move(t));
 
     attrs.erase(to_remove);
 
@@ -2804,7 +2975,7 @@ TEST_P(StoreTest, OMapTest) {
     bl1.append("omap_header");
     ObjectStore::Transaction t;
     t.omap_setheader(cid, hoid, bl1);
-    store->apply_transaction(&osr, std::move(t));
+    apply_transaction(store, &osr, std::move(t));
     t = ObjectStore::Transaction();
  
     bufferlist bl2;
@@ -2812,7 +2983,7 @@ TEST_P(StoreTest, OMapTest) {
     map<string, bufferlist> to_add;
     to_add.insert(pair<string, bufferlist>("key", bl2));
     t.omap_setkeys(cid, hoid, to_add);
-    store->apply_transaction(&osr, std::move(t));
+    apply_transaction(store, &osr, std::move(t));
 
     bufferlist bl3;
     map<string, bufferlist> cur_attrs;
@@ -2841,12 +3012,12 @@ TEST_P(StoreTest, OMapTest) {
       t.touch(cid, hoid);
       t.omap_setheader(cid, hoid, h);
       t.omap_setkeys(cid, hoid, to_set);
-      store->apply_transaction(&osr, std::move(t));
+      apply_transaction(store, &osr, std::move(t));
     }
     {
       ObjectStore::Transaction t;
       t.omap_rmkeyrange(cid, hoid, "3", "7");
-      store->apply_transaction(&osr, std::move(t));
+      apply_transaction(store, &osr, std::move(t));
     }
     {
       bufferlist hdr;
@@ -2864,7 +3035,7 @@ TEST_P(StoreTest, OMapTest) {
     {
       ObjectStore::Transaction t;
       t.omap_clear(cid, hoid);
-      store->apply_transaction(&osr, std::move(t));
+      apply_transaction(store, &osr, std::move(t));
     }
     {
       bufferlist hdr;
@@ -2878,7 +3049,7 @@ TEST_P(StoreTest, OMapTest) {
   ObjectStore::Transaction t;
   t.remove(cid, hoid);
   t.remove_collection(cid);
-  r = store->apply_transaction(&osr, std::move(t));
+  r = apply_transaction(store, &osr, std::move(t));
   ASSERT_EQ(r, 0);
 }
 
@@ -2891,7 +3062,7 @@ TEST_P(StoreTest, OMapIterator) {
   {
     ObjectStore::Transaction t;
     t.create_collection(cid, 0);
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
 
@@ -2902,7 +3073,7 @@ TEST_P(StoreTest, OMapIterator) {
     t.omap_clear(cid, hoid);
     map<string, bufferlist> start_set;
     t.omap_setkeys(cid, hoid, start_set);
-    store->apply_transaction(&osr, std::move(t));
+    apply_transaction(store, &osr, std::move(t));
   }
   ObjectMap::ObjectMapIterator iter;
   bool correct;
@@ -2945,7 +3116,7 @@ TEST_P(StoreTest, OMapIterator) {
     attrs.insert(pair<string, bufferlist>("key-" + string(buf), bl));
     ObjectStore::Transaction t;
     t.omap_setkeys(cid, hoid, to_add);
-    store->apply_transaction(&osr, std::move(t));
+    apply_transaction(store, &osr, std::move(t));
   }
 
   iter = store->get_omap_iterator(cid, hoid);
@@ -2971,7 +3142,7 @@ TEST_P(StoreTest, OMapIterator) {
     ObjectStore::Transaction t;
     t.remove(cid, hoid);
     t.remove_collection(cid);
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
 }
@@ -2993,7 +3164,7 @@ TEST_P(StoreTest, XattrTest) {
     ObjectStore::Transaction t;
     t.create_collection(cid, 0);
     t.touch(cid, hoid);
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
 
@@ -3012,7 +3183,7 @@ TEST_P(StoreTest, XattrTest) {
     attrs["attr4"] = big;
     t.setattr(cid, hoid, "attr3", big);
     attrs["attr3"] = big;
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
 
@@ -3031,7 +3202,7 @@ TEST_P(StoreTest, XattrTest) {
     ObjectStore::Transaction t;
     t.rmattr(cid, hoid, "attr2");
     attrs.erase("attr2");
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
 
@@ -3059,7 +3230,7 @@ TEST_P(StoreTest, XattrTest) {
   ObjectStore::Transaction t;
   t.remove(cid, hoid);
   t.remove_collection(cid);
-  r = store->apply_transaction(&osr, std::move(t));
+  r = apply_transaction(store, &osr, std::move(t));
   ASSERT_EQ(r, 0);
 }
 
@@ -3075,7 +3246,7 @@ void colsplittest(
   {
     ObjectStore::Transaction t;
     t.create_collection(cid, common_suffix_size);
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
   {
@@ -3090,14 +3261,14 @@ void colsplittest(
 	  i<<common_suffix_size,
 	  52, "")));
     }
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
   {
     ObjectStore::Transaction t;
     t.create_collection(tid, common_suffix_size + 1);
     t.split_collection(cid, common_suffix_size+1, 1<<common_suffix_size, tid);
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
 
@@ -3128,7 +3299,7 @@ void colsplittest(
 
   t.remove_collection(cid);
   t.remove_collection(tid);
-  r = store->apply_transaction(&osr, std::move(t));
+  r = apply_transaction(store, &osr, std::move(t));
   ASSERT_EQ(r, 0);
 }
 
@@ -3159,7 +3330,7 @@ TEST_P(StoreTest, TwoHash) {
   {
     ObjectStore::Transaction t;
     t.create_collection(cid, 0);
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
   std::cout << "Making objects" << std::endl;
@@ -3173,7 +3344,7 @@ TEST_P(StoreTest, TwoHash) {
     }
     o.hobj.set_hash((i << 16) | 0xB1);
     t.touch(cid, o);
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
   std::cout << "Removing half" << std::endl;
@@ -3183,7 +3354,7 @@ TEST_P(StoreTest, TwoHash) {
     o.hobj.pool = -1;
     o.hobj.set_hash((i << 16) | 0xA1);
     t.remove(cid, o);
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
   std::cout << "Checking" << std::endl;
@@ -3211,12 +3382,12 @@ TEST_P(StoreTest, TwoHash) {
     t.remove(cid, o);
     o.hobj.set_hash((i << 16) | 0xB1);
     t.remove(cid, o);
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
   ObjectStore::Transaction t;
   t.remove_collection(cid);
-  r = store->apply_transaction(&osr, std::move(t));
+  r = apply_transaction(store, &osr, std::move(t));
   ASSERT_EQ(r, 0);
 }
 
@@ -3233,7 +3404,7 @@ TEST_P(StoreTest, Rename) {
     ObjectStore::Transaction t;
     t.create_collection(cid, 0);
     t.write(cid, srcoid, 0, a.length(), a);
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
   ASSERT_TRUE(store->exists(cid, srcoid));
@@ -3243,7 +3414,7 @@ TEST_P(StoreTest, Rename) {
     t.remove(cid, srcoid);
     t.write(cid, srcoid, 0, b.length(), b);
     t.setattr(cid, srcoid, "attr", b);
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
   ASSERT_TRUE(store->exists(cid, srcoid));
@@ -3261,7 +3432,7 @@ TEST_P(StoreTest, Rename) {
     t.collection_move_rename(cid, srcoid, cid, dstoid);
     t.remove(cid, srcoid);
     t.setattr(cid, srcoid, "attr", a);
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
   ASSERT_TRUE(store->exists(cid, dstoid));
@@ -3275,7 +3446,7 @@ TEST_P(StoreTest, Rename) {
     t.remove(cid, dstoid);
     t.remove(cid, srcoid);
     t.remove_collection(cid);
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
 }
@@ -3290,7 +3461,7 @@ TEST_P(StoreTest, MoveRename) {
     ObjectStore::Transaction t;
     t.create_collection(cid, 0);
     t.touch(cid, oid);
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
   ASSERT_TRUE(store->exists(cid, oid));
@@ -3305,7 +3476,7 @@ TEST_P(StoreTest, MoveRename) {
     t.write(cid, temp_oid, 0, data.length(), data);
     t.setattr(cid, temp_oid, "attr", attr);
     t.omap_setkeys(cid, temp_oid, omap);
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
   ASSERT_TRUE(store->exists(cid, temp_oid));
@@ -3313,7 +3484,7 @@ TEST_P(StoreTest, MoveRename) {
     ObjectStore::Transaction t;
     t.remove(cid, oid);
     t.collection_move_rename(cid, temp_oid, cid, oid);
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
   ASSERT_TRUE(store->exists(cid, oid));
@@ -3340,7 +3511,7 @@ TEST_P(StoreTest, MoveRename) {
     ObjectStore::Transaction t;
     t.remove(cid, oid);
     t.remove_collection(cid);
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
 }
@@ -3371,14 +3542,14 @@ TEST_P(StoreTest, BigRGWObjectName) {
     t.collection_move_rename(cid, oidhead, cid, oid);
     t.touch(cid, oidhead);
     t.collection_move_rename(cid, oidhead, cid, oid2);
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
 
   {
     ObjectStore::Transaction t;
     t.remove(cid, oid);
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
 
@@ -3397,7 +3568,7 @@ TEST_P(StoreTest, BigRGWObjectName) {
     ObjectStore::Transaction t;
     t.remove(cid, oid2);
     t.remove_collection(cid);
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
 
   }
@@ -3412,33 +3583,68 @@ TEST_P(StoreTest, SetAllocHint) {
     ObjectStore::Transaction t;
     t.create_collection(cid, 0);
     t.touch(cid, hoid);
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
   {
     ObjectStore::Transaction t;
     t.set_alloc_hint(cid, hoid, 4*1024*1024, 1024*4);
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
   {
     ObjectStore::Transaction t;
     t.remove(cid, hoid);
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
   {
     ObjectStore::Transaction t;
     t.set_alloc_hint(cid, hoid, 4*1024*1024, 1024*4);
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
   {
     ObjectStore::Transaction t;
     t.remove_collection(cid);
-    r = store->apply_transaction(&osr, std::move(t));
+    r = apply_transaction(store, &osr, std::move(t));
+    ASSERT_EQ(r, 0);
+  }
+}
+
+TEST_P(StoreTest, TryMoveRename) {
+  ObjectStore::Sequencer osr("test");
+  coll_t cid;
+  ghobject_t hoid(hobject_t("test_hint", "", CEPH_NOSNAP, 0, 0, ""));
+  ghobject_t hoid2(hobject_t("test_hint2", "", CEPH_NOSNAP, 0, 0, ""));
+  int r;
+  {
+    ObjectStore::Transaction t;
+    t.create_collection(cid, 0);
+    r = apply_transaction(store, &osr, std::move(t));
+    ASSERT_EQ(r, 0);
+  }
+  {
+    ObjectStore::Transaction t;
+    t.try_rename(cid, hoid, hoid2);
+    r = apply_transaction(store, &osr, std::move(t));
+    ASSERT_EQ(r, 0);
+  }
+  {
+    ObjectStore::Transaction t;
+    t.touch(cid, hoid);
+    r = apply_transaction(store, &osr, std::move(t));
+    ASSERT_EQ(r, 0);
+  }
+  {
+    ObjectStore::Transaction t;
+    t.try_rename(cid, hoid, hoid2);
+    r = apply_transaction(store, &osr, std::move(t));
     ASSERT_EQ(r, 0);
   }
+  struct stat st;
+  ASSERT_EQ(store->stat(cid, hoid, &st), -2);
+  ASSERT_EQ(store->stat(cid, hoid2, &st), 0);
 }
 
 INSTANTIATE_TEST_CASE_P(
diff --git a/src/test/opensuse-13.2/ceph.spec.in b/src/test/opensuse-13.2/ceph.spec.in
index b52d7e2..3a5a6f7 100644
--- a/src/test/opensuse-13.2/ceph.spec.in
+++ b/src/test/opensuse-13.2/ceph.spec.in
@@ -27,6 +27,10 @@
 %bcond_with selinux
 %endif
 
+# LTTng-UST enabled on Fedora, RHEL 6+, and SLES 12
+%if 0%{?fedora} || 0%{?rhel} >= 6 || 0%{?suse_version} == 1315
+%bcond_without lttng
+%endif
 
 %if (0%{?el5} || (0%{?rhel_version} >= 500 && 0%{?rhel_version} <= 600))
 %{!?python_sitelib: %global python_sitelib %(%{__python} -c "from distutils.sysconfig import get_python_lib; print(get_python_lib())")}
@@ -62,11 +66,6 @@ restorecon -R /var/log/radosgw > /dev/null 2>&1;
 %{!?tmpfiles_create: %global tmpfiles_create systemd-tmpfiles --create}
 %endif
 
-# LTTng-UST enabled on Fedora, RHEL 6, and SLES 12
-%if 0%{?fedora} || 0%{?rhel} == 6 || 0%{?suse_version} == 1315
-%global _with_lttng 1
-%endif
-
 # unify libexec for all targets
 %global _libexecdir %{_exec_prefix}/lib
 
@@ -186,7 +185,7 @@ BuildRequires:  boost-random
 BuildRequires:	python-argparse
 %endif
 # lttng and babeltrace for rbd-replay-prep
-%if 0%{?_with_lttng}
+%if %{with lttng}
 %if 0%{?fedora} || 0%{?rhel}
 BuildRequires:	lttng-ust-devel
 BuildRequires:	libbabeltrace-devel
@@ -685,6 +684,9 @@ export RPM_OPT_FLAGS=`echo $RPM_OPT_FLAGS | sed -e 's/i386/i486/'`
                 --libexecdir=%{_libexecdir} \
 		--localstatedir=/var \
 		--sysconfdir=/etc \
+%if 0%{?rhel} && ! 0%{?centos}
+                --enable-subman \
+%endif
 %if 0%{?_with_systemd}
 		--with-systemdsystemunitdir=%_unitdir \
 %endif
@@ -702,6 +704,10 @@ export RPM_OPT_FLAGS=`echo $RPM_OPT_FLAGS | sed -e 's/i386/i486/'`
 %endif
 		--with-librocksdb-static=check \
 		--with-radosgw \
+%if %{without lttng}
+		--without-lttng \
+		--without-babeltrace \
+%endif
 		$CEPH_EXTRA_CONFIGURE_ARGS \
 		%{?_with_ocf} \
 		%{?_with_tcmalloc} \
@@ -858,7 +864,7 @@ rm -rf $RPM_BUILD_ROOT
 %{_libdir}/ceph/erasure-code/libec_*.so*
 %dir %{_libdir}/ceph/compressor
 %{_libdir}/ceph/compressor/libceph_*.so*
-%if 0%{?_with_lttng}
+%if %{with lttng}
 %{_libdir}/libos_tp.so*
 %{_libdir}/libosd_tp.so*
 %endif
@@ -977,7 +983,7 @@ rm -rf $RPM_BUILD_ROOT
 %{_bindir}/rbd-replay
 %{_bindir}/rbd-replay-many
 %{_bindir}/rbdmap
-%if 0%{?_with_lttng}
+%if %{with lttng}
 %{_bindir}/rbd-replay-prep
 %endif
 %{_bindir}/ceph-post-file
@@ -994,6 +1000,7 @@ rm -rf $RPM_BUILD_ROOT
 %{_mandir}/man8/ceph.8*
 %{_mandir}/man8/rados.8*
 %{_mandir}/man8/rbd.8*
+%{_mandir}/man8/rbdmap.8*
 %{_mandir}/man8/rbd-replay.8*
 %{_mandir}/man8/rbd-replay-many.8*
 %{_mandir}/man8/rbd-replay-prep.8*
@@ -1017,19 +1024,22 @@ rm -rf $RPM_BUILD_ROOT
 %attr(750,ceph,ceph) %dir %{_localstatedir}/lib/ceph/
 
 %pre common
-CEPH_GROUP_ID=""
-CEPH_USER_ID=""
+CEPH_GROUP_ID=167
+CEPH_USER_ID=167
 %if 0%{?rhel} || 0%{?fedora}
-CEPH_GROUP_ID="-g 167"
-CEPH_USER_ID="-u 167"
-%endif
-%if 0%{?rhel} || 0%{?fedora}
-%{_sbindir}/groupadd ceph $CEPH_GROUP_ID -o -r 2>/dev/null || :
-%{_sbindir}/useradd ceph $CEPH_USER_ID -o -r -g ceph -s /sbin/nologin -c "Ceph daemons" -d %{_localstatedir}/lib/ceph 2> /dev/null || :
+%{_sbindir}/groupadd ceph -g $CEPH_GROUP_ID -o -r 2>/dev/null || :
+%{_sbindir}/useradd ceph -u $CEPH_USER_ID -o -r -g ceph -s /sbin/nologin -c "Ceph daemons" -d %{_localstatedir}/lib/ceph 2>/dev/null || :
 %endif
 %if 0%{?suse_version}
-getent group ceph >/dev/null || groupadd -r ceph
-getent passwd ceph >/dev/null || useradd -r -g ceph -d %{_localstatedir}/lib/ceph -s /sbin/nologin -c "Ceph daemons" ceph
+if ! getent group ceph >/dev/null ; then
+    CEPH_GROUP_ID_OPTION=""
+    getent group $CEPH_GROUP_ID >/dev/null || CEPH_GROUP_ID_OPTION="-g $CEPH_GROUP_ID"
+    groupadd ceph $CEPH_GROUP_ID_OPTION -r 2>/dev/null || :
+fi
+if ! getent passwd ceph >/dev/null ; then
+    CEPH_USER_ID_OPTION=""
+    getent passwd $CEPH_USER_ID >/dev/null || CEPH_USER_ID_OPTION="-u $CEPH_USER_ID"
+    useradd ceph $CEPH_USER_ID_OPTION -r -g ceph -s /sbin/nologin -c "Ceph daemons" -d %{_localstatedir}/lib/ceph 2>/dev/null || :
 %endif
 exit 0
 
@@ -1182,6 +1192,9 @@ fi
 %{_mandir}/man8/ceph-clsinfo.8*
 %{_mandir}/man8/ceph-disk.8*
 %{_mandir}/man8/ceph-osd.8*
+%if 0%{?rhel} && ! 0%{?centos}
+/etc/cron.hourly/subman
+%endif
 %if 0%{?_with_systemd}
 %{_unitdir}/ceph-osd@.service
 %{_unitdir}/ceph-osd.target
@@ -1220,7 +1233,7 @@ fi
 %files -n librados2
 %defattr(-,root,root,-)
 %{_libdir}/librados.so.*
-%if 0%{?_with_lttng}
+%if %{with lttng}
 %{_libdir}/librados_tp.so.*
 %endif
 
@@ -1244,7 +1257,7 @@ fi
 %{_includedir}/rados/rados_types.hpp
 %{_includedir}/rados/memory.h
 %{_libdir}/librados.so
-%if 0%{?_with_lttng}
+%if %{with lttng}
 %{_libdir}/librados_tp.so
 %endif
 %{_bindir}/librados-config
@@ -1279,7 +1292,7 @@ fi
 %files -n librbd1
 %defattr(-,root,root,-)
 %{_libdir}/librbd.so.*
-%if 0%{?_with_lttng}
+%if %{with lttng}
 %{_libdir}/librbd_tp.so.*
 %endif
 
@@ -1299,7 +1312,7 @@ ln -sf %{_libdir}/librbd.so.1 /usr/lib64/qemu/librbd.so.1
 %{_includedir}/rbd/librbd.hpp
 %{_includedir}/rbd/features.h
 %{_libdir}/librbd.so
-%if 0%{?_with_lttng}
+%if %{with lttng}
 %{_libdir}/librbd_tp.so
 %endif
 
diff --git a/src/test/osd/osd-scrub-repair.sh b/src/test/osd/osd-scrub-repair.sh
index 06d7b6b..3b9b1d8 100755
--- a/src/test/osd/osd-scrub-repair.sh
+++ b/src/test/osd/osd-scrub-repair.sh
@@ -75,8 +75,13 @@ function corrupt_and_repair_two() {
     #
     # 1) remove the corresponding file from the OSDs
     #
-    objectstore_tool $dir $first SOMETHING remove || return 1
-    objectstore_tool $dir $second SOMETHING remove || return 1
+    pids=""
+    run_in_background pids objectstore_tool $dir $first SOMETHING remove
+    run_in_background pids objectstore_tool $dir $second SOMETHING remove
+    wait_background pids
+    return_code=$?
+    if [ $return_code -ne 0 ]; then return $return_code; fi
+
     #
     # 2) repair the PG
     #
@@ -85,8 +90,13 @@ function corrupt_and_repair_two() {
     #
     # 3) The files must be back
     #
-    objectstore_tool $dir $first SOMETHING list-attrs || return 1
-    objectstore_tool $dir $second SOMETHING list-attrs || return 1
+    pids=""
+    run_in_background pids objectstore_tool $dir $first SOMETHING list-attrs
+    run_in_background pids objectstore_tool $dir $second SOMETHING list-attrs
+    wait_background pids
+    return_code=$?
+    if [ $return_code -ne 0 ]; then return $return_code; fi
+
     rados --pool $poolname get SOMETHING $dir/COPY || return 1
     diff $dir/ORIGINAL $dir/COPY || return 1
 }
@@ -258,9 +268,14 @@ function TEST_unfound_erasure_coded() {
     #
     # 1) remove the corresponding file from the OSDs
     #
-    objectstore_tool $dir $not_primary_first SOMETHING remove || return 1
-    objectstore_tool $dir $not_primary_second SOMETHING remove || return 1
-    objectstore_tool $dir $not_primary_third SOMETHING remove || return 1
+    pids=""
+    run_in_background pids objectstore_tool $dir $not_primary_first SOMETHING remove
+    run_in_background pids objectstore_tool $dir $not_primary_second SOMETHING remove
+    run_in_background pids objectstore_tool $dir $not_primary_third SOMETHING remove
+    wait_background pids
+    return_code=$?
+    if [ $return_code -ne 0 ]; then return $return_code; fi
+
     #
     # 2) repair the PG
     #
@@ -299,14 +314,26 @@ function TEST_list_missing_erasure_coded() {
     # Put an object and remove the two shards (including primary)
     add_something $dir $poolname OBJ0 || return 1
     local -a osds=($(get_osds $poolname OBJ0))
-    objectstore_tool $dir ${osds[0]} OBJ0 remove || return 1
-    objectstore_tool $dir ${osds[1]} OBJ0 remove || return 1
+
+    pids=""
+    run_in_background pids objectstore_tool $dir ${osds[0]} OBJ0 remove
+    run_in_background pids objectstore_tool $dir ${osds[1]} OBJ0 remove
+    wait_background pids
+    return_code=$?
+    if [ $return_code -ne 0 ]; then return $return_code; fi
+
 
     # Put another object and remove two shards (excluding primary)
     add_something $dir $poolname OBJ1 || return 1
     local -a osds=($(get_osds $poolname OBJ1))
-    objectstore_tool $dir ${osds[1]} OBJ1 remove || return 1
-    objectstore_tool $dir ${osds[2]} OBJ1 remove || return 1
+
+    pids=""
+    run_in_background pids objectstore_tool $dir ${osds[1]} OBJ1 remove
+    run_in_background pids objectstore_tool $dir ${osds[2]} OBJ1 remove
+    wait_background pids
+    return_code=$?
+    if [ $return_code -ne 0 ]; then return $return_code; fi
+
 
     # Get get - both objects should in the same PG
     local pg=$(get_pg $poolname OBJ0)
diff --git a/src/test/perf_local.cc b/src/test/perf_local.cc
index 49440c2..7c33dc5 100644
--- a/src/test/perf_local.cc
+++ b/src/test/perf_local.cc
@@ -1020,6 +1020,7 @@ int main(int argc, char *argv[])
 
   global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, CODE_ENVIRONMENT_UTILITY, 0);
   common_init_finish(g_ceph_context);
+  Cycles::init();
 
   bind_thread_to_cpu(3);
   if (argc == 1) {
diff --git a/src/test/rbd_mirror/image_replay.cc b/src/test/rbd_mirror/image_replay.cc
index 87d4497..88c6ac7 100644
--- a/src/test/rbd_mirror/image_replay.cc
+++ b/src/test/rbd_mirror/image_replay.cc
@@ -19,7 +19,7 @@
 #undef dout_prefix
 #define dout_prefix *_dout << "rbd-mirror-image-replay: "
 
-rbd::mirror::ImageReplayer *replayer = nullptr;
+rbd::mirror::ImageReplayer<> *replayer = nullptr;
 
 void usage() {
   std::cout << "usage: ceph_test_rbd_mirror_image_replay [options...] \\" << std::endl;
@@ -103,8 +103,7 @@ int main(int argc, const char **argv)
 	  << local_pool_name << ", remote_pool_name=" << remote_pool_name
 	  << ", image_name=" << image_name << dendl;
 
-  rbd::mirror::ImageReplayer::BootstrapParams bootstap_params(local_pool_name,
-							      image_name);
+  rbd::mirror::ImageReplayer<>::BootstrapParams bootstap_params(image_name);
   int64_t local_pool_id;
   int64_t remote_pool_id;
   std::string remote_image_id;
@@ -185,9 +184,10 @@ int main(int argc, const char **argv)
 
   threads = new rbd::mirror::Threads(reinterpret_cast<CephContext*>(
     local->cct()));
-  replayer = new rbd::mirror::ImageReplayer(threads, local, remote, client_id,
-					    local_pool_id, remote_pool_id,
-					    remote_image_id);
+  replayer = new rbd::mirror::ImageReplayer<>(threads, local, remote, client_id,
+					      "", local_pool_id, remote_pool_id,
+					      remote_image_id,
+                                              "global image id");
 
   replayer->start(&start_cond, &bootstap_params);
   r = start_cond.wait();
diff --git a/src/test/rbd_mirror/image_sync/test_mock_SyncPointPruneRequest.cc b/src/test/rbd_mirror/image_sync/test_mock_SyncPointPruneRequest.cc
index 4558d6c..e45bf78 100644
--- a/src/test/rbd_mirror/image_sync/test_mock_SyncPointPruneRequest.cc
+++ b/src/test/rbd_mirror/image_sync/test_mock_SyncPointPruneRequest.cc
@@ -109,6 +109,7 @@ TEST_F(TestMockImageSyncSyncPointPruneRequest, SyncCompleteSuccess) {
   librbd::journal::MirrorPeerClientMeta client_meta;
   client_meta.sync_points.emplace_front("snap1", boost::none);
   m_client_meta = client_meta;
+  ASSERT_EQ(librbd::journal::MIRROR_PEER_STATE_SYNCING, m_client_meta.state);
 
   librbd::MockImageCtx mock_remote_image_ctx(*m_remote_image_ctx);
   journal::MockJournaler mock_journaler;
@@ -124,6 +125,7 @@ TEST_F(TestMockImageSyncSyncPointPruneRequest, SyncCompleteSuccess) {
   req->send();
   ASSERT_EQ(0, ctx.wait());
   ASSERT_TRUE(m_client_meta.sync_points.empty());
+  ASSERT_EQ(librbd::journal::MIRROR_PEER_STATE_REPLAYING, m_client_meta.state);
 }
 
 TEST_F(TestMockImageSyncSyncPointPruneRequest, RestartedSyncCompleteSuccess) {
diff --git a/src/test/rbd_mirror/mock/MockJournaler.cc b/src/test/rbd_mirror/mock/MockJournaler.cc
index 43f23e8..047dd2f 100644
--- a/src/test/rbd_mirror/mock/MockJournaler.cc
+++ b/src/test/rbd_mirror/mock/MockJournaler.cc
@@ -5,6 +5,11 @@
 
 namespace journal {
 
+MockReplayEntry *MockReplayEntry::s_instance = nullptr;
 MockJournaler *MockJournaler::s_instance = nullptr;
 
+std::ostream &operator<<(std::ostream &os, const MockJournalerProxy &) {
+  return os;
+}
+
 } // namespace journal
diff --git a/src/test/rbd_mirror/mock/MockJournaler.h b/src/test/rbd_mirror/mock/MockJournaler.h
index 5613eda..5f08c12 100644
--- a/src/test/rbd_mirror/mock/MockJournaler.h
+++ b/src/test/rbd_mirror/mock/MockJournaler.h
@@ -5,10 +5,48 @@
 #define TEST_RBD_MIRROR_MOCK_JOURNALER_H
 
 #include <gmock/gmock.h>
+#include "include/int_types.h"
+#include "include/rados/librados.hpp"
+#include "cls/journal/cls_journal_types.h"
 #include "librbd/Journal.h"
+#include "librbd/journal/TypeTraits.h"
+#include <iosfwd>
+#include <string>
+
+class Context;
+class ContextWQ;
+class Mutex;
+class SafeTimer;
 
 namespace journal {
 
+struct ReplayHandler;
+
+struct MockReplayEntry {
+  static MockReplayEntry *s_instance;
+  static MockReplayEntry &get_instance() {
+    assert(s_instance != nullptr);
+    return *s_instance;
+  }
+
+  MockReplayEntry() {
+    s_instance = this;
+  }
+
+  MOCK_CONST_METHOD0(get_commit_tid, uint64_t());
+  MOCK_METHOD0(get_data, bufferlist());
+};
+
+struct MockReplayEntryProxy {
+  uint64_t get_commit_tid() const {
+    return MockReplayEntry::get_instance().get_commit_tid();
+  }
+
+  bufferlist get_data() {
+    return MockReplayEntry::get_instance().get_data();
+  }
+};
+
 struct MockJournaler {
   static MockJournaler *s_instance;
   static MockJournaler &get_instance() {
@@ -20,9 +58,78 @@ struct MockJournaler {
     s_instance = this;
   }
 
+  MOCK_METHOD1(init, void(Context *));
+  MOCK_METHOD0(shut_down, void());
+  MOCK_CONST_METHOD0(is_initialized, bool());
+
+  MOCK_METHOD4(get_mutable_metadata, void(uint64_t*, uint64_t*,
+                                          std::set<cls::journal::Client> *,
+                                          Context*));
+
+  MOCK_METHOD2(try_pop_front, bool(MockReplayEntryProxy *, uint64_t *));
+  MOCK_METHOD2(start_live_replay, void(ReplayHandler *, double));
+  MOCK_METHOD0(stop_replay, void());
+
+  MOCK_METHOD1(committed, void(const MockReplayEntryProxy &));
+  MOCK_METHOD1(flush_commit_position, void(Context*));
+
   MOCK_METHOD2(update_client, void(const bufferlist&, Context *on_safe));
+
+  MOCK_METHOD3(get_tag, void(uint64_t, cls::journal::Tag *, Context *));
 };
 
+struct MockJournalerProxy {
+  MockJournalerProxy(ContextWQ *work_queue, SafeTimer *timer, Mutex *timer_lock,
+                     librados::IoCtx &header_ioctx, const std::string &journal_id,
+                     const std::string &client_id, double commit_interval) {
+    MockJournaler::get_instance();
+  }
+
+  void init(Context *on_finish) {
+    MockJournaler::get_instance().init(on_finish);
+  }
+  void shut_down() {
+    MockJournaler::get_instance().shut_down();
+  }
+  bool is_initialized() const {
+    return MockJournaler::get_instance().is_initialized();
+  }
+
+  void get_mutable_metadata(uint64_t *min, uint64_t *active,
+                            std::set<cls::journal::Client> *clients,
+                            Context *on_finish) {
+    MockJournaler::get_instance().get_mutable_metadata(min, active, clients,
+                                                       on_finish);
+  }
+
+  bool try_pop_front(MockReplayEntryProxy *entry, uint64_t *tag_tid) {
+    return MockJournaler::get_instance().try_pop_front(entry, tag_tid);
+  }
+  void start_live_replay(ReplayHandler *handler, double interval) {
+    MockJournaler::get_instance().start_live_replay(handler, interval);
+  }
+  void stop_replay() {
+    MockJournaler::get_instance().stop_replay();
+  }
+
+  void committed(const MockReplayEntryProxy &entry) {
+    MockJournaler::get_instance().committed(entry);
+  }
+  void flush_commit_position(Context *on_finish) {
+    MockJournaler::get_instance().flush_commit_position(on_finish);
+  }
+
+  void update_client(const bufferlist& data, Context *on_safe) {
+    MockJournaler::get_instance().update_client(data, on_safe);
+  }
+
+  void get_tag(uint64_t tag_tid, cls::journal::Tag *tag, Context *on_finish) {
+    MockJournaler::get_instance().get_tag(tag_tid, tag, on_finish);
+  }
+};
+
+std::ostream &operator<<(std::ostream &os, const MockJournalerProxy &);
+
 } // namespace journal
 
 namespace librbd {
diff --git a/src/test/rbd_mirror/test_ImageReplayer.cc b/src/test/rbd_mirror/test_ImageReplayer.cc
index b042f57..18f1441 100644
--- a/src/test/rbd_mirror/test_ImageReplayer.cc
+++ b/src/test/rbd_mirror/test_ImageReplayer.cc
@@ -71,7 +71,7 @@ public:
     }
   };
 
-  TestImageReplayer() : m_client_id("TestImageReplayer"), m_watch_handle(0)
+  TestImageReplayer() : m_watch_handle(0)
   {
     EXPECT_EQ("", connect_cluster_pp(m_local_cluster));
 
@@ -111,15 +111,16 @@ public:
     EXPECT_EQ(0, m_local_cluster.pool_delete(m_local_pool_name.c_str()));
   }
 
-  template <typename ImageReplayerT = rbd::mirror::ImageReplayer>
+  template <typename ImageReplayerT = rbd::mirror::ImageReplayer<> >
   void create_replayer() {
     m_replayer = new ImageReplayerT(m_threads,
       rbd::mirror::RadosRef(new librados::Rados(m_local_ioctx)),
       rbd::mirror::RadosRef(new librados::Rados(m_remote_ioctx)),
-      m_client_id, m_local_ioctx.get_id(), m_remote_pool_id, m_remote_image_id);
+      m_local_mirror_uuid, m_remote_mirror_uuid, m_local_ioctx.get_id(),
+      m_remote_pool_id, m_remote_image_id, "global image id");
   }
 
-  void start(rbd::mirror::ImageReplayer::BootstrapParams *bootstap_params =
+  void start(rbd::mirror::ImageReplayer<>::BootstrapParams *bootstap_params =
 	     nullptr)
   {
     C_SaferCond cond;
@@ -150,8 +151,8 @@ public:
   {
     create_replayer<>();
 
-    rbd::mirror::ImageReplayer::BootstrapParams
-      bootstap_params(m_local_pool_name, m_image_name);
+    rbd::mirror::ImageReplayer<>::BootstrapParams
+      bootstap_params(m_image_name);
     start(&bootstap_params);
     wait_for_replay_complete();
     stop();
@@ -198,7 +199,7 @@ public:
 			    cls::journal::ObjectPosition *mirror_position)
   {
     std::string master_client_id = "";
-    std::string mirror_client_id = m_client_id;
+    std::string mirror_client_id = m_local_mirror_uuid;
 
     C_SaferCond cond;
     uint64_t minimum_set;
@@ -256,7 +257,9 @@ public:
 
     for (int i = 0; i < 100; i++) {
       printf("m_replayer->flush()\n");
-      m_replayer->flush();
+      C_SaferCond cond;
+      m_replayer->flush(&cond);
+      ASSERT_EQ(0, cond.wait());
       get_commit_positions(&master_position, &mirror_position);
       if (master_position == mirror_position) {
 	break;
@@ -321,13 +324,14 @@ public:
 
   rbd::mirror::Threads *m_threads = nullptr;
   librados::Rados m_local_cluster, m_remote_cluster;
-  std::string m_client_id;
+  std::string m_local_mirror_uuid = "local mirror uuid";
+  std::string m_remote_mirror_uuid = "remote mirror uuid";
   std::string m_local_pool_name, m_remote_pool_name;
   librados::IoCtx m_local_ioctx, m_remote_ioctx;
   std::string m_image_name;
   int64_t m_remote_pool_id;
   std::string m_remote_image_id;
-  rbd::mirror::ImageReplayer *m_replayer;
+  rbd::mirror::ImageReplayer<> *m_replayer;
   C_WatchCtx *m_watch_ctx;
   uint64_t m_watch_handle;
   char m_test_data[TEST_IO_SIZE + 1];
@@ -340,17 +344,6 @@ TEST_F(TestImageReplayer, Bootstrap)
   bootstrap();
 }
 
-TEST_F(TestImageReplayer, BootstrapErrorInvalidPool)
-{
-  create_replayer<>();
-
-  rbd::mirror::ImageReplayer::BootstrapParams
-    bootstap_params("INVALID_LOCAL_POOL_NAME", m_image_name);
-  C_SaferCond cond;
-  m_replayer->start(&cond, &bootstap_params);
-  ASSERT_EQ(-ENOENT, cond.wait());
-}
-
 TEST_F(TestImageReplayer, BootstrapErrorLocalImageExists)
 {
   int order = 0;
@@ -358,8 +351,8 @@ TEST_F(TestImageReplayer, BootstrapErrorLocalImageExists)
 			      false, 0, &order, 0, 0));
 
   create_replayer<>();
-  rbd::mirror::ImageReplayer::BootstrapParams
-    bootstap_params(m_local_pool_name, m_image_name);
+  rbd::mirror::ImageReplayer<>::BootstrapParams
+    bootstap_params(m_image_name);
   C_SaferCond cond;
   m_replayer->start(&cond, &bootstap_params);
   ASSERT_EQ(-EEXIST, cond.wait());
@@ -376,8 +369,8 @@ TEST_F(TestImageReplayer, BootstrapErrorNoJournal)
   close_image(ictx);
 
   create_replayer<>();
-  rbd::mirror::ImageReplayer::BootstrapParams
-    bootstap_params(m_local_pool_name, m_image_name);
+  rbd::mirror::ImageReplayer<>::BootstrapParams
+    bootstap_params(m_image_name);
   C_SaferCond cond;
   m_replayer->start(&cond, &bootstap_params);
   ASSERT_EQ(-ENOENT, cond.wait());
@@ -386,8 +379,8 @@ TEST_F(TestImageReplayer, BootstrapErrorNoJournal)
 TEST_F(TestImageReplayer, StartInterrupted)
 {
   create_replayer<>();
-  rbd::mirror::ImageReplayer::BootstrapParams
-    bootstap_params(m_local_pool_name, m_image_name);
+  rbd::mirror::ImageReplayer<>::BootstrapParams
+    bootstap_params(m_image_name);
   C_SaferCond start_cond, stop_cond;
   m_replayer->start(&start_cond, &bootstap_params);
   m_replayer->stop(&stop_cond);
@@ -423,8 +416,8 @@ TEST_F(TestImageReplayer, ErrorNoJournal)
   ASSERT_EQ(0, librbd::update_features(ictx, RBD_FEATURE_JOURNALING, false));
   close_image(ictx);
 
-  rbd::mirror::ImageReplayer::BootstrapParams
-    bootstap_params(m_local_pool_name, m_image_name);
+  rbd::mirror::ImageReplayer<>::BootstrapParams
+    bootstap_params(m_image_name);
   C_SaferCond cond;
   m_replayer->start(&cond, &bootstap_params);
   ASSERT_EQ(-ENOENT, cond.wait());
@@ -535,14 +528,19 @@ TEST_F(TestImageReplayer, NextTag)
   stop();
 }
 
-class ImageReplayer : public rbd::mirror::ImageReplayer {
+class ImageReplayer : public rbd::mirror::ImageReplayer<> {
 public:
   ImageReplayer(rbd::mirror::Threads *threads,
 		rbd::mirror::RadosRef local, rbd::mirror::RadosRef remote,
-		const std::string &client_id, int64_t local_pool_id,
-		int64_t remote_pool_id,	const std::string &remote_image_id)
-    : rbd::mirror::ImageReplayer(threads, local, remote, client_id,
-				 local_pool_id, remote_pool_id, remote_image_id)
+		const std::string &local_mirror_uuid,
+                const std::string &remote_mirror_uuid,
+                int64_t local_pool_id,
+		int64_t remote_pool_id,	const std::string &remote_image_id,
+                const std::string &global_image_id)
+    : rbd::mirror::ImageReplayer<>(threads, local, remote, local_mirror_uuid,
+				   remote_mirror_uuid, local_pool_id,
+                                   remote_pool_id, remote_image_id,
+                                   global_image_id)
     {}
 
   void set_error(const std::string &state, int r) {
@@ -555,53 +553,15 @@ public:
   }
 
 protected:
-  virtual void on_start_get_registered_client_status_finish(int r,
-      const std::set<cls::journal::Client> &registered_clients,
-      const BootstrapParams &bootstrap_params) {
-      rbd::mirror::ImageReplayer::on_start_get_registered_client_status_finish(
-	get_error("on_start_get_registered_client_status"), registered_clients,
-	bootstrap_params);
-  }
-
-  virtual void on_start_remote_journaler_init_finish(int r) {
-    ASSERT_EQ(0, r);
-    rbd::mirror::ImageReplayer::on_start_remote_journaler_init_finish(
-      get_error("on_start_remote_journaler_init"));
-  }
-
-  virtual void on_start_local_image_open_finish(int r) {
-    int test_r = get_error("on_start_local_image_open");
-    if (!test_r) {
-      rbd::mirror::ImageReplayer::on_start_local_image_open_finish(r);
-      return;
-    }
-
-    // The image open error was imitated, so we need to close the image back
-    // before propagating the error.
-    ASSERT_EQ(0, r);
-    set_error("on_start_local_image_open", 0);
-    FunctionContext *ctx = new FunctionContext(
-      [this, test_r](int r) {
-	on_start_local_image_open_finish(test_r);
-      });
-    close_local_image(ctx);
-  }
-
-  virtual void on_start_wait_for_local_journal_ready_finish(int r) {
-    ASSERT_EQ(0, r);
-    rbd::mirror::ImageReplayer::on_start_wait_for_local_journal_ready_finish(
-      get_error("on_start_wait_for_local_journal_ready"));
-  }
-
   virtual void on_stop_journal_replay_shut_down_finish(int r) {
     ASSERT_EQ(0, r);
-    rbd::mirror::ImageReplayer::on_stop_journal_replay_shut_down_finish(
+    rbd::mirror::ImageReplayer<>::on_stop_journal_replay_shut_down_finish(
       get_error("on_stop_journal_replay_shut_down"));
   }
 
   virtual void on_stop_local_image_close_finish(int r) {
     ASSERT_EQ(0, r);
-    rbd::mirror::ImageReplayer::on_stop_local_image_close_finish(
+    rbd::mirror::ImageReplayer<>::on_stop_local_image_close_finish(
       get_error("on_stop_local_image_close"));
   }
 
@@ -615,8 +575,8 @@ TEST_F(TestImageReplayer, Error_on_start_##state)			\
   create_replayer<ImageReplayer>();					\
   reinterpret_cast<ImageReplayer *>(m_replayer)->			\
     set_error("on_start_" #state, -1);					\
-  rbd::mirror::ImageReplayer::BootstrapParams				\
-    bootstap_params(m_local_pool_name, m_image_name);			\
+  rbd::mirror::ImageReplayer<>::BootstrapParams				\
+    bootstap_params(m_image_name);			                \
   C_SaferCond cond;							\
   m_replayer->start(&cond, &bootstap_params);				\
   ASSERT_EQ(-1, cond.wait());						\
@@ -628,8 +588,8 @@ TEST_F(TestImageReplayer, Error_on_stop_##state)			\
   create_replayer<ImageReplayer>();					\
   reinterpret_cast<ImageReplayer *>(m_replayer)->			\
     set_error("on_stop_" #state, -1);					\
-  rbd::mirror::ImageReplayer::BootstrapParams				\
-    bootstap_params(m_local_pool_name, m_image_name);			\
+  rbd::mirror::ImageReplayer<>::BootstrapParams				\
+    bootstap_params(m_image_name);			                \
   start(&bootstap_params);						\
   /* TODO: investigate: without wait below I observe: */		\
   /* librbd/journal/Replay.cc: 70: FAILED assert(m_op_events.empty()) */\
@@ -639,10 +599,6 @@ TEST_F(TestImageReplayer, Error_on_stop_##state)			\
   ASSERT_EQ(0, cond.wait());						\
 }
 
-TEST_ON_START_ERROR(get_registered_client_status);
-TEST_ON_START_ERROR(remote_journaler_init);
-TEST_ON_START_ERROR(wait_for_local_journal_ready);
-
 TEST_ON_STOP_ERROR(journal_replay_shut_down);
 TEST_ON_STOP_ERROR(no_error);
 
diff --git a/src/test/rbd_mirror/test_ImageSync.cc b/src/test/rbd_mirror/test_ImageSync.cc
index 10622a1..e9b234e 100644
--- a/src/test/rbd_mirror/test_ImageSync.cc
+++ b/src/test/rbd_mirror/test_ImageSync.cc
@@ -8,6 +8,7 @@
 #include "librbd/ExclusiveLock.h"
 #include "librbd/ImageCtx.h"
 #include "librbd/ImageState.h"
+#include "librbd/journal/Types.h"
 #include "tools/rbd_mirror/ImageSync.h"
 #include "tools/rbd_mirror/Threads.h"
 
diff --git a/src/test/rbd_mirror/test_PoolWatcher.cc b/src/test/rbd_mirror/test_PoolWatcher.cc
index 5cb6805..b8ff311 100644
--- a/src/test/rbd_mirror/test_PoolWatcher.cc
+++ b/src/test/rbd_mirror/test_PoolWatcher.cc
@@ -140,8 +140,14 @@ TestPoolWatcher() : m_lock("TestPoolWatcherLock"),
       librbd::RBD rbd;
       rbd.open(ioctx, image, name.c_str());
       image.mirror_image_enable();
+
+      librbd::mirror_image_info_t mirror_image_info;
+      ASSERT_EQ(0, image.mirror_image_get_info(&mirror_image_info,
+                                               sizeof(mirror_image_info)));
       image.close();
-      m_mirrored_images[ioctx.get_id()].insert(get_image_id(&ioctx, name));
+
+      m_mirrored_images[ioctx.get_id()].insert(PoolWatcher::ImageIds(
+        get_image_id(&ioctx, name), mirror_image_info.global_id));
     }
     if (image_name != nullptr)
       *image_name = name;
@@ -179,8 +185,14 @@ TestPoolWatcher() : m_lock("TestPoolWatcherLock"),
       librbd::RBD rbd;
       rbd.open(cioctx, image, name.c_str());
       image.mirror_image_enable();
+
+      librbd::mirror_image_info_t mirror_image_info;
+      ASSERT_EQ(0, image.mirror_image_get_info(&mirror_image_info,
+                                               sizeof(mirror_image_info)));
       image.close();
-      m_mirrored_images[cioctx.get_id()].insert(get_image_id(&cioctx, name));
+
+      m_mirrored_images[cioctx.get_id()].insert(PoolWatcher::ImageIds(
+        get_image_id(&cioctx, name), mirror_image_info.global_id));
     }
     if (image_name != nullptr)
       *image_name = name;
@@ -198,7 +210,7 @@ TestPoolWatcher() : m_lock("TestPoolWatcherLock"),
   unique_ptr<PoolWatcher> m_pool_watcher;
 
   set<string> m_pools;
-  std::map<int64_t, std::set<std::string> > m_mirrored_images;
+  PoolWatcher::PoolImageIds m_mirrored_images;
 
   uint64_t m_image_number;
   uint64_t m_snap_number;
diff --git a/src/test/rbd_mirror/test_mock_ImageReplayer.cc b/src/test/rbd_mirror/test_mock_ImageReplayer.cc
new file mode 100644
index 0000000..ac4e8bc
--- /dev/null
+++ b/src/test/rbd_mirror/test_mock_ImageReplayer.cc
@@ -0,0 +1,161 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "test/rbd_mirror/test_mock_fixture.h"
+#include "librbd/journal/Replay.h"
+#include "tools/rbd_mirror/ImageReplayer.h"
+#include "tools/rbd_mirror/image_replayer/BootstrapRequest.h"
+#include "tools/rbd_mirror/image_replayer/CloseImageRequest.h"
+#include "tools/rbd_mirror/image_replayer/OpenLocalImageRequest.h"
+#include "test/librbd/mock/MockImageCtx.h"
+#include "test/librbd/mock/MockJournal.h"
+#include "test/rbd_mirror/mock/MockJournaler.h"
+
+namespace librbd {
+
+struct MockImageReplayerJournal;
+
+struct MockImageReplayerImageCtx : public MockImageCtx {
+  MockImageReplayerJournal *journal = nullptr;
+};
+
+struct MockImageReplayerJournal : public MockJournal {
+  MOCK_METHOD1(start_external_replay, int(journal::Replay<MockImageReplayerImageCtx> **));
+  MOCK_METHOD0(stop_external_replay, void());
+};
+
+namespace journal {
+
+template<>
+struct Replay<MockImageReplayerImageCtx> {
+  MOCK_METHOD3(process, void(bufferlist::iterator *, Context *, Context *));
+  MOCK_METHOD1(flush, void(Context*));
+  MOCK_METHOD2(shut_down, void(bool, Context*));
+};
+
+template <>
+struct TypeTraits<MockImageReplayerImageCtx> {
+  typedef ::journal::MockJournalerProxy Journaler;
+  typedef ::journal::MockReplayEntryProxy ReplayEntry;
+};
+
+struct MirrorPeerClientMeta;
+
+} // namespace journal
+} // namespace librbd
+
+namespace rbd {
+namespace mirror {
+namespace image_replayer {
+
+template<>
+struct BootstrapRequest<librbd::MockImageReplayerImageCtx> {
+  static BootstrapRequest* s_instance;
+  Context *on_finish = nullptr;
+  // Returns the registered mock (no allocation) and stores on_finish on it.
+  static BootstrapRequest* create(librados::IoCtx &local_io_ctx,
+                                  librados::IoCtx &remote_io_ctx,
+                                  librbd::MockImageReplayerImageCtx **local_image_ctx,
+                                  const std::string &local_image_name,
+                                  const std::string &remote_image_id,
+                                  const std::string &global_image_id,
+                                  ContextWQ *work_queue, SafeTimer *timer,
+                                  Mutex *timer_lock,
+                                  const std::string &local_mirror_uuid,
+                                  const std::string &remote_mirror_uuid,
+                                  ::journal::MockJournalerProxy *journaler,
+                                  librbd::journal::MirrorPeerClientMeta *client_meta,
+                                  Context *on_finish) {
+    assert(s_instance != nullptr);
+    s_instance->on_finish = on_finish;
+    return s_instance;
+  }
+  // Registers this mock; only one may be live at a time.
+  BootstrapRequest() {
+    assert(s_instance == nullptr);
+    s_instance = this;
+  }
+  ~BootstrapRequest() { s_instance = nullptr; }  // unregister so the next test can construct one
+  MOCK_METHOD0(send, void());
+};
+
+template<>
+struct CloseImageRequest<librbd::MockImageReplayerImageCtx> {
+  static CloseImageRequest* s_instance;
+  Context *on_finish = nullptr;
+  // Returns the registered mock (no allocation) and stores on_finish on it.
+  static CloseImageRequest* create(librbd::MockImageReplayerImageCtx **image_ctx,
+                                   ContextWQ *work_queue, bool destroy_only,
+                                   Context *on_finish) {
+    assert(s_instance != nullptr);
+    s_instance->on_finish = on_finish;
+    return s_instance;
+  }
+  // Registers this mock; only one may be live at a time.
+  CloseImageRequest() {
+    assert(s_instance == nullptr);
+    s_instance = this;
+  }
+  ~CloseImageRequest() { s_instance = nullptr; }  // unregister so the next test can construct one
+  MOCK_METHOD0(send, void());
+};
+
+template<>
+struct OpenLocalImageRequest<librbd::MockImageReplayerImageCtx> {
+  static OpenLocalImageRequest* s_instance;
+  Context *on_finish = nullptr;
+  // Returns the registered mock (no allocation) and stores on_finish on it.
+  static OpenLocalImageRequest* create(librados::IoCtx &local_io_ctx,
+                                       librbd::MockImageReplayerImageCtx **local_image_ctx,
+                                       const std::string &local_image_name,
+                                       const std::string &local_image_id,
+                                       ContextWQ *work_queue,
+                                       Context *on_finish) {
+    assert(s_instance != nullptr);
+    s_instance->on_finish = on_finish;
+    return s_instance;
+  }
+  // Registers this mock; only one may be live at a time.
+  OpenLocalImageRequest() {
+    assert(s_instance == nullptr);
+    s_instance = this;
+  }
+  ~OpenLocalImageRequest() { s_instance = nullptr; }  // unregister so the next test can construct one
+  MOCK_METHOD0(send, void());
+};
+
+BootstrapRequest<librbd::MockImageReplayerImageCtx>* BootstrapRequest<librbd::MockImageReplayerImageCtx>::s_instance = nullptr;
+CloseImageRequest<librbd::MockImageReplayerImageCtx>* CloseImageRequest<librbd::MockImageReplayerImageCtx>::s_instance = nullptr;
+OpenLocalImageRequest<librbd::MockImageReplayerImageCtx>* OpenLocalImageRequest<librbd::MockImageReplayerImageCtx>::s_instance = nullptr;
+
+} // namespace image_replayer
+} // namespace mirror
+} // namespace rbd
+
+// template definitions
+#include "tools/rbd_mirror/ImageReplayer.cc"
+template class rbd::mirror::ImageReplayer<librbd::MockImageReplayerImageCtx>;
+
+namespace rbd {
+namespace mirror {
+
+class TestMockImageReplayer : public TestMockFixture {
+public:
+  typedef ImageReplayer<librbd::MockImageReplayerImageCtx> MockImageReplayer;
+
+  virtual void SetUp() {
+    TestMockFixture::SetUp();
+
+    librbd::RBD rbd;
+    ASSERT_EQ(0, create_image(rbd, m_remote_io_ctx, m_image_name, m_image_size));
+    ASSERT_EQ(0, open_image(m_remote_io_ctx, m_image_name, &m_remote_image_ctx));
+  }
+
+  librbd::ImageCtx *m_remote_image_ctx;
+};
+
+TEST_F(TestMockImageReplayer, Blah) {
+}
+
+} // namespace mirror
+} // namespace rbd
diff --git a/src/test/system/rados_delete_pools_parallel.cc b/src/test/system/rados_delete_pools_parallel.cc
index 5347c53..47d878c 100644
--- a/src/test/system/rados_delete_pools_parallel.cc
+++ b/src/test/system/rados_delete_pools_parallel.cc
@@ -57,7 +57,7 @@ const char *get_id_str()
 int main(int argc, const char **argv)
 {
   const char *num_objects = getenv("NUM_OBJECTS");
-  std::string pool = "foo";
+  const std::string pool = get_temp_pool_name(argv[0]);
   if (num_objects) {
     g_num_objects = atoi(num_objects); 
     if (g_num_objects == 0)
diff --git a/src/test/system/rados_list_parallel.cc b/src/test/system/rados_list_parallel.cc
index fb4540d..ff1cfae 100644
--- a/src/test/system/rados_list_parallel.cc
+++ b/src/test/system/rados_list_parallel.cc
@@ -221,7 +221,7 @@ const char *get_id_str()
 int main(int argc, const char **argv)
 {
   const char *num_objects = getenv("NUM_OBJECTS");
-  std::string pool = "foo." + stringify(getpid());
+  const std::string pool = get_temp_pool_name(argv[0]);
   if (num_objects) {
     g_num_objects = atoi(num_objects); 
     if (g_num_objects == 0)
diff --git a/src/test/system/rados_open_pools_parallel.cc b/src/test/system/rados_open_pools_parallel.cc
index 82c7120..fbaaf2a 100644
--- a/src/test/system/rados_open_pools_parallel.cc
+++ b/src/test/system/rados_open_pools_parallel.cc
@@ -49,9 +49,13 @@ class StRadosOpenPool : public SysTestRunnable
 {
 public:
   StRadosOpenPool(int argc, const char **argv,
-		  CrossProcessSem *pool_setup_sem, CrossProcessSem *open_pool_sem)
+                  CrossProcessSem *pool_setup_sem,
+                  CrossProcessSem *open_pool_sem,
+                  const std::string& pool_name)
     : SysTestRunnable(argc, argv),
-      m_pool_setup_sem(pool_setup_sem), m_open_pool_sem(open_pool_sem)
+      m_pool_setup_sem(pool_setup_sem),
+      m_open_pool_sem(open_pool_sem),
+      m_pool_name(pool_name)
   {
   }
 
@@ -74,10 +78,10 @@ public:
       m_pool_setup_sem->wait();
 
     printf("%s: rados_pool_create.\n", get_id_str());
-    rados_pool_create(cl, "foo");
+    rados_pool_create(cl, m_pool_name.c_str());
     rados_ioctx_t io_ctx;
     printf("%s: rados_ioctx_create.\n", get_id_str());
-    RETURN1_IF_NOT_VAL(0, rados_ioctx_create(cl, "foo", &io_ctx));
+    RETURN1_IF_NOT_VAL(0, rados_ioctx_create(cl, m_pool_name.c_str(), &io_ctx));
     if (m_open_pool_sem)
       m_open_pool_sem->post();
     rados_ioctx_destroy(io_ctx);
@@ -88,6 +92,7 @@ public:
 private:
   CrossProcessSem *m_pool_setup_sem;
   CrossProcessSem *m_open_pool_sem;
+  std::string m_pool_name;
 };
 
 const char *get_id_str()
@@ -97,13 +102,14 @@ const char *get_id_str()
 
 int main(int argc, const char **argv)
 {
+  const std::string pool = get_temp_pool_name(argv[0]);
   // first test: create a pool, shut down the client, access that 
   // pool in a different process.
   CrossProcessSem *pool_setup_sem = NULL;
   RETURN1_IF_NONZERO(CrossProcessSem::create(0, &pool_setup_sem));
   StRadosCreatePool r1(argc, argv, NULL, pool_setup_sem, NULL,
-					   "foo", 50, ".obj");
-  StRadosOpenPool r2(argc, argv, pool_setup_sem, NULL);
+					   pool, 50, ".obj");
+  StRadosOpenPool r2(argc, argv, pool_setup_sem, NULL, pool);
   vector < SysTestRunnable* > vec;
   vec.push_back(&r1);
   vec.push_back(&r2);
@@ -120,8 +126,8 @@ int main(int argc, const char **argv)
   CrossProcessSem *open_pool_sem2 = NULL;
   RETURN1_IF_NONZERO(CrossProcessSem::create(0, &open_pool_sem2));
   StRadosCreatePool r3(argc, argv, NULL, pool_setup_sem2, open_pool_sem2,
-					   "foo", 50, ".obj");
-  StRadosOpenPool r4(argc, argv, pool_setup_sem2, open_pool_sem2);
+					   pool, 50, ".obj");
+  StRadosOpenPool r4(argc, argv, pool_setup_sem2, open_pool_sem2, pool);
   vector < SysTestRunnable* > vec2;
   vec2.push_back(&r3);
   vec2.push_back(&r4);
diff --git a/src/test/system/st_rados_create_pool.cc b/src/test/system/st_rados_create_pool.cc
index 78083c7..aeec7ae 100644
--- a/src/test/system/st_rados_create_pool.cc
+++ b/src/test/system/st_rados_create_pool.cc
@@ -22,6 +22,7 @@
 #include <stdarg.h>
 #include <stdio.h>
 #include <stdlib.h>
+#include <unistd.h>
 #include <sstream>
 #include <string>
 
@@ -93,7 +94,8 @@ run()
     std::string buf(get_random_buf(256));
     int ret = rados_write(io_ctx, oid, buf.c_str(), buf.size(), 0);
     if (ret != 0) {
-      printf("%s: rados_write error %d\n", get_id_str(), ret);
+      printf("%s: rados_write(%s) failed with error: %d\n",
+	     get_id_str(), oid, ret);
       ret_val = ret;
       goto out;
     }
@@ -112,3 +114,18 @@ out:
   rados_shutdown(cl);
   return ret_val;
 }
+
+// Build a pool name of the form "<prefix>.<hostname>-<pid>"; prefix must be non-NULL.
+std::string get_temp_pool_name(const char* prefix)
+{
+  assert(prefix);
+  // zero-fill + spare byte: gethostname() need not NUL-terminate on truncation
+  char hostname[80] = {0};
+  int ret = gethostname(hostname, sizeof(hostname) - 1);
+  assert(!ret);
+  char poolname[256];
+  ret = snprintf(poolname, sizeof(poolname),
+                 "%s.%s-%d", prefix, hostname, getpid());
+  assert(ret > 0 && (unsigned int)ret < sizeof(poolname));
+  return poolname;
+}
diff --git a/src/test/system/st_rados_create_pool.h b/src/test/system/st_rados_create_pool.h
index f0f8a3b..5554f3e 100644
--- a/src/test/system/st_rados_create_pool.h
+++ b/src/test/system/st_rados_create_pool.h
@@ -48,4 +48,6 @@ private:
   std::string m_suffix;
 };
 
+std::string get_temp_pool_name(const char* prefix);
+
 #endif
diff --git a/src/test/system/st_rados_list_objects.cc b/src/test/system/st_rados_list_objects.cc
index c53ab17..514dafe 100644
--- a/src/test/system/st_rados_list_objects.cc
+++ b/src/test/system/st_rados_list_objects.cc
@@ -36,6 +36,7 @@ StRadosListObjects(int argc, const char **argv,
 		   CrossProcessSem *midway_sem_wait,
 		   CrossProcessSem *midway_sem_post)
   : SysTestRunnable(argc, argv),
+    m_pool_name(pool_name),
     m_accept_list_errors(accept_list_errors),
     m_midway_cnt(midway_cnt),
     m_pool_setup_sem(pool_setup_sem),
@@ -63,8 +64,8 @@ run()
   m_pool_setup_sem->post();
 
   rados_ioctx_t io_ctx;
-  rados_pool_create(cl, "foo");
-  RETURN1_IF_NONZERO(rados_ioctx_create(cl, "foo", &io_ctx));
+  rados_pool_create(cl, m_pool_name.c_str());
+  RETURN1_IF_NONZERO(rados_ioctx_create(cl, m_pool_name.c_str(), &io_ctx));
 
   int saw = 0;
   const char *obj_name;
diff --git a/src/test/system/systest_runnable.h b/src/test/system/systest_runnable.h
index bd7d258..c19441c 100644
--- a/src/test/system/systest_runnable.h
+++ b/src/test/system/systest_runnable.h
@@ -36,7 +36,7 @@
   RETURN1_IF_NOT_VAL(0, expr)
 
 extern void* systest_runnable_pthread_helper(void *arg);
-
+std::string get_temp_pool_name(const char* prefix);
 /* Represents a single test thread / process.
  *
  * Inherit from this class and implement the test body in run().
diff --git a/src/test/test_pool_create.sh b/src/test/test_pool_create.sh
index 657b6dd..e8ba1c9 100755
--- a/src/test/test_pool_create.sh
+++ b/src/test/test_pool_create.sh
@@ -11,7 +11,7 @@ function run() {
     local dir=$1
     shift
 
-    export CEPH_MON="127.0.0.1:17108"
+    export CEPH_MON="127.0.0.1:17109"
     export CEPH_ARGS
     CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
     CEPH_ARGS+="--mon-host=$CEPH_MON "
diff --git a/src/tools/cephfs/DataScan.cc b/src/tools/cephfs/DataScan.cc
index d2d22be..bb0584a 100644
--- a/src/tools/cephfs/DataScan.cc
+++ b/src/tools/cephfs/DataScan.cc
@@ -334,9 +334,8 @@ int MetadataDriver::inject_unlinked_inode(
   inode.inode.version = 1;
   inode.inode.xattr_version = 1;
   inode.inode.mode = 0500 | mode;
-  // Fake size to 1, so that the directory doesn't appear to be empty
-  // (we won't actually give the *correct* size here though)
-  inode.inode.size = 1;
+  // Fake dirstat.nfiles to 1, so that the directory doesn't appear to be empty
+  // (we won't actually give the *correct* dirstat here though)
   inode.inode.dirstat.nfiles = 1;
 
   inode.inode.ctime = 
@@ -994,17 +993,18 @@ int DataScan::scan_frags()
     if (r == -EINVAL) {
       derr << "Corrupt fnode on " << oid << dendl;
       if (force_corrupt) {
-        fnode.fragstat.mtime = 0;
-        fnode.fragstat.nfiles = 1;
-        fnode.fragstat.nsubdirs = 0;
+	fnode.fragstat.mtime = 0;
+	fnode.fragstat.nfiles = 1;
+	fnode.fragstat.nsubdirs = 0;
+	fnode.accounted_fragstat = fnode.fragstat;
       } else {
         return r;
       }
     }
 
     InodeStore dentry;
-    build_dir_dentry(obj_name_ino, fnode.fragstat.nfiles,
-        fnode.fragstat.mtime, loaded_layout, &dentry);
+    build_dir_dentry(obj_name_ino, fnode.accounted_fragstat,
+		loaded_layout, &dentry);
 
     // Inject inode to the metadata pool
     if (have_backtrace) {
@@ -1148,7 +1148,9 @@ int MetadataDriver::inject_lost_and_found(
     file_layout_t inherit_layout;
 
     // Construct LF inode
-    build_dir_dentry(CEPH_INO_LOST_AND_FOUND, 1, 0, inherit_layout, &lf_ino);
+    frag_info_t fragstat;
+    fragstat.nfiles = 1;  // statement terminator; the original ',' chained into the call below
+    build_dir_dentry(CEPH_INO_LOST_AND_FOUND, fragstat, inherit_layout, &lf_ino);
 
     // Inject link to LF inode in the root dir
     r = inject_linkage(CEPH_INO_ROOT, "lost+found", frag_t(), lf_ino);
@@ -1421,7 +1423,6 @@ int MetadataDriver::inject_with_backtrace(
         // accurate, but it should avoid functional issues.
 
         ancestor_dentry.inode.dirstat.nfiles = 1;
-        ancestor_dentry.inode.size = 1;
 
         ancestor_dentry.inode.nlink = 1;
         ancestor_dentry.inode.ino = ino;
@@ -1478,6 +1479,9 @@ int MetadataDriver::find_or_create_dirfrag(
     bufferlist fnode_bl;
     fnode_t blank_fnode;
     blank_fnode.version = 1;
+    // mark it as non-empty
+    blank_fnode.fragstat.nfiles = 1;
+    blank_fnode.accounted_fragstat = blank_fnode.fragstat;
     blank_fnode.damage_flags |= (DAMAGE_STATS | DAMAGE_RSTATS);
     blank_fnode.encode(fnode_bl);
 
@@ -1748,18 +1752,16 @@ void MetadataTool::build_file_dentry(
 }
 
 void MetadataTool::build_dir_dentry(
-    inodeno_t ino, uint64_t nfiles,
-    time_t mtime, const file_layout_t &layout, InodeStore *out)
+    inodeno_t ino, const frag_info_t &fragstat,
+    const file_layout_t &layout, InodeStore *out)
 {
   assert(out != NULL);
 
   out->inode.mode = 0755 | S_IFDIR;
-  out->inode.size = nfiles;
-  out->inode.dirstat.nfiles = nfiles;
-  out->inode.max_size_ever = nfiles;
-  out->inode.mtime.tv.tv_sec = mtime;
-  out->inode.atime.tv.tv_sec = mtime;
-  out->inode.ctime.tv.tv_sec = mtime;
+  out->inode.dirstat = fragstat;
+  out->inode.mtime.tv.tv_sec = fragstat.mtime;
+  out->inode.atime.tv.tv_sec = fragstat.mtime;
+  out->inode.ctime.tv.tv_sec = fragstat.mtime;
 
   out->inode.layout = layout;
 
diff --git a/src/tools/cephfs/DataScan.h b/src/tools/cephfs/DataScan.h
index 107ab74..3a6b25c 100644
--- a/src/tools/cephfs/DataScan.h
+++ b/src/tools/cephfs/DataScan.h
@@ -155,8 +155,8 @@ class MetadataTool
    * Construct a synthetic InodeStore for a directory
    */
   void build_dir_dentry(
-    inodeno_t ino, uint64_t nfiles,
-    time_t mtime,
+    inodeno_t ino,
+    const frag_info_t &fragstat,
     const file_layout_t &layout,
     InodeStore *out);
 
diff --git a/src/tools/cephfs/Dumper.h b/src/tools/cephfs/Dumper.h
index 97b56a7..f95a062 100644
--- a/src/tools/cephfs/Dumper.h
+++ b/src/tools/cephfs/Dumper.h
@@ -36,8 +36,6 @@ public:
   Dumper() : ino(-1)
   {}
 
-  void handle_mds_map(MFSMap* m);
-
   int init(mds_role_t role_);
   int recover_journal(Journaler *journaler);
   int dump(const char *dumpfile);
diff --git a/src/tools/cephfs/MDSUtility.cc b/src/tools/cephfs/MDSUtility.cc
index b6b9eab..eb61915 100644
--- a/src/tools/cephfs/MDSUtility.cc
+++ b/src/tools/cephfs/MDSUtility.cc
@@ -131,7 +131,7 @@ bool MDSUtility::ms_dispatch(Message *m)
    Mutex::Locker locker(lock);
    switch (m->get_type()) {
    case CEPH_MSG_FS_MAP:
-     handle_mds_map((MFSMap*)m);
+     handle_fs_map((MFSMap*)m);
      break;
    case CEPH_MSG_OSD_MAP:
      break;
@@ -142,9 +142,9 @@ bool MDSUtility::ms_dispatch(Message *m)
 }
 
 
-void MDSUtility::handle_mds_map(MFSMap* m)
+void MDSUtility::handle_fs_map(MFSMap* m)
 {
-  fsmap->decode(m->get_encoded());
+  *fsmap = m->get_fsmap();
   if (waiting_for_mds_map) {
     waiting_for_mds_map->complete(0);
     waiting_for_mds_map = NULL;
diff --git a/src/tools/cephfs/MDSUtility.h b/src/tools/cephfs/MDSUtility.h
index 4d233c9..0f7f80a 100644
--- a/src/tools/cephfs/MDSUtility.h
+++ b/src/tools/cephfs/MDSUtility.h
@@ -46,7 +46,7 @@ public:
   MDSUtility();
   ~MDSUtility();
 
-  void handle_mds_map(MFSMap* m);
+  void handle_fs_map(MFSMap* m);
   bool ms_dispatch(Message *m);
   bool ms_handle_reset(Connection *con) { return false; }
   void ms_handle_remote_reset(Connection *con) {}
diff --git a/src/tools/crushtool.cc b/src/tools/crushtool.cc
index 5d2d542..7e04389 100644
--- a/src/tools/crushtool.cc
+++ b/src/tools/crushtool.cc
@@ -583,7 +583,12 @@ int main(int argc, const char **argv)
       }
     }
     bufferlist::iterator p = bl.begin();
-    crush.decode(p);
+    try {
+      crush.decode(p);
+    } catch(...) {
+      cerr << me << ": unable to decode " << infn << std::endl;
+      exit(EXIT_FAILURE);
+    }
   }
 
   if (compile) {
diff --git a/src/tools/rbd/Shell.cc b/src/tools/rbd/Shell.cc
index 76aa8ed..5fba993 100644
--- a/src/tools/rbd/Shell.cc
+++ b/src/tools/rbd/Shell.cc
@@ -78,11 +78,10 @@ std::set<std::string>& Shell::get_switch_arguments() {
   return switch_arguments;
 }
 
-int Shell::execute(int arg_count, const char **arg_values) {
-
-  std::vector<std::string> arguments;
-  prune_command_line_arguments(arg_count, arg_values, &arguments);
+int Shell::execute(const Arguments& cmdline_arguments) {
 
+  std::vector<std::string> arguments(cmdline_arguments.begin(),
+                                     cmdline_arguments.end());
   std::vector<std::string> command_spec;
   get_command_spec(arguments, &command_spec);
 
@@ -255,32 +254,6 @@ void Shell::get_global_options(po::options_description *opts) {
     ("keyring,k", po::value<std::string>(), "path to keyring");
 }
 
-void Shell::prune_command_line_arguments(int arg_count, const char **arg_values,
-                                         std::vector<std::string> *args) {
-
-  std::vector<std::string> config_keys;
-  g_conf->get_all_keys(&config_keys);
-  std::set<std::string> config_key_set(config_keys.begin(), config_keys.end());
-
-  args->reserve(arg_count);
-  for (int i = 1; i < arg_count; ++i) {
-    std::string arg(arg_values[i]);
-    if (arg.size() > 2 && arg.substr(0, 2) == "--") {
-      std::string option_name(arg.substr(2));
-      std::string alt_option_name(option_name);
-      std::replace(alt_option_name.begin(), alt_option_name.end(), '-', '_');
-      if (config_key_set.count(option_name) ||
-          config_key_set.count(alt_option_name)) {
-        // Ceph config override -- skip since it's handled by CephContext
-        ++i;
-        continue;
-      }
-    }
-
-    args->push_back(arg);
-  }
-}
-
 void Shell::print_help() {
   std::cout << "usage: " << APP_NAME << " <command> ..."
             << std::endl << std::endl
diff --git a/src/tools/rbd/Shell.h b/src/tools/rbd/Shell.h
index b65483e..6792130 100644
--- a/src/tools/rbd/Shell.h
+++ b/src/tools/rbd/Shell.h
@@ -14,6 +14,7 @@ namespace rbd {
 
 class Shell {
 public:
+  typedef std::vector<const char *> Arguments;
   typedef std::vector<std::string> CommandSpec;
 
   struct Action {
@@ -47,7 +48,7 @@ public:
     }
   };
 
-  int execute(int arg_count, const char **arg_values);
+  int execute(const Arguments &argument);
 
 private:
   static std::vector<Action *>& get_actions();
@@ -59,8 +60,6 @@ private:
                       CommandSpec **matching_spec);
 
   void get_global_options(boost::program_options::options_description *opts);
-  void prune_command_line_arguments(int arg_count, const char **arg_values,
-                                    std::vector<std::string> *args);
 
   void print_help();
   void print_action_help(Action *action);
diff --git a/src/tools/rbd/action/MirrorImage.cc b/src/tools/rbd/action/MirrorImage.cc
index d3a322c..ecb838f 100644
--- a/src/tools/rbd/action/MirrorImage.cc
+++ b/src/tools/rbd/action/MirrorImage.cc
@@ -121,7 +121,7 @@ int execute_promote(const po::variables_map &vm) {
     return r;
   }
 
-  std::cout << "" << std::endl;
+  std::cout << "Image promoted to primary" << std::endl;
   return 0;
 }
 
@@ -148,11 +148,11 @@ int execute_demote(const po::variables_map &vm) {
 
   r = image.mirror_image_demote();
   if (r < 0) {
-    std::cerr << "rbd: error demoting image to secondary" << std::endl;
+    std::cerr << "rbd: error demoting image to non-primary" << std::endl;
     return r;
   }
 
-  std::cout << "Image demoted to secondary" << std::endl;
+  std::cout << "Image demoted to non-primary" << std::endl;
   return 0;
 }
 
@@ -201,7 +201,7 @@ Shell::Action action_promote(
   &get_arguments_promote, &execute_promote);
 Shell::Action action_demote(
   {"mirror", "image", "demote"}, {},
-  "Demote an image to secondary for RBD mirroring.", "",
+  "Demote an image to non-primary for RBD mirroring.", "",
   &get_arguments, &execute_demote);
 Shell::Action action_resync(
   {"mirror", "image", "resync"}, {},
diff --git a/src/tools/rbd/rbd.cc b/src/tools/rbd/rbd.cc
index a83db24..bfa18d3 100644
--- a/src/tools/rbd/rbd.cc
+++ b/src/tools/rbd/rbd.cc
@@ -16,5 +16,5 @@ int main(int argc, const char **argv)
   global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, CODE_ENVIRONMENT_UTILITY, 0);
 
   rbd::Shell shell;
-  return shell.execute(argc, argv);
+  return shell.execute(args);
 }
diff --git a/src/tools/rbd_mirror/ImageReplayer.cc b/src/tools/rbd_mirror/ImageReplayer.cc
index 25afe93..df297b0 100644
--- a/src/tools/rbd_mirror/ImageReplayer.cc
+++ b/src/tools/rbd_mirror/ImageReplayer.cc
@@ -9,7 +9,6 @@
 #include "common/Timer.h"
 #include "common/WorkQueue.h"
 #include "journal/Journaler.h"
-#include "journal/ReplayEntry.h"
 #include "journal/ReplayHandler.h"
 #include "librbd/ExclusiveLock.h"
 #include "librbd/ImageCtx.h"
@@ -17,7 +16,6 @@
 #include "librbd/Journal.h"
 #include "librbd/Operations.h"
 #include "librbd/Utils.h"
-#include "librbd/internal.h"
 #include "librbd/journal/Replay.h"
 #include "ImageReplayer.h"
 #include "ImageSync.h"
@@ -41,11 +39,16 @@ namespace mirror {
 using librbd::util::create_context_callback;
 using namespace rbd::mirror::image_replayer;
 
+template <typename I>
+std::ostream &operator<<(std::ostream &os,
+                         const typename ImageReplayer<I>::State &state);
+
 namespace {
 
+template <typename I>
 struct ReplayHandler : public ::journal::ReplayHandler {
-  ImageReplayer *replayer;
-  ReplayHandler(ImageReplayer *replayer) : replayer(replayer) {}
+  ImageReplayer<I> *replayer;
+  ReplayHandler(ImageReplayer<I> *replayer) : replayer(replayer) {}
 
   virtual void get() {}
   virtual void put() {}
@@ -58,50 +61,35 @@ struct ReplayHandler : public ::journal::ReplayHandler {
   }
 };
 
-struct C_ReplayCommitted : public Context {
-  ImageReplayer *replayer;
-  ::journal::ReplayEntry replay_entry;
-
-  C_ReplayCommitted(ImageReplayer *replayer, ::journal::ReplayEntry &&replay_entry) :
-    replayer(replayer), replay_entry(std::move(replay_entry)) {
-  }
-  virtual void finish(int r) {
-    replayer->handle_replay_committed(&replay_entry, r);
-  }
-};
-
 class ImageReplayerAdminSocketCommand {
 public:
   virtual ~ImageReplayerAdminSocketCommand() {}
   virtual bool call(Formatter *f, stringstream *ss) = 0;
 };
 
+template <typename I>
 class StatusCommand : public ImageReplayerAdminSocketCommand {
 public:
-  explicit StatusCommand(ImageReplayer *replayer) : replayer(replayer) {}
+  explicit StatusCommand(ImageReplayer<I> *replayer) : replayer(replayer) {}
 
   bool call(Formatter *f, stringstream *ss) {
-    if (f) {
-      f->open_object_section("status");
-      f->dump_stream("state") << replayer->get_state();
-      f->close_section();
-      f->flush(*ss);
-    } else {
-      *ss << "state: " << replayer->get_state();
-    }
+    replayer->print_status(f, ss);
     return true;
   }
 
 private:
-  ImageReplayer *replayer;
+  ImageReplayer<I> *replayer;
 };
 
+template <typename I>
 class FlushCommand : public ImageReplayerAdminSocketCommand {
 public:
-  explicit FlushCommand(ImageReplayer *replayer) : replayer(replayer) {}
+  explicit FlushCommand(ImageReplayer<I> *replayer) : replayer(replayer) {}
 
   bool call(Formatter *f, stringstream *ss) {
-    int r = replayer->flush();
+    C_SaferCond cond;
+    replayer->flush(&cond);
+    int r = cond.wait();
     if (r < 0) {
       *ss << "flush: " << cpp_strerror(r);
       return false;
@@ -110,15 +98,14 @@ public:
   }
 
 private:
-  ImageReplayer *replayer;
+  ImageReplayer<I> *replayer;
 };
 
-} // anonymous namespace
-
+template <typename I>
 class ImageReplayerAdminSocketHook : public AdminSocketHook {
 public:
   ImageReplayerAdminSocketHook(CephContext *cct, const std::string &name,
-			       ImageReplayer *replayer) :
+			       ImageReplayer<I> *replayer) :
     admin_socket(cct->get_admin_socket()) {
     std::string command;
     int r;
@@ -127,14 +114,14 @@ public:
     r = admin_socket->register_command(command, command, this,
 				       "get status for rbd mirror " + name);
     if (r == 0) {
-      commands[command] = new StatusCommand(replayer);
+      commands[command] = new StatusCommand<I>(replayer);
     }
 
     command = "rbd mirror flush " + name;
     r = admin_socket->register_command(command, command, this,
 				       "flush rbd mirror " + name);
     if (r == 0) {
-      commands[command] = new FlushCommand(replayer);
+      commands[command] = new FlushCommand<I>(replayer);
     }
   }
 
@@ -165,18 +152,26 @@ private:
   Commands commands;
 };
 
-ImageReplayer::ImageReplayer(Threads *threads, RadosRef local, RadosRef remote,
-			     const std::string &client_id,
+} // anonymous namespace
+
+template <typename I>
+ImageReplayer<I>::ImageReplayer(Threads *threads, RadosRef local, RadosRef remote,
+			     const std::string &local_mirror_uuid,
+			     const std::string &remote_mirror_uuid,
 			     int64_t local_pool_id,
 			     int64_t remote_pool_id,
-			     const std::string &remote_image_id) :
+			     const std::string &remote_image_id,
+                             const std::string &global_image_id) :
   m_threads(threads),
   m_local(local),
   m_remote(remote),
-  m_client_id(client_id),
+  m_local_mirror_uuid(local_mirror_uuid),
+  m_remote_mirror_uuid(remote_mirror_uuid),
   m_remote_pool_id(remote_pool_id),
   m_local_pool_id(local_pool_id),
   m_remote_image_id(remote_image_id),
+  m_global_image_id(global_image_id),
+  m_name(stringify(remote_pool_id) + "/" + remote_image_id),
   m_lock("rbd::mirror::ImageReplayer " + stringify(remote_pool_id) + " " +
 	 remote_image_id),
   m_state(STATE_UNINITIALIZED),
@@ -184,35 +179,36 @@ ImageReplayer::ImageReplayer(Threads *threads, RadosRef local, RadosRef remote,
   m_local_replay(nullptr),
   m_remote_journaler(nullptr),
   m_replay_handler(nullptr),
-  m_on_finish(nullptr),
   m_asok_hook(nullptr)
 {
 }
 
-ImageReplayer::~ImageReplayer()
+template <typename I>
+ImageReplayer<I>::~ImageReplayer()
 {
   assert(m_local_image_ctx == nullptr);
   assert(m_local_replay == nullptr);
   assert(m_remote_journaler == nullptr);
   assert(m_replay_handler == nullptr);
-
+  assert(m_on_start_finish == nullptr);
+  assert(m_on_stop_finish == nullptr);
   delete m_asok_hook;
 }
 
-void ImageReplayer::start(Context *on_finish,
-			  const BootstrapParams *bootstrap_params)
+template <typename I>
+void ImageReplayer<I>::start(Context *on_finish,
+			     const BootstrapParams *bootstrap_params)
 {
-  dout(20) << "on_finish=" << on_finish << ", m_on_finish=" << m_on_finish
-	   << dendl;
+  assert(m_on_start_finish == nullptr);
+  assert(m_on_stop_finish == nullptr);
+  dout(20) << "on_finish=" << on_finish << dendl;
 
   {
     Mutex::Locker locker(m_lock);
     assert(is_stopped_());
 
     m_state = STATE_STARTING;
-
-    assert(m_on_finish == nullptr);
-    m_on_finish = on_finish;
+    m_on_start_finish = on_finish;
   }
 
   int r = m_remote->ioctx_create2(m_remote_pool_id, m_remote_ioctx);
@@ -224,14 +220,7 @@ void ImageReplayer::start(Context *on_finish,
   }
 
   if (bootstrap_params != nullptr && !bootstrap_params->empty()) {
-    r = m_local->pool_lookup(bootstrap_params->local_pool_name.c_str());
-    if (r < 0) {
-      derr << "error finding local pool " << bootstrap_params->local_pool_name
-           << ": " << cpp_strerror(r) << dendl;
-      on_start_fail_start(r);
-      return;
-    }
-    m_local_pool_id = r;
+    m_local_image_name = bootstrap_params->local_image_name;
   }
 
   r = m_local->ioctx_create2(m_local_pool_id, m_local_ioctx);
@@ -243,242 +232,89 @@ void ImageReplayer::start(Context *on_finish,
   }
 
   CephContext *cct = static_cast<CephContext *>(m_local->cct());
-
   double commit_interval = cct->_conf->rbd_journal_commit_age;
-  m_remote_journaler = new ::journal::Journaler(m_threads->work_queue,
-						m_threads->timer,
-						&m_threads->timer_lock,
-						m_remote_ioctx,
-						m_remote_image_id, m_client_id,
-						commit_interval);
-
-  on_start_get_registered_client_status_start(bootstrap_params);
-}
-
-void ImageReplayer::on_start_get_registered_client_status_start(
-  const BootstrapParams *bootstrap_params)
-{
-  dout(20) << "enter" << dendl;
-
-  struct Metadata {
-    uint64_t minimum_set;
-    uint64_t active_set;
-    std::set<cls::journal::Client> registered_clients;
-    BootstrapParams bootstrap_params;
-  } *m = new Metadata();
-
-  if (bootstrap_params) {
-    m->bootstrap_params = *bootstrap_params;
-  }
+  m_remote_journaler = new Journaler(m_threads->work_queue,
+                                     m_threads->timer,
+				     &m_threads->timer_lock, m_remote_ioctx,
+				     m_remote_image_id, m_local_mirror_uuid,
+                                     commit_interval);
 
-  FunctionContext *ctx = new FunctionContext(
-    [this, m, bootstrap_params](int r) {
-      on_start_get_registered_client_status_finish(r, m->registered_clients,
-						   m->bootstrap_params);
-      delete m;
-    });
-
-  m_remote_journaler->get_mutable_metadata(&m->minimum_set, &m->active_set,
-					   &m->registered_clients, ctx);
+  bootstrap();
 }
 
-void ImageReplayer::on_start_get_registered_client_status_finish(int r,
-  const std::set<cls::journal::Client> &registered_clients,
-  const BootstrapParams &bootstrap_params)
-{
-  dout(20) << "r=" << r << dendl;
-
-  if (r < 0) {
-    derr << "error obtaining registered client status: "
-	 << cpp_strerror(r) << dendl;
-    on_start_fail_start(r);
-    return;
-  }
-  if (on_start_interrupted()) {
-    return;
-  }
-
-  for (auto c : registered_clients) {
-    if (c.id == m_client_id) {
-      librbd::journal::ClientData client_data;
-      bufferlist::iterator bl = c.data.begin();
-      try {
-	::decode(client_data, bl);
-      } catch (const buffer::error &err) {
-	derr << "failed to decode client meta data: " << err.what() << dendl;
-	on_start_fail_start(-EINVAL);
-	return;
-      }
-
-      // TODO: unsafe cast
-      m_client_meta =
-	boost::get<librbd::journal::MirrorPeerClientMeta>(client_data.client_meta);
-      m_local_image_id = m_client_meta.image_id;
-
-      dout(20) << "client found, pool_id=" << m_local_pool_id << ", image_id="
-	       << m_local_image_id << dendl;
-
-      if (!bootstrap_params.empty()) {
-	dout(0) << "ignoring bootsrap params: client already registered" << dendl;
-      }
-
-      on_start_remote_journaler_init_start();
-      return;
-    }
-  }
-
-  dout(20) << "client not found" << dendl;
-  bootstrap(bootstrap_params);
-}
-
-void ImageReplayer::bootstrap(const BootstrapParams &bootstrap_params) {
-  int r;
-  BootstrapParams params;
-
-  if (!bootstrap_params.empty()) {
-    dout(20) << "using external bootstrap params" << dendl;
-    params = bootstrap_params;
-  } else {
-    r = get_bootstrap_params(&params);
-    if (r < 0) {
-      derr << "error obtaining bootstrap parameters: "
-	   << cpp_strerror(r) << dendl;
-      on_start_fail_start(r);
-      return;
-    }
-  }
-
+template <typename I>
+void ImageReplayer<I>::bootstrap() {
   dout(20) << "bootstrap params: "
-           << "local_pool_name=" << params.local_pool_name << ", "
-	   << "local_image_name=" << params.local_image_name << dendl;
+	   << "local_image_name=" << m_local_image_name << dendl;
 
   // TODO: add a new bootstrap state and support canceling
   Context *ctx = create_context_callback<
-    ImageReplayer, &ImageReplayer::handle_bootstrap>(this);
-  BootstrapRequest<> *request = BootstrapRequest<>::create(
+    ImageReplayer, &ImageReplayer<I>::handle_bootstrap>(this);
+  BootstrapRequest<I> *request = BootstrapRequest<I>::create(
     m_local_ioctx, m_remote_ioctx, &m_local_image_ctx,
-    params.local_image_name, m_remote_image_id, m_threads->work_queue,
-    m_threads->timer, &m_threads->timer_lock, m_client_id, m_remote_journaler,
+    m_local_image_name, m_remote_image_id, m_global_image_id,
+    m_threads->work_queue, m_threads->timer, &m_threads->timer_lock,
+    m_local_mirror_uuid, m_remote_mirror_uuid, m_remote_journaler,
     &m_client_meta, ctx);
   request->send();
 }
 
-void ImageReplayer::handle_bootstrap(int r) {
+template <typename I>
+void ImageReplayer<I>::handle_bootstrap(int r) {
   dout(20) << "r=" << r << dendl;
 
-  if (r < 0) {
-    on_start_fail_start(r);
+  if (r == -EREMOTEIO) {
+    dout(5) << "remote image is non-primary or local image is primary" << dendl;
+    on_start_fail_start(0);
     return;
-  }
-  if (on_start_interrupted()) {
+  } else if (r < 0) {
+    on_start_fail_start(r);
     return;
-  }
-
-  on_start_remote_journaler_init_start();
-}
-
-void ImageReplayer::on_start_remote_journaler_init_start()
-{
-  if (on_start_interrupted()) {
+  } else if (on_start_interrupted()) {
     return;
   }
 
-  dout(20) << "enter" << dendl;
-
-  FunctionContext *ctx = new FunctionContext(
-    [this](int r) {
-      on_start_remote_journaler_init_finish(r);
-    });
-
-  m_remote_journaler->init(ctx);
-}
-
-void ImageReplayer::on_start_remote_journaler_init_finish(int r)
-{
-  dout(20) << "r=" << r << dendl;
+  {
+    Mutex::Locker locker(m_lock);
+    m_name = m_local_ioctx.get_pool_name() + "/" + m_local_image_ctx->name;
 
-  if (r < 0) {
-    derr << "error initializing journal: " << cpp_strerror(r) << dendl;
-    on_start_fail_start(r);
-    return;
-  }
-  if (on_start_interrupted()) {
-    return;
+    CephContext *cct = static_cast<CephContext *>(m_local->cct());
+    delete m_asok_hook;
+    m_asok_hook = new ImageReplayerAdminSocketHook<I>(cct, m_name, this);
   }
 
-
-  on_start_local_image_open_start();
+  init_remote_journaler();
 }
 
-void ImageReplayer::on_start_local_image_open_start()
-{
-  dout(20) << "enter" << dendl;
-  if (m_local_image_ctx != nullptr) {
-    // already opened during bootstrap
-    on_start_wait_for_local_journal_ready_start();
-    return;
-  }
+template <typename I>
+void ImageReplayer<I>::init_remote_journaler() {
+  dout(20) << dendl;
 
-  // open and lock the local image
   Context *ctx = create_context_callback<
-    ImageReplayer, &ImageReplayer::on_start_local_image_open_finish>(this);
-  OpenLocalImageRequest<> *request = OpenLocalImageRequest<>::create(
-    m_local_ioctx, &m_local_image_ctx, "", m_local_image_id,
-    m_threads->work_queue, ctx);
-  request->send();
+    ImageReplayer, &ImageReplayer<I>::handle_init_remote_journaler>(this);
+  m_remote_journaler->init(ctx);
 }
 
-void ImageReplayer::on_start_local_image_open_finish(int r)
-{
+template <typename I>
+void ImageReplayer<I>::handle_init_remote_journaler(int r) {
   dout(20) << "r=" << r << dendl;
 
   if (r < 0) {
-    derr << "error opening local image " <<  m_local_image_id
-	 << ": " << cpp_strerror(r) << dendl;
+    derr << "failed to initialize remote journal: " << cpp_strerror(r) << dendl;
     on_start_fail_start(r);
     return;
-  }
-  if (on_start_interrupted()) {
+  } else if (on_start_interrupted()) {
     return;
   }
 
-  on_start_wait_for_local_journal_ready_start();
-}
-
-void ImageReplayer::on_start_wait_for_local_journal_ready_start()
-{
-  dout(20) << "enter" << dendl;
-
-  if (!m_asok_hook) {
-    CephContext *cct = static_cast<CephContext *>(m_local->cct());
-    std::string name = m_local_ioctx.get_pool_name() + "/" +
-      m_local_image_ctx->name;
-
-    m_asok_hook = new ImageReplayerAdminSocketHook(cct, name, this);
-  }
-
-  FunctionContext *ctx = new FunctionContext(
-    [this](int r) {
-      on_start_wait_for_local_journal_ready_finish(r);
-    });
-  m_local_image_ctx->journal->wait_for_journal_ready(ctx);
+  start_replay();
 }
 
-void ImageReplayer::on_start_wait_for_local_journal_ready_finish(int r)
-{
-  dout(20) << "r=" << r << dendl;
-
-  if (r < 0) {
-    derr << "error when waiting for local journal ready: " << cpp_strerror(r)
-	 << dendl;
-    on_start_fail_start(r);
-    return;
-  }
-  if (on_start_interrupted()) {
-    return;
-  }
+template <typename I>
+void ImageReplayer<I>::start_replay() {
+  dout(20) << dendl;
 
-  r = m_local_image_ctx->journal->start_external_replay(&m_local_replay);
+  int r = m_local_image_ctx->journal->start_external_replay(&m_local_replay);
   if (r < 0) {
     derr << "error starting external replay on local image "
 	 <<  m_local_image_id << ": " << cpp_strerror(r) << dendl;
@@ -486,8 +322,7 @@ void ImageReplayer::on_start_wait_for_local_journal_ready_finish(int r)
     return;
   }
 
-  m_replay_handler = new ReplayHandler(this);
-
+  m_replay_handler = new ReplayHandler<I>(this);
   m_remote_journaler->start_live_replay(m_replay_handler,
 					1 /* TODO: configurable */);
 
@@ -496,30 +331,27 @@ void ImageReplayer::on_start_wait_for_local_journal_ready_finish(int r)
   assert(r == 0);
 
   Context *on_finish(nullptr);
-
   {
     Mutex::Locker locker(m_lock);
-
-    if (m_state == STATE_STOPPING) {
+    if (m_stop_requested) {
       on_start_fail_start(-EINTR);
       return;
     }
 
     assert(m_state == STATE_STARTING);
     m_state = STATE_REPLAYING;
-
-    std::swap(m_on_finish, on_finish);
+    std::swap(m_on_start_finish, on_finish);
   }
 
   dout(20) << "start succeeded" << dendl;
-
-  if (on_finish) {
+  if (on_finish != nullptr) {
     dout(20) << "on finish complete, r=" << r << dendl;
     on_finish->complete(r);
   }
 }
 
-void ImageReplayer::on_start_fail_start(int r)
+template <typename I>
+void ImageReplayer<I>::on_start_fail_start(int r)
 {
   dout(20) << "r=" << r << dendl;
 
@@ -532,7 +364,8 @@ void ImageReplayer::on_start_fail_start(int r)
   m_threads->work_queue->queue(ctx, 0);
 }
 
-void ImageReplayer::on_start_fail_finish(int r)
+template <typename I>
+void ImageReplayer<I>::on_start_fail_finish(int r)
 {
   dout(20) << "r=" << r << dendl;
 
@@ -556,11 +389,7 @@ void ImageReplayer::on_start_fail_finish(int r)
   }
 
   if (m_local_image_ctx) {
-    bool owner;
-    if (librbd::is_exclusive_lock_owner(m_local_image_ctx, &owner) == 0 &&
-	owner) {
-      librbd::unlock(m_local_image_ctx, "");
-    }
+    // TODO: switch to async close via CloseImageRequest
     m_local_image_ctx->state->close();
     m_local_image_ctx = nullptr;
   }
@@ -568,74 +397,83 @@ void ImageReplayer::on_start_fail_finish(int r)
   m_local_ioctx.close();
   m_remote_ioctx.close();
 
-  Context *on_finish(nullptr);
+  delete m_asok_hook;
+  m_asok_hook = nullptr;
 
+  Context *on_start_finish(nullptr);
+  Context *on_stop_finish(nullptr);
   {
     Mutex::Locker locker(m_lock);
-    if (m_state == STATE_STOPPING) {
+    if (m_stop_requested) {
       assert(r == -EINTR);
       dout(20) << "start interrupted" << dendl;
       m_state = STATE_STOPPED;
+      m_stop_requested = false;
     } else {
       assert(m_state == STATE_STARTING);
       dout(20) << "start failed" << dendl;
       m_state = STATE_UNINITIALIZED;
     }
-    std::swap(m_on_finish, on_finish);
+    std::swap(m_on_start_finish, on_start_finish);
+    std::swap(m_on_stop_finish, on_stop_finish);
   }
 
-  if (on_finish) {
-    dout(20) << "on finish complete, r=" << r << dendl;
-    on_finish->complete(r);
+  if (on_start_finish != nullptr) {
+    dout(20) << "on start finish complete, r=" << r << dendl;
+    on_start_finish->complete(r);
+  }
+  if (on_stop_finish != nullptr) {
+    dout(20) << "on stop finish complete, r=" << r << dendl;
+    on_stop_finish->complete(0);
   }
 }
 
-bool ImageReplayer::on_start_interrupted()
+template <typename I>
+bool ImageReplayer<I>::on_start_interrupted()
 {
   Mutex::Locker locker(m_lock);
-
-  if (m_state == STATE_STARTING) {
+  assert(m_state == STATE_STARTING);
+  if (m_on_stop_finish == nullptr) {
     return false;
   }
 
-  assert(m_state == STATE_STOPPING);
-
   on_start_fail_start(-EINTR);
   return true;
 }
 
-void ImageReplayer::stop(Context *on_finish)
+template <typename I>
+void ImageReplayer<I>::stop(Context *on_finish)
 {
-  dout(20) << "on_finish=" << on_finish << ", m_on_finish=" << m_on_finish
-	   << dendl;
+  dout(20) << "on_finish=" << on_finish << dendl;
 
-  Mutex::Locker locker(m_lock);
-  assert(is_running_());
-
-  if (m_state == STATE_STARTING) {
-    dout(20) << "interrupting start" << dendl;
-
-    if (on_finish) {
-      Context *on_start_finish = m_on_finish;
-      FunctionContext *ctx = new FunctionContext(
-	[this, on_start_finish, on_finish](int r) {
-	  if (on_start_finish) {
-	    on_start_finish->complete(r);
-	  }
-	  on_finish->complete(0);
-	});
-
-      m_on_finish = ctx;
+  bool shut_down_replay = false;
+  {
+    Mutex::Locker locker(m_lock);
+    assert(is_running_());
+
+    if (!is_stopped_()) {
+      if (m_state == STATE_STARTING) {
+        dout(20) << "interrupting start" << dendl;
+      } else {
+        dout(20) << "interrupting replay" << dendl;
+        shut_down_replay = true;
+      }
+
+      assert(m_on_stop_finish == nullptr);
+      std::swap(m_on_stop_finish, on_finish);
+      m_stop_requested = true;
     }
-  } else {
-    assert(m_on_finish == nullptr);
-    m_on_finish = on_finish;
+  }
+
+  if (shut_down_replay) {
     on_stop_journal_replay_shut_down_start();
+  } else if (on_finish != nullptr) {
+    on_finish->complete(0);
   }
-  m_state = STATE_STOPPING;
 }
 
-void ImageReplayer::on_stop_journal_replay_shut_down_start()
+template <typename I>
+void ImageReplayer<I>::on_stop_journal_replay_shut_down_start()
 {
   dout(20) << "enter" << dendl;
 
@@ -644,36 +482,53 @@ void ImageReplayer::on_stop_journal_replay_shut_down_start()
       on_stop_journal_replay_shut_down_finish(r);
     });
 
-  m_local_replay->shut_down(false, ctx);
+  {
+    Mutex::Locker locker(m_lock);
+
+    // as we complete in-flight records, we might receive multiple stop requests
+    if (m_state != STATE_REPLAYING) {
+      return;
+    }
+    m_state = STATE_STOPPING;
+    m_local_replay->shut_down(false, ctx);
+  }
 }
 
-void ImageReplayer::on_stop_journal_replay_shut_down_finish(int r)
+template <typename I>
+void ImageReplayer<I>::on_stop_journal_replay_shut_down_finish(int r)
 {
   dout(20) << "r=" << r << dendl;
-
   if (r < 0) {
     derr << "error flushing journal replay: " << cpp_strerror(r) << dendl;
   }
 
-  m_local_image_ctx->journal->stop_external_replay();
-  m_local_replay = nullptr;
+  {
+    Mutex::Locker locker(m_lock);
+    assert(m_state == STATE_STOPPING);
+    m_local_image_ctx->journal->stop_external_replay();
+    m_local_replay = nullptr;
+    m_replay_entry = ReplayEntry();
+    m_replay_tag_valid = false;
+  }
 
   on_stop_local_image_close_start();
 }
 
-void ImageReplayer::on_stop_local_image_close_start()
+template <typename I>
+void ImageReplayer<I>::on_stop_local_image_close_start()
 {
   dout(20) << "enter" << dendl;
 
   // close and delete the image (from outside the image's thread context)
   Context *ctx = create_context_callback<
-    ImageReplayer, &ImageReplayer::on_stop_local_image_close_finish>(this);
-  CloseImageRequest<> *request = CloseImageRequest<>::create(
-    &m_local_image_ctx, m_threads->work_queue, ctx);
+    ImageReplayer, &ImageReplayer<I>::on_stop_local_image_close_finish>(this);
+  CloseImageRequest<I> *request = CloseImageRequest<I>::create(
+    &m_local_image_ctx, m_threads->work_queue, false, ctx);
   request->send();
 }
 
-void ImageReplayer::on_stop_local_image_close_finish(int r)
+template <typename I>
+void ImageReplayer<I>::on_stop_local_image_close_finish(int r)
 {
   dout(20) << "r=" << r << dendl;
 
@@ -693,6 +548,9 @@ void ImageReplayer::on_stop_local_image_close_finish(int r)
 
   m_remote_ioctx.close();
 
+  delete m_asok_hook;
+  m_asok_hook = nullptr;
+
   Context *on_finish(nullptr);
 
   {
@@ -700,134 +558,312 @@ void ImageReplayer::on_stop_local_image_close_finish(int r)
     assert(m_state == STATE_STOPPING);
 
     m_state = STATE_STOPPED;
-
-    std::swap(m_on_finish, on_finish);
+    m_stop_requested = false;
+    std::swap(m_on_stop_finish, on_finish);
   }
 
   dout(20) << "stop complete" << dendl;
 
-  if (on_finish) {
+  if (on_finish != nullptr) {
     dout(20) << "on finish complete, r=" << r << dendl;
     on_finish->complete(r);
   }
 }
 
-void ImageReplayer::close_local_image(Context *on_finish)
+template <typename I>
+void ImageReplayer<I>::close_local_image(Context *on_finish)
 {
   m_local_image_ctx->state->close(on_finish);
 }
 
-void ImageReplayer::handle_replay_ready()
+template <typename I>
+void ImageReplayer<I>::handle_replay_ready()  // pops and dispatches one entry
 {
   dout(20) << "enter" << dendl;
+  if (on_replay_interrupted()) {  // stop requested: shut-down already started
+    return;
+  }
 
-  ::journal::ReplayEntry replay_entry;
-  if (!m_remote_journaler->try_pop_front(&replay_entry)) {
+  if (!m_remote_journaler->try_pop_front(&m_replay_entry, &m_replay_tag_tid)) {
     return;
   }
 
-  dout(20) << "processing entry tid=" << replay_entry.get_commit_tid() << dendl;
+  if (m_replay_tag_valid && m_replay_tag.tid == m_replay_tag_tid) {
+    process_entry();  // entry belongs to the already-decoded tag
+    return;
+  }
 
-  bufferlist data = replay_entry.get_data();
-  bufferlist::iterator it = data.begin();
-  Context *on_ready = create_context_callback<
-    ImageReplayer, &ImageReplayer::handle_replay_process_ready>(this);
-  Context *on_commit = new C_ReplayCommitted(this, std::move(replay_entry));
-  m_local_replay->process(&it, on_ready, on_commit);
+  replay_flush();  // no valid/matching tag: flush first, then fetch the tag
 }
 
-int ImageReplayer::flush()
+template <typename I>
+void ImageReplayer<I>::flush(Context *on_finish)  // on_finish may be null
 {
-  // TODO: provide async method
-
   dout(20) << "enter" << dendl;
 
   {
     Mutex::Locker locker(m_lock);
-
-    if (m_state != STATE_REPLAYING) {
-      return 0;
+    if (m_state == STATE_REPLAYING) {  // the flush chain asserts this state
+      Context *ctx = new FunctionContext(  // wrapper tolerates null on_finish
+        [on_finish](int r) {
+          if (on_finish != nullptr) {
+            on_finish->complete(r);
+          }
+        });
+      on_flush_local_replay_flush_start(ctx);  // must run with m_lock held
+      return;
     }
+  }
 
-    m_state = STATE_FLUSHING_REPLAY;
+  if (on_finish != nullptr) {  // not replaying: nothing to flush, report 0
+    on_finish->complete(0);
   }
+}
 
-  C_SaferCond replay_flush_ctx;
-  m_local_replay->flush(&replay_flush_ctx);
-  int r = replay_flush_ctx.wait();
+template <typename I>
+void ImageReplayer<I>::on_flush_local_replay_flush_start(Context *on_flush)
+{
+  // First flush step: flush the local replay, then continue in the matching
+  // *_finish handler. The caller must hold m_lock while replaying.
+  dout(20) << "enter" << dendl;
+  auto *flush_ctx = new FunctionContext([this, on_flush](int r) {
+      on_flush_local_replay_flush_finish(on_flush, r);
+    });
+  assert(m_lock.is_locked());
+  assert(m_state == STATE_REPLAYING);
+  m_local_replay->flush(flush_ctx);
+}
+
+template <typename I>
+void ImageReplayer<I>::on_flush_local_replay_flush_finish(Context *on_flush,
+                                                          int r)
+{
+  dout(20) << "r=" << r << dendl;
   if (r < 0) {
     derr << "error flushing local replay: " << cpp_strerror(r) << dendl;
+    on_flush->complete(r);
+    return;
   }
 
-  C_SaferCond journaler_flush_ctx;
-  m_remote_journaler->flush_commit_position(&journaler_flush_ctx);
-  int r1 = journaler_flush_ctx.wait();
-  if (r1 < 0) {
+  on_flush_flush_commit_position_start(on_flush);
+}
+
+template <typename I>
+void ImageReplayer<I>::on_flush_flush_commit_position_start(Context *on_flush)
+{
+  // Second flush step: flush the remote journaler's commit position and
+  // pass the result on to the matching *_finish handler.
+  auto *commit_ctx = new FunctionContext([this, on_flush](int r) {
+      on_flush_flush_commit_position_finish(on_flush, r);
+    });
+  m_remote_journaler->flush_commit_position(commit_ctx);
+}
+
+template <typename I>
+void ImageReplayer<I>::on_flush_flush_commit_position_finish(Context *on_flush,
+                                                             int r)
+{
+  if (r < 0) {
     derr << "error flushing remote journal commit position: "
-	 << cpp_strerror(r1) << dendl;
+	 << cpp_strerror(r) << dendl;
   }
 
+  dout(20) << "flush complete, r=" << r << dendl;
+  on_flush->complete(r);
+}
+
+template <typename I>
+bool ImageReplayer<I>::on_replay_interrupted()
+{
+  bool shut_down;  // copy of m_stop_requested, read under m_lock
   {
     Mutex::Locker locker(m_lock);
-    assert(m_state == STATE_FLUSHING_REPLAY);
+    shut_down = m_stop_requested;
+  }
 
-    m_state = STATE_REPLAYING;
+  if (shut_down) {
+    on_stop_journal_replay_shut_down_start();  // called after m_lock is released
   }
+  return shut_down;  // true: a stop was requested and shut-down was started
+}
 
-  dout(20) << "done" << dendl;
+template <typename I>
+void ImageReplayer<I>::print_status(Formatter *f, stringstream *ss)
+{
+  dout(20) << "enter" << dendl;
+
+  Mutex::Locker l(m_lock);
 
-  return r < 0 ? r : r1;
+  if (f) {
+    f->open_object_section("image_replayer");
+    f->dump_string("name", m_name);
+    f->dump_string("state", to_string(m_state));
+    f->close_section();
+    f->flush(*ss);
+  } else {
+    *ss << m_name << ": state: " << to_string(m_state);
+  }
 }
 
-void ImageReplayer::handle_replay_process_ready(int r)
+template <typename I>
+void ImageReplayer<I>::handle_replay_complete(int r)
 {
-  // journal::Replay is ready for more events -- attempt to pop another
+  dout(20) << "r=" << r << dendl;
+  if (r < 0) {
+    derr << "replay encountered an error: " << cpp_strerror(r) << dendl;
+  }
 
-  dout(20) << "enter" << dendl;
+  {
+    Mutex::Locker locker(m_lock);
+    m_stop_requested = true;  // set for any r, error or not
+  }
+  on_replay_interrupted();  // observes the flag and starts the shut-down
+}
+
+template <typename I>
+void ImageReplayer<I>::replay_flush() {
+  dout(20) << dendl;
+
+  // flush() reports its result to handle_replay_flush()
+  flush(create_context_callback<
+    ImageReplayer<I>, &ImageReplayer<I>::handle_replay_flush>(this));
+}
+
+template <typename I>
+void ImageReplayer<I>::handle_replay_flush(int r) {
+  dout(20) << "r=" << r << dendl;
 
   if (r < 0) {
-    derr << "error replaying journal entry: " << cpp_strerror(r)
-	 << dendl;
-    // TODO: handle error
+    derr << "replay flush encountered an error: " << cpp_strerror(r) << dendl;
+    handle_replay_complete(r);
+    return;
   }
 
-  assert(r == 0);
-  handle_replay_ready();
+  get_remote_tag();
 }
 
-void ImageReplayer::handle_replay_complete(int r)
-{
-  dout(20) "r=" << r << dendl;
+template <typename I>
+void ImageReplayer<I>::get_remote_tag() {
+  dout(20) << "tag_tid: " << m_replay_tag_tid << dendl;
 
-  //m_remote_journaler->stop_replay();
+  Context *ctx = create_context_callback<
+    ImageReplayer, &ImageReplayer<I>::handle_get_remote_tag>(this);
+  m_remote_journaler->get_tag(m_replay_tag_tid, &m_replay_tag, ctx);
 }
 
-void ImageReplayer::handle_replay_committed(
-  ::journal::ReplayEntry *replay_entry, int r)
-{
-  dout(20) << "commit_tid=" << replay_entry->get_commit_tid() << ", r=" << r
-	   << dendl;
+template <typename I>
+void ImageReplayer<I>::handle_get_remote_tag(int r) {
+  dout(20) << "r=" << r << dendl;
 
-  m_remote_journaler->committed(*replay_entry);
+  if (r == 0) {
+    try {
+      bufferlist::iterator it = m_replay_tag.data.begin();
+      ::decode(m_replay_tag_data, it);
+    } catch (const buffer::error &err) {
+      r = -EBADMSG;  // decode failure takes the same error path as a failed fetch
+    }
+  }
+
+  if (r < 0) {
+    derr << "failed to retrieve remote tag " << m_replay_tag_tid << ": "
+         << cpp_strerror(r) << dendl;
+    handle_replay_complete(r);
+    return;
+  }
+
+  m_replay_tag_valid = true;  // checked by handle_replay_ready() for later entries
+  dout(20) << "decoded remote tag " << m_replay_tag_tid << ": "
+           << m_replay_tag_data << dendl;
+
+  allocate_local_tag();
 }
 
-int ImageReplayer::get_bootstrap_params(BootstrapParams *params)
-{
-  int r = librbd::cls_client::dir_get_name(&m_remote_ioctx, RBD_DIRECTORY,
-					   m_remote_image_id,
-					   &params->local_image_name);
+template <typename I>
+void ImageReplayer<I>::allocate_local_tag() {
+  dout(20) << dendl;
+
+  // rewrite the decoded tag's mirror uuid before allocating the local tag
+  std::string owner_uuid = m_replay_tag_data.mirror_uuid;
+  const bool remap_to_remote =
+    (owner_uuid == librbd::Journal<>::LOCAL_MIRROR_UUID ||
+     owner_uuid == m_local_mirror_uuid);
+  if (remap_to_remote) {
+    owner_uuid = m_remote_mirror_uuid;
+  } else if (owner_uuid == librbd::Journal<>::ORPHAN_MIRROR_UUID) {
+    dout(5) << "encountered image demotion: stopping" << dendl;
+    Mutex::Locker locker(m_lock);
+    m_stop_requested = true;  // honored by the next on_replay_interrupted()
+  }
+
+  std::string prev_uuid = m_replay_tag_data.predecessor_mirror_uuid;
+  if (prev_uuid == librbd::Journal<>::LOCAL_MIRROR_UUID) {
+    prev_uuid = m_remote_mirror_uuid;
+  } else if (prev_uuid == m_local_mirror_uuid) {
+    prev_uuid = librbd::Journal<>::LOCAL_MIRROR_UUID;
+  }
+
+  m_local_image_ctx->journal->allocate_tag(
+    owner_uuid, prev_uuid, m_replay_tag_data.predecessor_commit_valid,
+    m_replay_tag_data.predecessor_tag_tid,
+    m_replay_tag_data.predecessor_entry_tid,
+    create_context_callback<
+      ImageReplayer, &ImageReplayer<I>::handle_allocate_local_tag>(this));
+}
+
+template <typename I>
+void ImageReplayer<I>::handle_allocate_local_tag(int r) {
+  dout(20) << "r=" << r << dendl;
+
   if (r < 0) {
-    derr << "error looking up name for remote image id " << m_remote_image_id
-	 << ": " << cpp_strerror(r) << dendl;
-    return r;
+    derr << "failed to allocate journal tag: " << cpp_strerror(r) << dendl;
+    handle_replay_complete(r);
+    return;
   }
 
-  params->local_pool_name = m_remote_ioctx.get_pool_name();
+  process_entry();
+}
+
+template <typename I>
+void ImageReplayer<I>::process_entry() {
+  dout(20) << "processing entry tid=" << m_replay_entry.get_commit_tid()
+           << dendl;
+
+  bufferlist data = m_replay_entry.get_data();
+  bufferlist::iterator it = data.begin();
 
-  return 0;
+  Context *on_ready = create_context_callback<
+    ImageReplayer, &ImageReplayer<I>::handle_process_entry_ready>(this);
+  Context *on_commit = new C_ReplayCommitted(this, std::move(m_replay_entry));
+  m_local_replay->process(&it, on_ready, on_commit);
 }
 
-void ImageReplayer::shut_down_journal_replay(bool cancel_ops)
+template <typename I>
+void ImageReplayer<I>::handle_process_entry_ready(int r) {
+  dout(20) << dendl;
+  assert(r == 0);  // NOTE(review): presumably "ready" never carries an error -- confirm
+
+  // on_ready callback of m_local_replay->process(): try the next entry
+  handle_replay_ready();
+}
+
+template <typename I>
+void ImageReplayer<I>::handle_process_entry_safe(const ReplayEntry& replay_entry,
+                                                 int r) {
+  dout(20) << "commit_tid=" << replay_entry.get_commit_tid() << ", r=" << r
+           << dendl;
+
+  if (r >= 0) {
+    // commit succeeded: acknowledge the entry to the remote journaler
+    m_remote_journaler->committed(replay_entry);
+    return;
+  }
+
+  derr << "failed to commit journal event: " << cpp_strerror(r) << dendl;
+  handle_replay_complete(r);  // requests a stop of the replay
+}
+
+template <typename I>
+void ImageReplayer<I>::shut_down_journal_replay(bool cancel_ops)
 {
   C_SaferCond cond;
   m_local_replay->shut_down(cancel_ops, &cond);
@@ -837,40 +873,34 @@ void ImageReplayer::shut_down_journal_replay(bool cancel_ops)
   }
 }
 
-std::ostream &operator<<(std::ostream &os, const ImageReplayer::State &state)
-{
+template <typename I>
+std::string ImageReplayer<I>::to_string(const State state) {
   switch (state) {
-  case ImageReplayer::STATE_UNINITIALIZED:
-    os << "Uninitialized";
-    break;
-  case ImageReplayer::STATE_STARTING:
-    os << "Starting";
-    break;
-  case ImageReplayer::STATE_REPLAYING:
-    os << "Replaying";
-    break;
-  case ImageReplayer::STATE_FLUSHING_REPLAY:
-    os << "FlushingReplay";
-    break;
-  case ImageReplayer::STATE_STOPPING:
-    os << "Stopping";
-    break;
-  case ImageReplayer::STATE_STOPPED:
-    os << "Stopped";
-    break;
+  case ImageReplayer<I>::STATE_UNINITIALIZED:
+    return "Uninitialized";
+  case ImageReplayer<I>::STATE_STARTING:
+    return "Starting";
+  case ImageReplayer<I>::STATE_REPLAYING:
+    return "Replaying";
+  case ImageReplayer<I>::STATE_STOPPING:
+    return "Stopping";
+  case ImageReplayer<I>::STATE_STOPPED:
+    return "Stopped";
   default:
-    os << "Unknown(" << state << ")";
     break;
   }
-  return os;
+  return "Unknown(" + stringify(state) + ")";
 }
 
-std::ostream &operator<<(std::ostream &os, const ImageReplayer &replayer)
+template <typename I>
+std::ostream &operator<<(std::ostream &os, const ImageReplayer<I> &replayer)
 {
-  os << "ImageReplayer[" << replayer.m_remote_pool_id << "/"
-     << replayer.m_remote_image_id << "]";
+  os << "ImageReplayer[" << replayer.get_remote_pool_id() << "/"
+     << replayer.get_remote_image_id() << "]";
   return os;
 }
 
 } // namespace mirror
 } // namespace rbd
+
+template class rbd::mirror::ImageReplayer<librbd::ImageCtx>;
diff --git a/src/tools/rbd_mirror/ImageReplayer.h b/src/tools/rbd_mirror/ImageReplayer.h
index 9dc26e1..17315ad 100644
--- a/src/tools/rbd_mirror/ImageReplayer.h
+++ b/src/tools/rbd_mirror/ImageReplayer.h
@@ -12,67 +12,65 @@
 #include "common/WorkQueue.h"
 #include "include/rados/librados.hpp"
 #include "cls/journal/cls_journal_types.h"
+#include "journal/ReplayEntry.h"
 #include "librbd/journal/Types.h"
+#include "librbd/journal/TypeTraits.h"
 #include "types.h"
 
+class AdminSocketHook;
+
 namespace journal {
 
 class Journaler;
 class ReplayHandler;
-class ReplayEntry;
 
 }
 
 namespace librbd {
 
 class ImageCtx;
-
-namespace journal {
-
-template <typename> class Replay;
-
-}
+namespace journal { template <typename> class Replay; }
 
 }
 
 namespace rbd {
 namespace mirror {
 
-class ImageReplayerAdminSocketHook;
 struct Threads;
 
 /**
  * Replays changes from a remote cluster for a single image.
  */
+template <typename ImageCtxT = librbd::ImageCtx>
 class ImageReplayer {
 public:
+  typedef typename librbd::journal::TypeTraits<ImageCtxT>::ReplayEntry ReplayEntry;
+
   enum State {
     STATE_UNINITIALIZED,
     STATE_STARTING,
     STATE_REPLAYING,
-    STATE_FLUSHING_REPLAY,
     STATE_STOPPING,
     STATE_STOPPED,
   };
 
   struct BootstrapParams {
-    std::string local_pool_name;
     std::string local_image_name;
 
     BootstrapParams() {}
-    BootstrapParams(const std::string &local_pool_name,
-		    const std::string local_image_name) :
-      local_pool_name(local_pool_name),
+    BootstrapParams(const std::string local_image_name) :
       local_image_name(local_image_name) {}
 
     bool empty() const {
-      return local_pool_name.empty() && local_image_name.empty();
+      return local_image_name.empty();
     }
   };
 
   ImageReplayer(Threads *threads, RadosRef local, RadosRef remote,
-		const std::string &client_id, int64_t local_pool_id,
-		int64_t remote_pool_id, const std::string &remote_image_id);
+		const std::string &local_mirror_uuid,
+                const std::string &remote_mirror_uuid, int64_t local_pool_id,
+		int64_t remote_pool_id, const std::string &remote_image_id,
+                const std::string &global_image_id);
   virtual ~ImageReplayer();
   ImageReplayer(const ImageReplayer&) = delete;
   ImageReplayer& operator=(const ImageReplayer&) = delete;
@@ -81,52 +79,73 @@ public:
   bool is_stopped() { Mutex::Locker l(m_lock); return is_stopped_(); }
   bool is_running() { Mutex::Locker l(m_lock); return is_running_(); }
 
+  std::string get_name() { Mutex::Locker l(m_lock); return m_name; };
+
   void start(Context *on_finish = nullptr,
 	     const BootstrapParams *bootstrap_params = nullptr);
   void stop(Context *on_finish = nullptr);
-  int flush();
+  void flush(Context *on_finish = nullptr);
+
+  void print_status(Formatter *f, stringstream *ss);
 
   virtual void handle_replay_ready();
-  virtual void handle_replay_process_ready(int r);
   virtual void handle_replay_complete(int r);
 
-  virtual void handle_replay_committed(::journal::ReplayEntry* replay_entry, int r);
-
+  inline int64_t get_remote_pool_id() const {
+    return m_remote_pool_id;
+  }
+  inline const std::string get_remote_image_id() const {
+    return m_remote_image_id;
+  }
 protected:
   /**
    * @verbatim
    *                   (error)
-   * <uninitialized> <------------------------ FAIL
-   *    |                                       ^
-   *    v                                       *
-   * <starting>                                 *
-   *    |                                       *
-   *    v                               (error) *
-   * GET_REGISTERED_CLIENT_STATUS * * * * * * * *
-   *    |                                       *
-   *    | (sync required)                       *
-   *    |\-----\                                *
-   *    |      |                                *
-   *    |      v                                *
-   *    |   BOOTSTRAP_IMAGE * * * * * * * * * * *
-   *    |      |                                *
-   *    |      v                                *
-   *    |/-----/                                *
-   *    |                                       *
-   *    v (no sync required)            (error) *
-   * REMOTE_JOURNALER_INIT  * * * * * * * * * * *
-   *    |                                       *
-   *    v                               (error) *
-   * LOCAL_IMAGE_OPEN (skip if not              *
-   *    |              needed                   *
-   *    v                               (error) *
-   * WAIT_FOR_LOCAL_JOURNAL_READY * * * * * * * *
+   * <uninitialized> <------------------------------------ FAIL
+   *    |                                                   ^
+   *    v                                                   *
+   * <starting>                                             *
+   *    |                                                   *
+   *    v                                           (error) *
+   * BOOTSTRAP_IMAGE  * * * * * * * * * * * * * * * * * * * *
+   *    |                                                   *
+   *    v                                           (error) *
+   * INIT_REMOTE_JOURNALER  * * * * * * * * * * * * * * * * *
+   *    |                                                   *
+   *    v                                           (error) *
+   * START_REPLAY * * * * * * * * * * * * * * * * * * * * * *
    *    |
-   *    v
-   * <replaying>
-   *    |
-   *    v
-   * <stopping>
+   *    |  /--------------------------------------------\
+   *    |  |                                            |
+   *    v  v   (asok flush)                             |
+   * REPLAYING -------------> LOCAL_REPLAY_FLUSH        |
+   *    |       \                 |                     |
+   *    |       |                 v                     |
+   *    |       |             FLUSH_COMMIT_POSITION     |
+   *    |       |                 |                     |
+   *    |       |                 \--------------------/|
+   *    |       |                                       |
+   *    |       | (entries available)                   |
+   *    |       \-----------> REPLAY_READY              |
+   *    |                         |                     |
+   *    |                         | (skip if not        |
+   *    |                         v  needed)        (error)
+   *    |                     REPLAY_FLUSH  * * * * * * * * *
+   *    |                         |                     |   *
+   *    |                         | (skip if not        |   *
+   *    |                         v  needed)        (error) *
+   *    |                     GET_REMOTE_TAG  * * * * * * * *
+   *    |                         |                     |   *
+   *    |                         | (skip if not        |   *
+   *    |                         v  needed)        (error) *
+   *    |                     ALLOCATE_LOCAL_TAG  * * * * * *
+   *    |                         |                     |   *
+   *    |                         v                 (error) *
+   *    |                     PROCESS_ENTRY * * * * * * * * *
+   *    |                         |                     |   *
+   *    |                         \---------------------/   *
+   *    v                                                   *
+   * REPLAY_COMPLETE  < * * * * * * * * * * * * * * * * * * *
    *    |
    *    v
    * JOURNAL_REPLAY_SHUT_DOWN
@@ -140,21 +159,6 @@ protected:
    * @endverbatim
    */
 
-  virtual void on_start_get_registered_client_status_start(
-    const BootstrapParams *bootstrap_params);
-  virtual void on_start_get_registered_client_status_finish(int r,
-    const std::set<cls::journal::Client> &registered_clients,
-    const BootstrapParams &bootstrap_params);
-
-  void bootstrap(const BootstrapParams &params);
-  void handle_bootstrap(int r);
-
-  virtual void on_start_remote_journaler_init_start();
-  virtual void on_start_remote_journaler_init_finish(int r);
-  virtual void on_start_local_image_open_start();
-  virtual void on_start_local_image_open_finish(int r);
-  virtual void on_start_wait_for_local_journal_ready_start();
-  virtual void on_start_wait_for_local_journal_ready_finish(int r);
   virtual void on_start_fail_start(int r);
   virtual void on_start_fail_finish(int r);
   virtual bool on_start_interrupted();
@@ -164,41 +168,96 @@ protected:
   virtual void on_stop_local_image_close_start();
   virtual void on_stop_local_image_close_finish(int r);
 
-  void close_local_image(Context *on_finish); // for tests
+  virtual void on_flush_local_replay_flush_start(Context *on_flush);
+  virtual void on_flush_local_replay_flush_finish(Context *on_flush, int r);
+  virtual void on_flush_flush_commit_position_start(Context *on_flush);
+  virtual void on_flush_flush_commit_position_finish(Context *on_flush, int r);
 
-private:
-  State get_state_() const { return m_state; }
-  bool is_stopped_() const { return m_state == STATE_UNINITIALIZED ||
-                                    m_state == STATE_STOPPED; }
-  bool is_running_() const { return !is_stopped_() && m_state != STATE_STOPPING; }
-
-  int get_bootstrap_params(BootstrapParams *params);
+  bool on_replay_interrupted();
 
-  void shut_down_journal_replay(bool cancel_ops);
+  void close_local_image(Context *on_finish); // for tests
 
-  friend std::ostream &operator<<(std::ostream &os,
-				  const ImageReplayer &replayer);
+private:
+  typedef typename librbd::journal::TypeTraits<ImageCtxT>::Journaler Journaler;
 
   Threads *m_threads;
   RadosRef m_local, m_remote;
-  std::string m_client_id;
+  std::string m_local_mirror_uuid;
+  std::string m_remote_mirror_uuid;
   int64_t m_remote_pool_id, m_local_pool_id;
-  std::string m_remote_image_id, m_local_image_id;
+  std::string m_remote_image_id, m_local_image_id, m_global_image_id;
+  std::string m_local_image_name;
+  std::string m_name;
   Mutex m_lock;
   State m_state;
-  std::string m_local_pool_name, m_remote_pool_name;
   librados::IoCtx m_local_ioctx, m_remote_ioctx;
-  librbd::ImageCtx *m_local_image_ctx;
-  librbd::journal::Replay<librbd::ImageCtx> *m_local_replay;
-  ::journal::Journaler *m_remote_journaler;
+  ImageCtxT *m_local_image_ctx;
+  librbd::journal::Replay<ImageCtxT> *m_local_replay;
+  Journaler* m_remote_journaler;
   ::journal::ReplayHandler *m_replay_handler;
-  Context *m_on_finish;
-  ImageReplayerAdminSocketHook *m_asok_hook;
+
+  Context *m_on_start_finish = nullptr;
+  Context *m_on_stop_finish = nullptr;
+  bool m_stop_requested = false;
+
+  AdminSocketHook *m_asok_hook;
 
   librbd::journal::MirrorPeerClientMeta m_client_meta;
+
+  ReplayEntry m_replay_entry;
+  bool m_replay_tag_valid = false;
+  uint64_t m_replay_tag_tid = 0;
+  cls::journal::Tag m_replay_tag;
+  librbd::journal::TagData m_replay_tag_data;
+
+  struct C_ReplayCommitted : public Context {  // commit callback for one entry
+    ImageReplayer *replayer;   // receives handle_process_entry_safe()
+    ReplayEntry replay_entry;  // owned copy, moved in by the constructor
+
+    C_ReplayCommitted(ImageReplayer *replayer,
+                      ReplayEntry &&replay_entry)
+      : replayer(replayer), replay_entry(std::move(replay_entry)) {
+    }
+    virtual void finish(int r) {  // forwards the stored entry together with r
+      replayer->handle_process_entry_safe(replay_entry, r);
+    }
+  };
+
+  static std::string to_string(const State state);
+
+  State get_state_() const { return m_state; }
+  bool is_stopped_() const { return m_state == STATE_UNINITIALIZED ||
+                                    m_state == STATE_STOPPED; }
+  bool is_running_() const { return !is_stopped_() && m_state != STATE_STOPPING; }
+
+  void shut_down_journal_replay(bool cancel_ops);
+
+  void bootstrap();
+  void handle_bootstrap(int r);
+
+  void init_remote_journaler();
+  void handle_init_remote_journaler(int r);
+
+  void start_replay();
+
+  void replay_flush();
+  void handle_replay_flush(int r);
+
+  void get_remote_tag();
+  void handle_get_remote_tag(int r);
+
+  void allocate_local_tag();
+  void handle_allocate_local_tag(int r);
+
+  void process_entry();
+  void handle_process_entry_ready(int r);
+  void handle_process_entry_safe(const ReplayEntry& replay_entry, int r);
+
 };
 
 } // namespace mirror
 } // namespace rbd
 
+extern template class rbd::mirror::ImageReplayer<librbd::ImageCtx>;
+
 #endif // CEPH_RBD_MIRROR_IMAGE_REPLAYER_H
diff --git a/src/tools/rbd_mirror/ImageSync.h b/src/tools/rbd_mirror/ImageSync.h
index 175cb78..1ed2256 100644
--- a/src/tools/rbd_mirror/ImageSync.h
+++ b/src/tools/rbd_mirror/ImageSync.h
@@ -6,7 +6,7 @@
 
 #include "include/int_types.h"
 #include "librbd/ImageCtx.h"
-#include "librbd/Journal.h"
+#include "librbd/journal/TypeTraits.h"
 #include "common/Mutex.h"
 #include <map>
 #include <vector>
@@ -29,6 +29,16 @@ public:
   typedef typename TypeTraits::Journaler Journaler;
   typedef librbd::journal::MirrorPeerClientMeta MirrorPeerClientMeta;
 
+  static ImageSync* create(ImageCtxT *local_image_ctx,
+                           ImageCtxT *remote_image_ctx, SafeTimer *timer,
+                           Mutex *timer_lock, const std::string &mirror_uuid,
+                           Journaler *journaler,
+                           MirrorPeerClientMeta *client_meta,
+                           Context *on_finish) {
+    return new ImageSync(local_image_ctx, remote_image_ctx, timer, timer_lock,
+                         mirror_uuid, journaler, client_meta, on_finish);
+  }  // factory: heap-allocates an ImageSync, forwarding every argument as-is
+
   ImageSync(ImageCtxT *local_image_ctx, ImageCtxT *remote_image_ctx,
             SafeTimer *timer, Mutex *timer_lock, const std::string &mirror_uuid,
             Journaler *journaler, MirrorPeerClientMeta *client_meta,
diff --git a/src/tools/rbd_mirror/Mirror.cc b/src/tools/rbd_mirror/Mirror.cc
index 2ce0177..5dd59be 100644
--- a/src/tools/rbd_mirror/Mirror.cc
+++ b/src/tools/rbd_mirror/Mirror.cc
@@ -3,6 +3,8 @@
 
 #include <boost/range/adaptor/map.hpp>
 
+#include "common/Formatter.h"
+#include "common/admin_socket.h"
 #include "common/debug.h"
 #include "common/errno.h"
 #include "Mirror.h"
@@ -27,16 +29,107 @@ using librbd::mirror_peer_t;
 namespace rbd {
 namespace mirror {
 
+namespace {
+
+class MirrorAdminSocketCommand {  // interface implemented by each asok command
+public:
+  virtual ~MirrorAdminSocketCommand() {}  // deleted through base pointers
+  virtual bool call(Formatter *f, stringstream *ss) = 0;  // run the command
+};
+
+// Admin socket command that delegates to Mirror::print_status().
+class StatusCommand : public MirrorAdminSocketCommand {
+public:
+  explicit StatusCommand(Mirror *mirror) : m_mirror(mirror) {}
+  bool call(Formatter *f, stringstream *ss) {
+    m_mirror->print_status(f, ss);
+    return true;  // unconditionally reports success
+  }
+
+private:
+  Mirror *m_mirror;
+};
+
+// Admin socket command that delegates to Mirror::flush().
+class FlushCommand : public MirrorAdminSocketCommand {
+public:
+  explicit FlushCommand(Mirror *mirror) : m_mirror(mirror) {}
+  bool call(Formatter *f, stringstream *ss) {
+    m_mirror->flush();  // f and ss are not used by this command
+    return true;
+  }
+
+private:
+  Mirror *m_mirror;
+};
+
+} // anonymous namespace
+
+class MirrorAdminSocketHook : public AdminSocketHook {
+public:
+  // Registers the "rbd mirror ..." commands on cct's admin socket; only
+  // commands whose registration returned 0 are tracked in `commands`.
+  MirrorAdminSocketHook(CephContext *cct, Mirror *mirror) :
+    admin_socket(cct->get_admin_socket()) {
+    const std::string status_cmd("rbd mirror status");
+    if (admin_socket->register_command(status_cmd, status_cmd, this,
+                                       "get status for rbd mirror") == 0) {
+      commands[status_cmd] = new StatusCommand(mirror);
+    }
+
+    const std::string flush_cmd("rbd mirror flush");
+    if (admin_socket->register_command(flush_cmd, flush_cmd, this,
+                                       "flush rbd mirror") == 0) {
+      commands[flush_cmd] = new FlushCommand(mirror);
+    }
+  }
+
+  // Unregisters every tracked command and frees its handler object.
+  ~MirrorAdminSocketHook() {
+    for (const auto &entry : commands) {
+      (void)admin_socket->unregister_command(entry.first);
+      delete entry.second;
+    }
+  }
+
+  // Runs the handler tracked for `command` (asserts that one exists) and
+  // appends whatever the handler wrote to its stringstream to `out`.
+  bool call(std::string command, cmdmap_t& cmdmap, std::string format,
+            bufferlist& out) {
+    Commands::const_iterator found = commands.find(command);
+    assert(found != commands.end());
+
+    Formatter *formatter = Formatter::create(format);
+    stringstream ss;
+    const bool handled = found->second->call(formatter, &ss);
+    delete formatter;
+    out.append(ss);
+    return handled;
+  }
+
+private:
+  typedef std::map<std::string, MirrorAdminSocketCommand*> Commands;
+
+  AdminSocket *admin_socket;
+  Commands commands;
+};
+
 Mirror::Mirror(CephContext *cct, const std::vector<const char*> &args) :
   m_cct(cct),
   m_args(args),
   m_lock("rbd::mirror::Mirror"),
-  m_local(new librados::Rados())
+  m_local(new librados::Rados()),
+  m_asok_hook(new MirrorAdminSocketHook(cct, this))
 {
   cct->lookup_or_create_singleton_object<Threads>(m_threads,
                                                   "rbd_mirror::threads");
 }
 
+Mirror::~Mirror()
+{
+  delete m_asok_hook;  // allocated in the constructor's initializer list
+}
+
 void Mirror::handle_signal(int signum)
 {
   m_stopping.set(1);
@@ -79,6 +172,48 @@ void Mirror::run()
   dout(20) << "return" << dendl;
 }
 
+void Mirror::print_status(Formatter *f, stringstream *ss)
+{
+  dout(20) << "enter" << dendl;
+
+  Mutex::Locker l(m_lock);
+  // report nothing once m_stopping has been set
+  if (m_stopping.read()) {
+    return;
+  }
+
+  const bool structured = (f != nullptr);
+  if (structured) {
+    f->open_object_section("mirror_status");
+    f->open_array_section("replayers");
+  }
+
+  for (auto &peer_replayer : m_replayers) {
+    peer_replayer.second->print_status(f, ss);
+  }
+
+  if (structured) {
+    f->close_section();  // replayers
+    f->close_section();  // mirror_status
+    f->flush(*ss);
+  }
+}
+
+void Mirror::flush()
+{
+  dout(20) << "enter" << dendl;
+  Mutex::Locker l(m_lock);
+
+  if (m_stopping.read()) {
+    return;
+  }
+
+  // ask every peer replayer to flush, in map order
+  for (auto &peer_replayer : m_replayers) {
+    peer_replayer.second->flush();
+  }
+}
+
 void Mirror::update_replayers(const map<peer_t, set<int64_t> > &peer_configs)
 {
   dout(20) << "enter" << dendl;
diff --git a/src/tools/rbd_mirror/Mirror.h b/src/tools/rbd_mirror/Mirror.h
index 6b6cc97..a23c448 100644
--- a/src/tools/rbd_mirror/Mirror.h
+++ b/src/tools/rbd_mirror/Mirror.h
@@ -20,6 +20,7 @@ namespace rbd {
 namespace mirror {
 
 struct Threads;
+class MirrorAdminSocketHook;
 
 /**
  * Contains the main loop and overall state for rbd-mirror.
@@ -32,11 +33,15 @@ public:
   Mirror(CephContext *cct, const std::vector<const char*> &args);
   Mirror(const Mirror&) = delete;
   Mirror& operator=(const Mirror&) = delete;
+  ~Mirror();
 
   int init();
   void run();
   void handle_signal(int signum);
 
+  void print_status(Formatter *f, stringstream *ss);
+  void flush();
+
 private:
   void refresh_peers(const set<peer_t> &peers);
   void update_replayers(const map<peer_t, set<int64_t> > &peer_configs);
@@ -52,6 +57,7 @@ private:
   std::unique_ptr<ClusterWatcher> m_local_cluster_watcher;
   std::map<peer_t, std::unique_ptr<Replayer> > m_replayers;
   atomic_t m_stopping;
+  MirrorAdminSocketHook *m_asok_hook;
 };
 
 } // namespace mirror
diff --git a/src/tools/rbd_mirror/PoolWatcher.cc b/src/tools/rbd_mirror/PoolWatcher.cc
index 74b54cd..0117cab 100644
--- a/src/tools/rbd_mirror/PoolWatcher.cc
+++ b/src/tools/rbd_mirror/PoolWatcher.cc
@@ -17,8 +17,6 @@
 #define dout_prefix *_dout << "rbd-mirror: PoolWatcher::" << __func__ << ": "
 
 using std::list;
-using std::map;
-using std::set;
 using std::string;
 using std::unique_ptr;
 using std::vector;
@@ -49,7 +47,7 @@ PoolWatcher::~PoolWatcher()
   m_timer.shutdown();
 }
 
-const map<int64_t, set<string> >& PoolWatcher::get_images() const
+const PoolWatcher::PoolImageIds& PoolWatcher::get_images() const
 {
   assert(m_lock.is_locked());
   return m_images;
@@ -58,7 +56,7 @@ const map<int64_t, set<string> >& PoolWatcher::get_images() const
 void PoolWatcher::refresh_images(bool reschedule)
 {
   dout(20) << "enter" << dendl;
-  map<int64_t, set<string> > images;
+  PoolImageIds images;
   list<pair<int64_t, string> > pools;
   int r = m_cluster->pool_list2(pools);
   if (r < 0) {
@@ -105,19 +103,28 @@ void PoolWatcher::refresh_images(bool reschedule)
       continue;
     }
 
-    // only format 2 images can be mirrored, so only check the format
-    // 2 rbd_directory structure
-    std::vector<std::string> image_ids;
-    r = mirror_image_list(&ioctx, &image_ids);
-    if (r < 0) {
-      derr << "error listing mirrored images in pool " << pool_name << ": "
-           << cpp_strerror(r) << dendl;
-      continue;
-    }
+    std::set<ImageIds> image_ids;
+    std::string last_read = "";
+    int max_read = 1024;  // page size requested from each listing call
+    do {
+      std::map<std::string, std::string> mirror_images;
+      r = mirror_image_list(&ioctx, last_read, max_read, &mirror_images);
+      if (r < 0) {
+        derr << "error listing mirrored image directory: "
+             << cpp_strerror(r) << dendl;
+        // skip pool: 'continue' here would fall through with a partial list
+        image_ids.clear();
+        break;
+      }
+      for (const auto &mirror_image : mirror_images) {
+        image_ids.insert(ImageIds(mirror_image.first, mirror_image.second));
+        last_read = mirror_image.first;  // ordered map: ends on the last key
+      }
+      r = mirror_images.size();
+    } while (r == max_read);
 
     if (!image_ids.empty()) {
-      std::set<std::string> image_set(image_ids.begin(), image_ids.end());
-      images[pool_id] = std::move(image_set);
+      images[pool_id] = std::move(image_ids);
     }
   }
 
diff --git a/src/tools/rbd_mirror/PoolWatcher.h b/src/tools/rbd_mirror/PoolWatcher.h
index 1358539..0ab45b4 100644
--- a/src/tools/rbd_mirror/PoolWatcher.h
+++ b/src/tools/rbd_mirror/PoolWatcher.h
@@ -24,12 +24,30 @@ namespace mirror {
  */
 class PoolWatcher {
 public:
+  struct ImageIds {
+    std::string id;         // image id; the only field operator< compares
+    std::string global_id;  // defaults to "" when constructed from id alone
+
+    ImageIds(const std::string &id, const std::string &global_id = "")
+      : id(id), global_id(global_id) {
+    }
+
+    inline bool operator==(const ImageIds &rhs) const {
+      return (id == rhs.id && global_id == rhs.global_id);
+    }
+    inline bool operator<(const ImageIds &rhs) const {
+      return id < rhs.id;  // ordering ignores global_id, unlike operator==
+    }
+  };
+  typedef std::map<int64_t, std::set<ImageIds> > PoolImageIds;
+
   PoolWatcher(RadosRef cluster, double interval_seconds,
 	      Mutex &lock, Cond &cond);
   ~PoolWatcher();
   PoolWatcher(const PoolWatcher&) = delete;
   PoolWatcher& operator=(const PoolWatcher&) = delete;
-  const std::map<int64_t, std::set<std::string> >& get_images() const;
+
+  const PoolImageIds& get_images() const;
   void refresh_images(bool reschedule=true);
 
 private:
@@ -40,8 +58,8 @@ private:
   RadosRef m_cluster;
   SafeTimer m_timer;
   double m_interval;
-  // pool id -> image id
-  std::map<int64_t, std::set<std::string> > m_images;
+
+  PoolImageIds m_images;
 };
 
 } // namespace mirror
diff --git a/src/tools/rbd_mirror/Replayer.cc b/src/tools/rbd_mirror/Replayer.cc
index 620a6a8..ce76f99 100644
--- a/src/tools/rbd_mirror/Replayer.cc
+++ b/src/tools/rbd_mirror/Replayer.cc
@@ -3,9 +3,12 @@
 
 #include <boost/bind.hpp>
 
+#include "common/Formatter.h"
+#include "common/admin_socket.h"
 #include "common/debug.h"
 #include "common/errno.h"
 #include "include/stringify.h"
+#include "cls/rbd/cls_rbd_client.h"
 #include "Replayer.h"
 
 #define dout_subsys ceph_subsys_rbd_mirror
@@ -21,6 +24,92 @@ using std::vector;
 namespace rbd {
 namespace mirror {
 
+namespace {
+// Handler interface for a single admin socket command of a Replayer.
+class ReplayerAdminSocketCommand {
+public:
+  virtual ~ReplayerAdminSocketCommand() {}
+  virtual bool call(Formatter *f, stringstream *ss) = 0;
+};
+// Forwards the call to Replayer::print_status().
+class StatusCommand : public ReplayerAdminSocketCommand {
+public:
+  explicit StatusCommand(Replayer *target) : m_replayer(target) {}
+
+  bool call(Formatter *f, stringstream *ss) {
+    m_replayer->print_status(f, ss);
+    return true;
+  }
+
+private:
+  Replayer *m_replayer;
+};
+// Forwards the call to Replayer::flush(); f and ss are not used.
+class FlushCommand : public ReplayerAdminSocketCommand {
+public:
+  explicit FlushCommand(Replayer *target) : m_replayer(target) {}
+
+  bool call(Formatter *f, stringstream *ss) {
+    m_replayer->flush();
+    return true;
+  }
+
+private:
+  Replayer *m_replayer;
+};
+
+} // anonymous namespace
+
+class ReplayerAdminSocketHook : public AdminSocketHook {
+public:
+  // registers the "rbd mirror status|flush <name>" admin socket commands
+  ReplayerAdminSocketHook(CephContext *cct, const std::string &name,
+			  Replayer *replayer) :
+    admin_socket(cct->get_admin_socket()) {
+    // a handler is stored in 'commands' only when its registration
+    // succeeded, so the destructor unregisters exactly what was added
+    std::string status_cmd = "rbd mirror status " + name;
+    int r = admin_socket->register_command(status_cmd, status_cmd, this,
+      "get status for rbd mirror " + name);
+    if (r == 0) {
+      commands[status_cmd] = new StatusCommand(replayer);
+    }
+
+    std::string flush_cmd = "rbd mirror flush " + name;
+    r = admin_socket->register_command(flush_cmd, flush_cmd, this,
+      "flush rbd mirror " + name);
+    if (r == 0) {
+      commands[flush_cmd] = new FlushCommand(replayer);
+    }
+  }
+
+  ~ReplayerAdminSocketHook() {
+    // unregister each command, then free its handler
+    for (const auto &entry : commands) {
+      (void)admin_socket->unregister_command(entry.first);
+      delete entry.second;
+    }
+  }
+
+  bool call(std::string command, cmdmap_t& cmdmap, std::string format,
+	    bufferlist& out) {
+    Commands::const_iterator found = commands.find(command);
+    assert(found != commands.end());
+    stringstream output;
+    Formatter *formatter = Formatter::create(format);
+    bool handled = found->second->call(formatter, &output);
+    delete formatter;
+    out.append(output);
+    return handled;
+  }
+
+private:
+  typedef std::map<std::string, ReplayerAdminSocketCommand*> Commands;
+
+  AdminSocket *admin_socket;
+  Commands commands;  // command string -> handler; handlers are deleted in dtor
+};
+
 Replayer::Replayer(Threads *threads, RadosRef local_cluster,
                    const peer_t &peer, const std::vector<const char*> &args) :
   m_threads(threads),
@@ -29,12 +118,17 @@ Replayer::Replayer(Threads *threads, RadosRef local_cluster,
   m_args(args),
   m_local(local_cluster),
   m_remote(new librados::Rados),
+  m_asok_hook(nullptr),
   m_replayer_thread(this)
 {
+  CephContext *cct = static_cast<CephContext *>(m_local->cct());
+  m_asok_hook = new ReplayerAdminSocketHook(cct, m_peer.cluster_name, this);
 }
 
 Replayer::~Replayer()
 {
+  delete m_asok_hook;
+
   m_stopping.set(1);
   {
     Mutex::Locker l(m_lock);
@@ -89,15 +183,6 @@ int Replayer::init()
 
   dout(20) << "connected to " << m_peer << dendl;
 
-  std::string uuid;
-  r = m_local->cluster_fsid(&uuid);
-  if (r < 0) {
-    derr << "error retrieving local cluster uuid: " << cpp_strerror(r)
-	 << dendl;
-    return r;
-  }
-  m_client_id = uuid;
-
   // TODO: make interval configurable
   m_pool_watcher.reset(new PoolWatcher(m_remote, 30, m_lock, m_cond));
   m_pool_watcher->refresh_images();
@@ -118,7 +203,7 @@ void Replayer::run()
   }
 
   // Stopping
-  map<int64_t, set<string> > empty_sources;
+  PoolImageIds empty_sources;
   while (true) {
     Mutex::Locker l(m_lock);
     set_sources(empty_sources);
@@ -129,7 +214,53 @@ void Replayer::run()
   }
 }
 
-void Replayer::set_sources(const map<int64_t, set<string> > &images)
+void Replayer::print_status(Formatter *f, stringstream *ss)
+{
+  dout(20) << "enter" << dendl;
+
+  Mutex::Locker locker(m_lock);
+
+  if (f != nullptr) {
+    f->open_object_section("replayer_status");
+    f->dump_stream("peer") << m_peer;
+    f->open_array_section("image_replayers");
+  }
+
+  // visit every image replayer of every pool; f may be null here
+  for (auto &pool_entry : m_images) {
+    for (auto &image_entry : pool_entry.second) {
+      auto &image_replayer = image_entry.second;
+      image_replayer->print_status(f, ss);
+    }
+  }
+
+  if (f != nullptr) {
+    f->close_section();  // image_replayers
+    f->close_section();  // replayer_status
+    f->flush(*ss);
+  }
+}
+
+void Replayer::flush()
+{
+  dout(20) << "enter" << dendl;
+
+  Mutex::Locker locker(m_lock);
+
+  if (m_stopping.read()) {
+    return;
+  }
+
+  // flush every image replayer of every pool while holding m_lock
+  for (auto &pool_entry : m_images) {
+    for (auto &image_entry : pool_entry.second) {
+      auto &image_replayer = image_entry.second;
+      image_replayer->flush();
+    }
+  }
+}
+
+void Replayer::set_sources(const PoolImageIds &pool_image_ids)
 {
   dout(20) << "enter" << dendl;
 
@@ -137,24 +268,34 @@ void Replayer::set_sources(const map<int64_t, set<string> > &images)
   for (auto it = m_images.begin(); it != m_images.end();) {
     int64_t pool_id = it->first;
     auto &pool_images = it->second;
-    if (images.find(pool_id) == images.end()) {
+
+    // pool has no mirrored images
+    if (pool_image_ids.find(pool_id) == pool_image_ids.end()) {
       for (auto images_it = pool_images.begin();
 	   images_it != pool_images.end();) {
 	if (stop_image_replayer(images_it->second)) {
-	  pool_images.erase(images_it++);
-	}
+	  images_it = pool_images.erase(images_it);
+	} else {
+          ++images_it;
+        }
       }
       if (pool_images.empty()) {
-	m_images.erase(it++);
+	it = m_images.erase(it);
+      } else {
+        ++it;
       }
       continue;
     }
+
+    // shut down replayers for non-mirrored images
     for (auto images_it = pool_images.begin();
 	 images_it != pool_images.end();) {
-      if (images.at(pool_id).find(images_it->first) ==
-	  images.at(pool_id).end()) {
+      auto &image_ids = pool_image_ids.at(pool_id);
+      if (image_ids.find(ImageIds(images_it->first)) == image_ids.end()) {
 	if (stop_image_replayer(images_it->second)) {
-	  pool_images.erase(images_it++);
+	  images_it = pool_images.erase(images_it);
+	} else {
+	  ++images_it;
 	}
       } else {
 	++images_it;
@@ -163,7 +304,8 @@ void Replayer::set_sources(const map<int64_t, set<string> > &images)
     ++it;
   }
 
-  for (const auto &kv : images) {
+  // (re)start new image replayers
+  for (const auto &kv : pool_image_ids) {
     int64_t pool_id = kv.first;
 
     // TODO: clean up once remote peer -> image replayer refactored
@@ -183,27 +325,39 @@ void Replayer::set_sources(const map<int64_t, set<string> > &images)
       continue;
     }
 
+    std::string local_mirror_uuid;
+    r = librbd::cls_client::mirror_uuid_get(&local_ioctx, &local_mirror_uuid);
+    if (r < 0) {
+      derr << "failed to retrieve local mirror uuid from pool "
+        << local_ioctx.get_pool_name() << ": " << cpp_strerror(r) << dendl;
+      continue;
+    }
+
+    std::string remote_mirror_uuid;
+    r = librbd::cls_client::mirror_uuid_get(&remote_ioctx, &remote_mirror_uuid);
+    if (r < 0) {
+      derr << "failed to retrieve remote mirror uuid from pool "
+        << remote_ioctx.get_pool_name() << ": " << cpp_strerror(r) << dendl;
+      continue;
+    }
+
     // create entry for pool if it doesn't exist
     auto &pool_replayers = m_images[pool_id];
     for (const auto &image_id : kv.second) {
-      auto it = pool_replayers.find(image_id);
+      auto it = pool_replayers.find(image_id.id);
       if (it == pool_replayers.end()) {
-	unique_ptr<ImageReplayer> image_replayer(new ImageReplayer(m_threads,
-								   m_local,
-								   m_remote,
-								   m_client_id,
-								   local_ioctx.get_id(),
-								   pool_id,
-								   image_id));
+	unique_ptr<ImageReplayer<> > image_replayer(new ImageReplayer<>(
+          m_threads, m_local, m_remote, local_mirror_uuid, remote_mirror_uuid,
+          local_ioctx.get_id(), pool_id, image_id.id, image_id.global_id));
 	it = pool_replayers.insert(
-	  std::make_pair(image_id, std::move(image_replayer))).first;
+	  std::make_pair(image_id.id, std::move(image_replayer))).first;
       }
       start_image_replayer(it->second);
     }
   }
 }
 
-void Replayer::start_image_replayer(unique_ptr<ImageReplayer> &image_replayer)
+void Replayer::start_image_replayer(unique_ptr<ImageReplayer<> > &image_replayer)
 {
   if (!image_replayer->is_stopped()) {
     return;
@@ -212,7 +366,7 @@ void Replayer::start_image_replayer(unique_ptr<ImageReplayer> &image_replayer)
   image_replayer->start();
 }
 
-bool Replayer::stop_image_replayer(unique_ptr<ImageReplayer> &image_replayer)
+bool Replayer::stop_image_replayer(unique_ptr<ImageReplayer<> > &image_replayer)
 {
   if (image_replayer->is_stopped()) {
     return true;
diff --git a/src/tools/rbd_mirror/Replayer.h b/src/tools/rbd_mirror/Replayer.h
index 83748b9..f7c623b 100644
--- a/src/tools/rbd_mirror/Replayer.h
+++ b/src/tools/rbd_mirror/Replayer.h
@@ -24,6 +24,7 @@ namespace rbd {
 namespace mirror {
 
 struct Threads;
+class ReplayerAdminSocketHook;
 
 /**
  * Controls mirroring for a single remote cluster.
@@ -38,13 +39,18 @@ public:
 
   int init();
   void run();
-  void shutdown();
+
+  void print_status(Formatter *f, stringstream *ss);
+  void flush();
 
 private:
-  void set_sources(const std::map<int64_t, std::set<std::string> > &images);
+  typedef PoolWatcher::ImageIds ImageIds;
+  typedef PoolWatcher::PoolImageIds PoolImageIds;
+
+  void set_sources(const PoolImageIds &pool_image_ids);
 
-  void start_image_replayer(unique_ptr<ImageReplayer> &image_replayer);
-  bool stop_image_replayer(unique_ptr<ImageReplayer> &image_replayer);
+  void start_image_replayer(unique_ptr<ImageReplayer<> > &image_replayer);
+  bool stop_image_replayer(unique_ptr<ImageReplayer<> > &image_replayer);
 
   Threads *m_threads;
   Mutex m_lock;
@@ -53,13 +59,13 @@ private:
 
   peer_t m_peer;
   std::vector<const char*> m_args;
-  std::string m_client_id;
   RadosRef m_local, m_remote;
   std::unique_ptr<PoolWatcher> m_pool_watcher;
   // index by pool so it's easy to tell what is affected
   // when a pool's configuration changes
   std::map<int64_t, std::map<std::string,
-			     std::unique_ptr<ImageReplayer> > > m_images;
+			     std::unique_ptr<ImageReplayer<> > > > m_images;
+  ReplayerAdminSocketHook *m_asok_hook;
 
   class ReplayerThread : public Thread {
     Replayer *m_replayer;
diff --git a/src/tools/rbd_mirror/image_replayer/BootstrapRequest.cc b/src/tools/rbd_mirror/image_replayer/BootstrapRequest.cc
index 4260c02..92f98c0 100644
--- a/src/tools/rbd_mirror/image_replayer/BootstrapRequest.cc
+++ b/src/tools/rbd_mirror/image_replayer/BootstrapRequest.cc
@@ -8,11 +8,14 @@
 #include "common/dout.h"
 #include "common/errno.h"
 #include "common/WorkQueue.h"
+#include "cls/rbd/cls_rbd_client.h"
 #include "journal/Journaler.h"
 #include "librbd/ImageCtx.h"
 #include "librbd/ImageState.h"
 #include "librbd/internal.h"
+#include "librbd/Journal.h"
 #include "librbd/Utils.h"
+#include "librbd/journal/Types.h"
 #include "tools/rbd_mirror/ImageSync.h"
 
 #define dout_subsys ceph_subsys_rbd_mirror
@@ -25,21 +28,28 @@ namespace mirror {
 namespace image_replayer {
 
 using librbd::util::create_context_callback;
+using librbd::util::create_rados_ack_callback;
 
 namespace {
 
 template <typename I>
 struct C_CreateImage : public Context {
   librados::IoCtx &local_io_ctx;
+  std::string global_image_id;
+  std::string remote_mirror_uuid;
   std::string local_image_name;
   I *remote_image_ctx;
   Context *on_finish;
 
   C_CreateImage(librados::IoCtx &local_io_ctx,
+                const std::string &global_image_id,
+                const std::string &remote_mirror_uuid,
                 const std::string &local_image_name, I *remote_image_ctx,
                 Context *on_finish)
-    : local_io_ctx(local_io_ctx), local_image_name(local_image_name),
-      remote_image_ctx(remote_image_ctx), on_finish(on_finish) {
+    : local_io_ctx(local_io_ctx), global_image_id(global_image_id),
+      remote_mirror_uuid(remote_mirror_uuid),
+      local_image_name(local_image_name), remote_image_ctx(remote_image_ctx),
+      on_finish(on_finish) {
   }
 
   virtual void finish(int r) override {
@@ -48,11 +58,24 @@ struct C_CreateImage : public Context {
     // TODO: rbd-mirror should offer a feature mask capability
     RWLock::RLocker snap_locker(remote_image_ctx->snap_lock);
     int order = remote_image_ctx->order;
-    r = librbd::create(local_io_ctx, local_image_name.c_str(),
-                       remote_image_ctx->size, false,
-                       remote_image_ctx->features, &order,
-                       remote_image_ctx->stripe_unit,
-                       remote_image_ctx->stripe_count);
+
+    CephContext *cct = reinterpret_cast<CephContext*>(local_io_ctx.cct());
+    uint64_t journal_order = cct->_conf->rbd_journal_order;
+    uint64_t journal_splay_width = cct->_conf->rbd_journal_splay_width;
+    std::string journal_pool = cct->_conf->rbd_journal_pool;
+
+    // NOTE: bid is 64bit but overflow will result due to
+    // RBD_MAX_BLOCK_NAME_SIZE being too small
+    librados::Rados rados(local_io_ctx);
+    uint64_t bid = rados.get_instance_id();
+
+    r = librbd::create_v2(local_io_ctx, local_image_name.c_str(), bid,
+                          remote_image_ctx->size, order,
+                          remote_image_ctx->features,
+                          remote_image_ctx->stripe_unit,
+                          remote_image_ctx->stripe_count,
+                          journal_order, journal_splay_width, journal_pool,
+                          global_image_id, remote_mirror_uuid);
     on_finish->complete(r);
   }
 };
@@ -65,17 +88,21 @@ BootstrapRequest<I>::BootstrapRequest(librados::IoCtx &local_io_ctx,
                                       I **local_image_ctx,
                                       const std::string &local_image_name,
                                       const std::string &remote_image_id,
+                                      const std::string &global_image_id,
                                       ContextWQ *work_queue, SafeTimer *timer,
                                       Mutex *timer_lock,
-                                      const std::string &mirror_uuid,
+                                      const std::string &local_mirror_uuid,
+                                      const std::string &remote_mirror_uuid,
                                       Journaler *journaler,
                                       MirrorPeerClientMeta *client_meta,
                                       Context *on_finish)
   : m_local_io_ctx(local_io_ctx), m_remote_io_ctx(remote_io_ctx),
     m_local_image_ctx(local_image_ctx), m_local_image_name(local_image_name),
-    m_remote_image_id(remote_image_id), m_work_queue(work_queue),
-    m_timer(timer), m_timer_lock(timer_lock), m_mirror_uuid(mirror_uuid),
-    m_journaler(journaler), m_client_meta(client_meta), m_on_finish(on_finish) {
+    m_remote_image_id(remote_image_id), m_global_image_id(global_image_id),
+    m_work_queue(work_queue), m_timer(timer), m_timer_lock(timer_lock),
+    m_local_mirror_uuid(local_mirror_uuid),
+    m_remote_mirror_uuid(remote_mirror_uuid), m_journaler(journaler),
+    m_client_meta(client_meta), m_on_finish(on_finish) {
 }
 
 template <typename I>
@@ -85,64 +112,206 @@ BootstrapRequest<I>::~BootstrapRequest() {
 
 template <typename I>
 void BootstrapRequest<I>::send() {
-  open_remote_image();
+  get_local_image_id();
 }
 
 template <typename I>
-void BootstrapRequest<I>::open_remote_image() {
+void BootstrapRequest<I>::get_local_image_id() {
   dout(20) << dendl;
 
-  // TODO: need factory method to support mocking
-  m_remote_image_ctx = new I("", m_remote_image_id, nullptr, m_remote_io_ctx,
-                             false);
+  // attempt to cross-reference a local image by the global image id
+  librados::ObjectReadOperation op;
+  librbd::cls_client::mirror_image_get_image_id_start(&op, m_global_image_id);
+
+  librados::AioCompletion *aio_comp = create_rados_ack_callback<
+    BootstrapRequest<I>, &BootstrapRequest<I>::handle_get_local_image_id>(
+      this);
+  int r = m_local_io_ctx.aio_operate(RBD_MIRRORING, aio_comp, &op, &m_out_bl);
+  assert(r == 0);
+  aio_comp->release();
+}
+
+template <typename I>
+void BootstrapRequest<I>::handle_get_local_image_id(int r) {
+  dout(20) << ": r=" << r << dendl;
+  // on success, decode the reply into m_local_image_id (r = decode result)
+  if (r == 0) {
+    bufferlist::iterator iter = m_out_bl.begin();
+    r = librbd::cls_client::mirror_image_get_image_id_finish(
+      &iter, &m_local_image_id);
+  }
+  // -ENOENT is tolerated and bootstrap continues; other errors end the request
+  if (r == -ENOENT) {
+    dout(10) << ": image not registered locally" << dendl;
+  } else if (r < 0) {
+    derr << ": failed to retreive local image id: " << cpp_strerror(r) << dendl;
+    finish(r);
+    return;
+  }
+
+  get_remote_tag_class();
+}
+
+template <typename I>
+void BootstrapRequest<I>::get_remote_tag_class() {
+  dout(20) << dendl;
 
   Context *ctx = create_context_callback<
-    BootstrapRequest<I>, &BootstrapRequest<I>::handle_open_remote_image>(
+    BootstrapRequest<I>, &BootstrapRequest<I>::handle_get_remote_tag_class>(
       this);
-  m_remote_image_ctx->state->open(ctx);
+  m_journaler->get_client(librbd::Journal<>::IMAGE_CLIENT_ID, &m_client, ctx);
 }
 
 template <typename I>
-void BootstrapRequest<I>::handle_open_remote_image(int r) {
+void BootstrapRequest<I>::handle_get_remote_tag_class(int r) {
   dout(20) << ": r=" << r << dendl;
 
   if (r < 0) {
-    derr << "failed to open remote image: " << cpp_strerror(r) << dendl;
-    m_ret_val = r;
-    close_remote_image();
+    derr << ": failed to retreive remote client: " << cpp_strerror(r) << dendl;
+    finish(r);
     return;
   }
 
-  create_local_image();
+  librbd::journal::ClientData client_data;
+  bufferlist::iterator it = m_client.data.begin();
+  try {
+    ::decode(client_data, it);
+  } catch (const buffer::error &err) {
+    derr << ": failed to decode remote client meta data: " << err.what()
+         << dendl;
+    finish(-EBADMSG);
+    return;
+  }
+
+  librbd::journal::ImageClientMeta *client_meta =
+    boost::get<librbd::journal::ImageClientMeta>(&client_data.client_meta);
+  if (client_meta == nullptr) {
+    derr << ": unknown remote client registration" << dendl;
+    finish(-EINVAL);
+    return;
+  }
+
+  m_remote_tag_class = client_meta->tag_class;
+  dout(10) << ": remote tag class=" << m_remote_tag_class << dendl;
+
+  get_client();
 }
 
 template <typename I>
-void BootstrapRequest<I>::create_local_image() {
+void BootstrapRequest<I>::get_client() {
   dout(20) << dendl;
 
-  // TODO: local image might already exist (e.g. interrupted sync)
-  //       need to determine what type of bootstrap we are performing
+  Context *ctx = create_context_callback<
+    BootstrapRequest<I>, &BootstrapRequest<I>::handle_get_client>(
+      this);
+  m_journaler->get_client(m_local_mirror_uuid, &m_client, ctx);
+}
+
+template <typename I>
+void BootstrapRequest<I>::handle_get_client(int r) {
+  dout(20) << ": r=" << r << dendl;
+  // -ENOENT falls through to registration; any other error ends the request
+  if (r == -ENOENT) {
+    dout(10) << ": client not registered" << dendl;
+  } else if (r < 0) {
+    derr << ": failed to retreive client: " << cpp_strerror(r) << dendl;
+    finish(r);
+    return;
+  } else if (decode_client_meta()) {
+    // skip registration if it already exists
+    open_remote_image();
+    return;
+  }
+  // also reached when decode_client_meta() returned false
+  register_client();
+}
+
+template <typename I>
+void BootstrapRequest<I>::register_client() {
+  dout(20) << dendl;
+
+  // record a place-holder entry
+  librbd::journal::ClientData client_data{
+    librbd::journal::MirrorPeerClientMeta{m_local_image_id}};
+  bufferlist client_data_bl;
+  ::encode(client_data, client_data_bl);
 
-  // TODO: librbd should provide an AIO image creation method -- this is
-  //       blocking so we execute in our worker thread
   Context *ctx = create_context_callback<
-    BootstrapRequest<I>, &BootstrapRequest<I>::handle_create_local_image>(
+    BootstrapRequest<I>, &BootstrapRequest<I>::handle_register_client>(
       this);
-  m_work_queue->queue(new C_CreateImage<I>(m_local_io_ctx, m_local_image_name,
-                                           m_remote_image_ctx, ctx), 0);
+  m_journaler->register_client(client_data_bl, ctx);
 }
 
 template <typename I>
-void BootstrapRequest<I>::handle_create_local_image(int r) {
+void BootstrapRequest<I>::handle_register_client(int r) {
+  dout(20) << ": r=" << r << dendl;
+
+  if (r < 0) {
+    derr << ": failed to register with remote journal: " << cpp_strerror(r)
+         << dendl;
+    finish(r);
+    return;
+  }
+
+  *m_client_meta = librbd::journal::MirrorPeerClientMeta(m_local_image_id);
+  open_remote_image();
+}
+
+template <typename I>
+void BootstrapRequest<I>::open_remote_image() {
+  dout(20) << dendl;
+
+  m_remote_image_ctx = I::create("", m_remote_image_id, nullptr,
+                                 m_remote_io_ctx, false);
+  Context *ctx = create_context_callback<
+    BootstrapRequest<I>, &BootstrapRequest<I>::handle_open_remote_image>(
+      this);
+  m_remote_image_ctx->state->open(ctx);
+}
+
+template <typename I>
+void BootstrapRequest<I>::handle_open_remote_image(int r) {
+  // deduce the class type for the journal to support unit tests
+  typedef typename std::decay<decltype(*I::journal)>::type Journal;
+
   dout(20) << ": r=" << r << dendl;
 
   if (r < 0) {
-    derr << "failed to create local image: " << cpp_strerror(r) << dendl;
+    derr << ": failed to open remote image: " << cpp_strerror(r) << dendl;
+    m_ret_val = r;
+    close_remote_image();
+    return;
+  }
+
+  // TODO: make async
+  bool tag_owner;
+  r = Journal::is_tag_owner(m_remote_image_ctx, &tag_owner);
+  if (r < 0) {
+    derr << ": failed to query remote image primary status: " << cpp_strerror(r)
+         << dendl;
     m_ret_val = r;
     close_remote_image();
     return;
   }
 
+  if (!tag_owner) {
+    dout(5) << ": remote image is not primary -- skipping image replay"
+            << dendl;
+    m_ret_val = -EREMOTEIO;
+    close_remote_image();
+    return;
+  }
+
+  // default local image name to the remote image name if not provided
+  if (m_local_image_name.empty()) {
+    m_local_image_name = m_remote_image_ctx->name;
+  }
+
+  if (m_local_image_id.empty()) {
+    create_local_image();
+    return;
+  }
+
   open_local_image();
 }
 
@@ -154,8 +323,9 @@ void BootstrapRequest<I>::open_local_image() {
     BootstrapRequest<I>, &BootstrapRequest<I>::handle_open_local_image>(
       this);
   OpenLocalImageRequest<I> *request = OpenLocalImageRequest<I>::create(
-    m_local_io_ctx, m_local_image_ctx, m_local_image_name, "", m_work_queue,
-    ctx);
+    m_local_io_ctx, m_local_image_ctx,
+    (!m_local_image_id.empty() ? std::string() : m_local_image_name),
+    m_local_image_id, m_work_queue, ctx);
   request->send();
 }
 
@@ -163,65 +333,209 @@ template <typename I>
 void BootstrapRequest<I>::handle_open_local_image(int r) {
   dout(20) << ": r=" << r << dendl;
 
-  if (r < 0) {
+  if (r == -ENOENT) {  // no local image yet: create it, then open again
+    assert(*m_local_image_ctx == nullptr);
+    dout(10) << ": local image missing" << dendl;
+    create_local_image();
+    return;
+  } else if (r == -EREMOTEIO) {
     assert(*m_local_image_ctx == nullptr);
-    derr << "failed to open local image: " << cpp_strerror(r) << dendl;
+    dout(10) << ": local image is primary -- skipping image replay" << dendl;
+    m_ret_val = r;
+    close_remote_image();
+    return;
+  } else if (r < 0) {  // any other failure: record it and unwind
+    assert(*m_local_image_ctx == nullptr);
+    derr << ": failed to open local image: " << cpp_strerror(r) << dendl;
     m_ret_val = r;
     close_remote_image();
     return;
   }
 
-  register_client();
+  update_client();
 }
 
 template <typename I>
-void BootstrapRequest<I>::register_client() {
+void BootstrapRequest<I>::remove_local_image() {
   dout(20) << dendl;
 
-  // TODO: if client fails to register newly created image to journal,
-  //       need to ensure we can recover (i.e. see if image of the same
-  //       name already exists)
+  // TODO
+}
+
+template <typename I>
+void BootstrapRequest<I>::handle_remove_local_image(int r) {
+  dout(20) << ": r=" << r << dendl;
+
+  // TODO
+}
+
+template <typename I>
+void BootstrapRequest<I>::create_local_image() {
+  dout(20) << dendl;
+
+  // TODO: librbd should provide an AIO image creation method -- this is
+  //       blocking so we execute in our worker thread
+  Context *ctx = create_context_callback<
+    BootstrapRequest<I>, &BootstrapRequest<I>::handle_create_local_image>(
+      this);
+  m_work_queue->queue(new C_CreateImage<I>(m_local_io_ctx, m_global_image_id,
+                                           m_remote_mirror_uuid,
+                                           m_local_image_name,
+                                           m_remote_image_ctx, ctx), 0);
+}
+
+template <typename I>
+void BootstrapRequest<I>::handle_create_local_image(int r) {
+  dout(20) << ": r=" << r << dendl;
+
+  if (r < 0) {
+    derr << ": failed to create local image: " << cpp_strerror(r) << dendl;
+    m_ret_val = r;
+    close_remote_image();
+    return;
+  }
+
+  m_created_local_image = true;
+  open_local_image();
+}
+
+template <typename I>
+void BootstrapRequest<I>::update_client() {
+  if (m_client_meta->image_id == (*m_local_image_ctx)->id) {
+    // already registered local image with remote journal
+    get_remote_tags();
+    return;
+  }
+  m_local_image_id = (*m_local_image_ctx)->id;
+
+  dout(20) << dendl;
 
-  librbd::journal::MirrorPeerClientMeta client_meta(*m_client_meta);
-  client_meta.image_id = (*m_local_image_ctx)->id;
+  librbd::journal::MirrorPeerClientMeta client_meta;
+  client_meta.image_id = m_local_image_id;
 
   librbd::journal::ClientData client_data(client_meta);
-  bufferlist client_data_bl;
-  ::encode(client_data, client_data_bl);
+  bufferlist data_bl;
+  ::encode(client_data, data_bl);
 
   Context *ctx = create_context_callback<
-    BootstrapRequest<I>, &BootstrapRequest<I>::handle_register_client>(
+    BootstrapRequest<I>, &BootstrapRequest<I>::handle_update_client>(
       this);
-  m_journaler->register_client(client_data_bl, ctx);
+  m_journaler->update_client(data_bl, ctx);
 }
 
 template <typename I>
-void BootstrapRequest<I>::handle_register_client(int r) {
+void BootstrapRequest<I>::handle_update_client(int r) {
   dout(20) << ": r=" << r << dendl;
 
   if (r < 0) {
-    derr << "failed to register with remote journal: " << cpp_strerror(r)
-         << dendl;
+    derr << ": failed to update client: " << cpp_strerror(r) << dendl;
+    m_ret_val = r;
+    close_local_image();
+    return;
+  }
+
+  m_client_meta->image_id = m_local_image_id;
+  get_remote_tags();
+}
+
+template <typename I>
+void BootstrapRequest<I>::get_remote_tags() {
+  if (m_created_local_image) {
+    // optimization -- no need to compare remote tags if we just created
+    // the image locally
+    image_sync();
+    return;
+  }
+
+  dout(20) << dendl;
+  // request tags of m_remote_tag_class into m_remote_tags
+  Context *ctx = create_context_callback<
+    BootstrapRequest<I>, &BootstrapRequest<I>::handle_get_remote_tags>(this);
+  m_journaler->get_tags(m_remote_tag_class, &m_remote_tags, ctx);
+}
+
+template <typename I>
+void BootstrapRequest<I>::handle_get_remote_tags(int r) {
+  dout(20) << ": r=" << r << dendl;
+
+  if (r < 0) {
+    derr << ": failed to retreive remote tags: " << cpp_strerror(r) << dendl;
+    m_ret_val = r;
     close_local_image();
     return;
   }
 
-  m_client_meta->image_id = (*m_local_image_ctx)->id;
+  // decode the remote tags
+  librbd::journal::TagData remote_tag_data;  // ends as the last tag decoded
+  for (auto &tag : m_remote_tags) {
+    try {
+      bufferlist::iterator it = tag.data.begin();
+      ::decode(remote_tag_data, it);
+    } catch (const buffer::error &err) {
+      derr << ": failed to decode remote tag: " << err.what() << dendl;
+      m_ret_val = -EBADMSG;
+      close_local_image();
+      return;
+    }
+
+    dout(10) << ": decoded remote tag: " << remote_tag_data << dendl;
+    if (remote_tag_data.mirror_uuid == librbd::Journal<>::ORPHAN_MIRROR_UUID &&
+        remote_tag_data.predecessor_mirror_uuid == m_local_mirror_uuid) {
+      // remote tag is chained off a local tag demotion
+      break;
+    }
+  }
+
+  // At this point, the local image was existing and non-primary and the remote
+  // image is primary.  Attempt to link the local image's most recent tag
+  // to the remote image's tag chain.
+  I *local_image_ctx = (*m_local_image_ctx);
+  {
+    RWLock::RLocker snap_locker(local_image_ctx->snap_lock);
+    if (local_image_ctx->journal == nullptr) {
+      derr << ": local image does not support journaling" << dendl;
+      m_ret_val = -EINVAL;
+      close_local_image();
+      return;
+    }
+
+    librbd::journal::TagData tag_data =
+      local_image_ctx->journal->get_tag_data();
+    dout(20) << ": local tag data: " << tag_data << dendl;
+
+    if (!((tag_data.mirror_uuid == librbd::Journal<I>::ORPHAN_MIRROR_UUID &&
+           remote_tag_data.mirror_uuid == librbd::Journal<I>::ORPHAN_MIRROR_UUID &&
+           remote_tag_data.predecessor_mirror_uuid == m_local_mirror_uuid) ||
+          (tag_data.mirror_uuid == m_remote_mirror_uuid &&
+           m_client_meta->state == librbd::journal::MIRROR_PEER_STATE_REPLAYING))) {
+      derr << ": split-brain detected -- skipping image replay" << dendl;
+      m_ret_val = -EEXIST;
+      close_local_image();
+      return;
+    }
+  }
+
   image_sync();
 }
 
 template <typename I>
 void BootstrapRequest<I>::image_sync() {
+  if (m_client_meta->state == librbd::journal::MIRROR_PEER_STATE_REPLAYING) {
+    // clean replay state -- no image sync required
+    close_remote_image();
+    return;
+  }
+
   dout(20) << dendl;
 
-  // TODO: need factory method to support mocking
   Context *ctx = create_context_callback<
     BootstrapRequest<I>, &BootstrapRequest<I>::handle_image_sync>(
       this);
-  ImageSync<I> *request = new ImageSync<I>(*m_local_image_ctx,
-                                           m_remote_image_ctx, m_timer,
-                                           m_timer_lock, m_mirror_uuid,
-                                           m_journaler, m_client_meta, ctx);
+  ImageSync<I> *request = ImageSync<I>::create(*m_local_image_ctx,
+                                               m_remote_image_ctx, m_timer,
+                                               m_timer_lock,
+                                               m_local_mirror_uuid, m_journaler,
+                                               m_client_meta, ctx);
   request->start();
 }
 
@@ -230,10 +544,8 @@ void BootstrapRequest<I>::handle_image_sync(int r) {
   dout(20) << ": r=" << r << dendl;
 
   if (r < 0) {
-    derr << "failed to sync remote image: " << cpp_strerror(r) << dendl;
+    derr << ": failed to sync remote image: " << cpp_strerror(r) << dendl;
     m_ret_val = r;
-    close_local_image();
-    return;
   }
 
   close_remote_image();
@@ -247,7 +559,7 @@ void BootstrapRequest<I>::close_local_image() {
     BootstrapRequest<I>, &BootstrapRequest<I>::handle_close_local_image>(
       this);
   CloseImageRequest<I> *request = CloseImageRequest<I>::create(
-    m_local_image_ctx, m_work_queue, ctx);
+    m_local_image_ctx, m_work_queue, false, ctx);
   request->send();
 }
 
@@ -256,7 +568,7 @@ void BootstrapRequest<I>::handle_close_local_image(int r) {
   dout(20) << ": r=" << r << dendl;
 
   if (r < 0) {
-    derr << "error encountered closing local image: " << cpp_strerror(r)
+    derr << ": error encountered closing local image: " << cpp_strerror(r)
          << dendl;
   }
 
@@ -271,7 +583,7 @@ void BootstrapRequest<I>::close_remote_image() {
     BootstrapRequest<I>, &BootstrapRequest<I>::handle_close_remote_image>(
       this);
   CloseImageRequest<I> *request = CloseImageRequest<I>::create(
-    &m_remote_image_ctx, m_work_queue, ctx);
+    &m_remote_image_ctx, m_work_queue, false, ctx);
   request->send();
 }
 
@@ -280,7 +592,7 @@ void BootstrapRequest<I>::handle_close_remote_image(int r) {
   dout(20) << ": r=" << r << dendl;
 
   if (r < 0) {
-    derr << "error encountered closing remote image: " << cpp_strerror(r)
+    derr << ": error encountered closing remote image: " << cpp_strerror(r)
          << dendl;
   }
 
@@ -295,6 +607,35 @@ void BootstrapRequest<I>::finish(int r) {
   delete this;
 }
 
+template <typename I>
+bool BootstrapRequest<I>::decode_client_meta() {
+  dout(20) << dendl;
+  // decode m_client.data into *m_client_meta (and m_local_image_id if set)
+  librbd::journal::ClientData client_data;
+  bufferlist::iterator it = m_client.data.begin();
+  try {
+    ::decode(client_data, it);
+  } catch (const buffer::error &err) {
+    derr << ": failed to decode client meta data: " << err.what() << dendl;
+    return true;  // NOTE(review): failure path still returns true -- confirm
+  }
+
+  librbd::journal::MirrorPeerClientMeta *client_meta =
+    boost::get<librbd::journal::MirrorPeerClientMeta>(&client_data.client_meta);
+  if (client_meta == nullptr) {
+    derr << ": unknown peer registration" << dendl;
+    return true;  // NOTE(review): also reports true on this error -- confirm
+  } else if (!client_meta->image_id.empty()) {
+    // have an image id -- use that to open the image
+    m_local_image_id = client_meta->image_id;
+  }
+  // copy the decoded meta out through the m_client_meta pointer
+  *m_client_meta = *client_meta;
+
+  dout(20) << ": client found: image_id=" << m_local_image_id << dendl;
+  return true;
+}
+
 } // namespace image_replayer
 } // namespace mirror
 } // namespace rbd
diff --git a/src/tools/rbd_mirror/image_replayer/BootstrapRequest.h b/src/tools/rbd_mirror/image_replayer/BootstrapRequest.h
index 24c6866..bf9629f 100644
--- a/src/tools/rbd_mirror/image_replayer/BootstrapRequest.h
+++ b/src/tools/rbd_mirror/image_replayer/BootstrapRequest.h
@@ -6,7 +6,9 @@
 
 #include "include/int_types.h"
 #include "include/rados/librados.hpp"
-#include "librbd/Journal.h"
+#include "cls/journal/cls_journal_types.h"
+#include "librbd/journal/TypeTraits.h"
+#include <list>
 #include <string>
 
 class Context;
@@ -15,6 +17,7 @@ class Mutex;
 class SafeTimer;
 namespace journal { class Journaler; }
 namespace librbd { class ImageCtx; }
+namespace librbd { namespace journal { struct MirrorPeerClientMeta; } }
 
 namespace rbd {
 namespace mirror {
@@ -32,25 +35,30 @@ public:
                                   ImageCtxT **local_image_ctx,
                                   const std::string &local_image_name,
                                   const std::string &remote_image_id,
+                                  const std::string &global_image_id,
                                   ContextWQ *work_queue, SafeTimer *timer,
                                   Mutex *timer_lock,
-                                  const std::string &mirror_uuid,
+                                  const std::string &local_mirror_uuid,
+                                  const std::string &remote_mirror_uuid,
                                   Journaler *journaler,
                                   MirrorPeerClientMeta *client_meta,
                                   Context *on_finish) {
     return new BootstrapRequest(local_io_ctx, remote_io_ctx, local_image_ctx,
-                                local_image_name, remote_image_id, work_queue,
-                                timer, timer_lock, mirror_uuid, journaler,
-                                client_meta, on_finish);
+                                local_image_name, remote_image_id,
+                                global_image_id, work_queue, timer, timer_lock,
+                                local_mirror_uuid, remote_mirror_uuid,
+                                journaler, client_meta, on_finish);
   }
 
   BootstrapRequest(librados::IoCtx &local_io_ctx,
                    librados::IoCtx &remote_io_ctx,
                    ImageCtxT **local_image_ctx,
                    const std::string &local_image_name,
-                   const std::string &remote_image_id, ContextWQ *work_queue,
+                   const std::string &remote_image_id,
+                   const std::string &global_image_id, ContextWQ *work_queue,
                    SafeTimer *timer, Mutex *timer_lock,
-                   const std::string &mirror_uuid, Journaler *journaler,
+                   const std::string &local_mirror_uuid,
+                   const std::string &remote_mirror_uuid, Journaler *journaler,
                    MirrorPeerClientMeta *client_meta, Context *on_finish);
   ~BootstrapRequest();
 
@@ -63,21 +71,45 @@ private:
    * <start>
    *    |
    *    v
-   * OPEN_REMOTE_IMAGE  * * * * * * * * * * * *
+   * GET_LOCAL_IMAGE_ID * * * * * * * * * * * *
    *    |                                     *
    *    v                                     *
-   * CREATE_LOCAL_IMAGE * * * * * * * * * * * * (error)
+   * GET_REMOTE_TAG_CLASS * * * * * * * * * * *
    *    |                                     *
    *    v                                     *
-   * OPEN_LOCAL_IMAGE * * * * * * * * * * * * *
+   * GET_CLIENT * * * * * * * * * * * * * * * *
+   *    |                                     *
+   *    v (skip if not needed)                * (error)
+   * REGISTER_CLIENT  * * * * * * * * * * * * *
+   *    |                                     *
+   *    v                                     *
+   * OPEN_REMOTE_IMAGE  * * * * * * * * * * * *
    *    |                                     *
    *    v                                     *
-   * REGISTER_CLIENT  * * * *                 *
-   *    |                   *                 *
-   *    v                   v                 *
+   * OPEN_LOCAL_IMAGE * * * * * * * * * * * * *
+   *    |   .   ^                             *
+   *    |   .   |                             *
+   *    |   .   \-----------------------\     *
+   *    |   .                           |     *
+   *    |   . (image sync requested)    |     *
+   *    |   . . > REMOVE_LOCAL_IMAGE  * * * * *
+   *    |   .                   |       |     *
+   *    |   . (image doesn't    |       |     *
+   *    |   .  exist)           v       |     *
+   *    |   . . > CREATE_LOCAL_IMAGE  * * * * *
+   *    |             |                 |     *
+   *    |             \-----------------/     *
+   *    |                                     *
+   *    v (skip if not needed)                *
+   * UPDATE_CLIENT  * * * * * * * *           *
+   *    |                         *           *
+   *    v (skip if not needed)    *           *
+   * GET_REMOTE_TAGS  * * * * * * *           *
+   *    |                         *           *
+   *    v (skip if not needed)    v           *
    * IMAGE_SYNC * * * > CLOSE_LOCAL_IMAGE     *
-   *    |                   |                 *
-   *    |     /-------------/                 *
+   *    |                         |           *
+   *    |     /-------------------/           *
    *    |     |                               *
    *    v     v                               *
    * CLOSE_REMOTE_IMAGE < * * * * * * * * * * *
@@ -87,33 +119,62 @@ private:
    *
    * @endverbatim
    */
+  typedef std::list<cls::journal::Tag> Tags;
+
   librados::IoCtx &m_local_io_ctx;
   librados::IoCtx &m_remote_io_ctx;
   ImageCtxT **m_local_image_ctx;
   std::string m_local_image_name;
+  std::string m_local_image_id;
   std::string m_remote_image_id;
+  std::string m_global_image_id;
   ContextWQ *m_work_queue;
   SafeTimer *m_timer;
   Mutex *m_timer_lock;
-  std::string m_mirror_uuid;
+  std::string m_local_mirror_uuid;
+  std::string m_remote_mirror_uuid;
   Journaler *m_journaler;
   MirrorPeerClientMeta *m_client_meta;
   Context *m_on_finish;
 
+  Tags m_remote_tags;
+  cls::journal::Client m_client;
+  uint64_t m_remote_tag_class = 0;
   ImageCtxT *m_remote_image_ctx = nullptr;
+  bool m_created_local_image = false;
   int m_ret_val = 0;
 
+  bufferlist m_out_bl;
+
+  void get_local_image_id();
+  void handle_get_local_image_id(int r);
+
+  void get_remote_tag_class();
+  void handle_get_remote_tag_class(int r);
+
+  void get_client();
+  void handle_get_client(int r);
+
+  void register_client();
+  void handle_register_client(int r);
+
   void open_remote_image();
   void handle_open_remote_image(int r);
 
+  void open_local_image();
+  void handle_open_local_image(int r);
+
+  void remove_local_image();
+  void handle_remove_local_image(int r);
+
   void create_local_image();
   void handle_create_local_image(int r);
 
-  void open_local_image();
-  void handle_open_local_image(int r);
+  void update_client();
+  void handle_update_client(int r);
 
-  void register_client();
-  void handle_register_client(int r);
+  void get_remote_tags();
+  void handle_get_remote_tags(int r);
 
   void image_sync();
   void handle_image_sync(int r);
@@ -126,6 +187,7 @@ private:
 
   void finish(int r);
 
+  bool decode_client_meta();
 };
 
 } // namespace image_replayer
diff --git a/src/tools/rbd_mirror/image_replayer/CloseImageRequest.cc b/src/tools/rbd_mirror/image_replayer/CloseImageRequest.cc
index 3751245..247c629 100644
--- a/src/tools/rbd_mirror/image_replayer/CloseImageRequest.cc
+++ b/src/tools/rbd_mirror/image_replayer/CloseImageRequest.cc
@@ -22,8 +22,9 @@ using librbd::util::create_context_callback;
 
 template <typename I>
 CloseImageRequest<I>::CloseImageRequest(I **image_ctx, ContextWQ *work_queue,
-                                        Context *on_finish)
-  : m_image_ctx(image_ctx), m_work_queue(work_queue), m_on_finish(on_finish) {
+                                        bool destroy_only, Context *on_finish)
+  : m_image_ctx(image_ctx), m_work_queue(work_queue),
+    m_destroy_only(destroy_only), m_on_finish(on_finish) {
 }
 
 template <typename I>
@@ -33,6 +34,11 @@ void CloseImageRequest<I>::send() {
 
 template <typename I>
 void CloseImageRequest<I>::close_image() {
+  if (m_destroy_only) {
+    switch_thread_context();
+    return;
+  }
+
   dout(20) << dendl;
 
   Context *ctx = create_context_callback<
@@ -45,7 +51,7 @@ void CloseImageRequest<I>::handle_close_image(int r) {
   dout(20) << ": r=" << r << dendl;
 
   if (r < 0) {
-    derr << "error encountered while closing image: " << cpp_strerror(r)
+    derr << ": error encountered while closing image: " << cpp_strerror(r)
          << dendl;
   }
 
diff --git a/src/tools/rbd_mirror/image_replayer/CloseImageRequest.h b/src/tools/rbd_mirror/image_replayer/CloseImageRequest.h
index 8c43297..dddad47 100644
--- a/src/tools/rbd_mirror/image_replayer/CloseImageRequest.h
+++ b/src/tools/rbd_mirror/image_replayer/CloseImageRequest.h
@@ -20,12 +20,13 @@ template <typename ImageCtxT = librbd::ImageCtx>
 class CloseImageRequest {
 public:
   static CloseImageRequest* create(ImageCtxT **image_ctx, ContextWQ *work_queue,
-                                   Context *on_finish) {
-    return new CloseImageRequest(image_ctx, work_queue, on_finish);
+                                   bool destroy_only, Context *on_finish) {
+    return new CloseImageRequest(image_ctx, work_queue, destroy_only,
+                                 on_finish);
   }
 
   CloseImageRequest(ImageCtxT **image_ctx, ContextWQ *work_queue,
-                    Context *on_finish);
+                    bool destroy_only, Context *on_finish);
 
   void send();
 
@@ -36,7 +37,7 @@ private:
    * <start>
    *    |
    *    v
-   * CLOSE_IMAGE
+   * CLOSE_IMAGE (skip if destroy_only)
    *    |
    *    v
    * SWITCH_CONTEXT
@@ -48,6 +49,7 @@ private:
    */
   ImageCtxT **m_image_ctx;
   ContextWQ *m_work_queue;
+  bool m_destroy_only;
   Context *m_on_finish;
 
   void close_image();
diff --git a/src/tools/rbd_mirror/image_replayer/OpenLocalImageRequest.cc b/src/tools/rbd_mirror/image_replayer/OpenLocalImageRequest.cc
index 4e40c4e..9367ed6 100644
--- a/src/tools/rbd_mirror/image_replayer/OpenLocalImageRequest.cc
+++ b/src/tools/rbd_mirror/image_replayer/OpenLocalImageRequest.cc
@@ -8,12 +8,16 @@
 #include "librbd/ExclusiveLock.h"
 #include "librbd/ImageCtx.h"
 #include "librbd/ImageState.h"
+#include "librbd/Journal.h"
 #include "librbd/Utils.h"
+#include "librbd/exclusive_lock/Policy.h"
+#include "librbd/journal/Policy.h"
+#include <type_traits>
 
 #define dout_subsys ceph_subsys_rbd_mirror
 #undef dout_prefix
 #define dout_prefix *_dout << "rbd::mirror::image_replayer::OpenLocalImageRequest: " \
-                           << this << " " << __func__
+                           << this << " " << __func__ << " "
 
 namespace rbd {
 namespace mirror {
@@ -21,6 +25,39 @@ namespace image_replayer {
 
 using librbd::util::create_context_callback;
 
+namespace {
+
+struct MirrorExclusiveLockPolicy : public librbd::exclusive_lock::Policy {
+  // exclusive lock policy for the local image (set in send_open_image)
+  virtual void lock_requested(bool force) {
+    // TODO: interlock is being requested (e.g. local promotion)
+    // Wait for demote event from peer or abort replay on forced
+    // promotion.
+  }
+  // NOTE(review): lock_requested is currently a no-op; 'force' is unused
+};
+
+struct MirrorJournalPolicy : public librbd::journal::Policy {
+  ContextWQ *work_queue;  // completes allocate_tag_on_lock callbacks
+
+  MirrorJournalPolicy(ContextWQ *work_queue) : work_queue(work_queue) {
+  }
+  // no tag is allocated here: on_finish is simply queued with result 0
+  virtual void allocate_tag_on_lock(Context *on_finish) {
+    // rbd-mirror will manually create tags by copying them from the peer
+    work_queue->queue(on_finish, 0);
+  }
+  // NOTE(review): on_finish is never completed below -- confirm callers cope
+  virtual void cancel_external_replay(Context *on_finish) {
+    // TODO: journal is being closed due to a comms error.  This means
+    // the journal is being closed and the exclusive lock is being released.
+    // ImageReplayer needs to restart.
+  }
+
+};
+
+} // anonymous namespace
+
 template <typename I>
 OpenLocalImageRequest<I>::OpenLocalImageRequest(librados::IoCtx &local_io_ctx,
                                                 I **local_image_ctx,
@@ -42,9 +79,16 @@ template <typename I>
 void OpenLocalImageRequest<I>::send_open_image() {
   dout(20) << dendl;
 
-  *m_local_image_ctx = new librbd::ImageCtx(m_local_image_name,
-                                            m_local_image_id, nullptr,
-                                            m_local_io_ctx, false);
+  *m_local_image_ctx = I::create(m_local_image_name, m_local_image_id, nullptr,
+                                 m_local_io_ctx, false);
+  {
+    RWLock::WLocker owner_locker((*m_local_image_ctx)->owner_lock);
+    RWLock::WLocker snap_locker((*m_local_image_ctx)->snap_lock);
+    (*m_local_image_ctx)->set_exclusive_lock_policy(
+      new MirrorExclusiveLockPolicy());
+    (*m_local_image_ctx)->set_journal_policy(
+      new MirrorJournalPolicy(m_work_queue));
+  }
 
   Context *ctx = create_context_callback<
     OpenLocalImageRequest<I>, &OpenLocalImageRequest<I>::handle_open_image>(
@@ -57,13 +101,9 @@ void OpenLocalImageRequest<I>::handle_open_image(int r) {
   dout(20) << ": r=" << r << dendl;
 
   if (r < 0) {
-    derr << "failed to open image '" << m_local_image_id << "': "
+    derr << ": failed to open image '" << m_local_image_id << "': "
          << cpp_strerror(r) << dendl;
-    send_close_image(r);
-    return;
-  } else if ((*m_local_image_ctx)->exclusive_lock == nullptr) {
-    derr << "image does not support exclusive lock" << dendl;
-    send_close_image(-EINVAL);
+    send_close_image(true, r);
     return;
   }
 
@@ -72,13 +112,39 @@ void OpenLocalImageRequest<I>::handle_open_image(int r) {
 
 template <typename I>
 void OpenLocalImageRequest<I>::send_lock_image() {
+  // deduce the class type for the journal to support unit tests
+  typedef typename std::decay<decltype(*I::journal)>::type Journal;
+
   dout(20) << dendl;
 
+  RWLock::RLocker owner_locker((*m_local_image_ctx)->owner_lock);
+  if ((*m_local_image_ctx)->exclusive_lock == nullptr) {
+    derr << ": image does not support exclusive lock" << dendl;
+    send_close_image(false, -EINVAL);
+    return;
+  }
+
+  // TODO: make an async version (is_tag_owner is called with owner_lock held)
+  bool tag_owner;
+  int r = Journal::is_tag_owner(*m_local_image_ctx, &tag_owner);
+  if (r < 0) {
+    derr << ": failed to query journal: " << cpp_strerror(r) << dendl;
+    send_close_image(false, r);
+    return;
+  }
+
+  // if the local image owns the tag -- don't steal the lock since
+  // we aren't going to mirror peer data into this image anyway
+  if (tag_owner) {
+    dout(10) << ": local image is primary -- skipping image replay" << dendl;
+    send_close_image(false, -EREMOTEIO);
+    return;
+  }
+  // not primary: request the exclusive lock; result goes to handle_lock_image
   Context *ctx = create_context_callback<
     OpenLocalImageRequest<I>, &OpenLocalImageRequest<I>::handle_lock_image>(
       this);
 
-  RWLock::RLocker owner_locker((*m_local_image_ctx)->owner_lock);
   (*m_local_image_ctx)->exclusive_lock->request_lock(ctx);
 }
 
@@ -87,14 +153,14 @@ void OpenLocalImageRequest<I>::handle_lock_image(int r) {
   dout(20) << ": r=" << r << dendl;
 
   if (r < 0) {
-    derr << "failed to lock image '" << m_local_image_id << "': "
+    derr << ": failed to lock image '" << m_local_image_id << "': "
        << cpp_strerror(r) << dendl;
-    send_close_image(r);
+    send_close_image(false, r);
     return;
   } else if ((*m_local_image_ctx)->exclusive_lock == nullptr ||
              !(*m_local_image_ctx)->exclusive_lock->is_lock_owner()) {
-    derr << "image is not locked" << dendl;
-    send_close_image(-EBUSY);
+    derr << ": image is not locked" << dendl;
+    send_close_image(false, -EBUSY);
     return;
   }
 
@@ -102,7 +168,7 @@ void OpenLocalImageRequest<I>::handle_lock_image(int r) {
 }
 
 template <typename I>
-void OpenLocalImageRequest<I>::send_close_image(int r) {
+void OpenLocalImageRequest<I>::send_close_image(bool destroy_only, int r) {
   dout(20) << dendl;
 
   if (m_ret_val == 0 && r < 0) {
@@ -113,7 +179,7 @@ void OpenLocalImageRequest<I>::send_close_image(int r) {
     OpenLocalImageRequest<I>, &OpenLocalImageRequest<I>::handle_close_image>(
       this);
   CloseImageRequest<I> *request = CloseImageRequest<I>::create(
-    m_local_image_ctx, m_work_queue, ctx);
+    m_local_image_ctx, m_work_queue, destroy_only, ctx);
   request->send();
 }
 
diff --git a/src/tools/rbd_mirror/image_replayer/OpenLocalImageRequest.h b/src/tools/rbd_mirror/image_replayer/OpenLocalImageRequest.h
index daf5c0a..e40b1c2 100644
--- a/src/tools/rbd_mirror/image_replayer/OpenLocalImageRequest.h
+++ b/src/tools/rbd_mirror/image_replayer/OpenLocalImageRequest.h
@@ -46,13 +46,13 @@ private:
    * <start>
    *    |
    *    v
-   * OPEN_IMAGE * * * * * * *
-   *    |                   *
-   *    v                   v
+   * OPEN_IMAGE * * * * * * * *
+   *    |                     *
+   *    v (skip if primary)   v
    * LOCK_IMAGE * * * > CLOSE_IMAGE
-   *    |                   |
-   *    v                   |
-   * <finish> <-------------/
+   *    |                     |
+   *    v                     |
+   * <finish> <---------------/
    *
    * @endverbatim
    */
@@ -71,7 +71,7 @@ private:
   void send_lock_image();
   void handle_lock_image(int r);
 
-  void send_close_image(int r);
+  void send_close_image(bool destroy_only, int r);
   void handle_close_image(int r);
 
   void finish(int r);
diff --git a/src/tools/rbd_mirror/image_sync/ImageCopyRequest.cc b/src/tools/rbd_mirror/image_sync/ImageCopyRequest.cc
index aedf2f5..f627f17 100644
--- a/src/tools/rbd_mirror/image_sync/ImageCopyRequest.cc
+++ b/src/tools/rbd_mirror/image_sync/ImageCopyRequest.cc
@@ -69,7 +69,7 @@ void ImageCopyRequest<I>::send_update_max_object_count() {
     }
   }
 
-  if (max_objects == m_client_meta->sync_object_count) {
+  if (max_objects <= m_client_meta->sync_object_count) {
     send_object_copies();
     return;
   }
@@ -96,18 +96,14 @@ void ImageCopyRequest<I>::handle_update_max_object_count(int r) {
   ldout(cct, 20) << ": r=" << r << dendl;
 
   if (r < 0) {
-    lderr(cct) << "failed to update client data: " << cpp_strerror(r) << dendl;
+    lderr(cct) << ": failed to update client data: " << cpp_strerror(r)
+               << dendl;
     finish(r);
     return;
   }
 
   // update provided meta structure to reflect reality
   m_client_meta->sync_object_count = m_client_meta_copy.sync_object_count;
-  m_object_no = 0;
-  if (m_sync_point->object_number) {
-    m_object_no = *m_sync_point->object_number + 1;
-  }
-  m_end_object_no = m_client_meta_copy.sync_object_count;
 
   send_object_copies();
 }
@@ -115,6 +111,16 @@ void ImageCopyRequest<I>::handle_update_max_object_count(int r) {
 template <typename I>
 void ImageCopyRequest<I>::send_object_copies() {
   CephContext *cct = m_local_image_ctx->cct;
+
+  m_object_no = 0;
+  if (m_sync_point->object_number) {
+    m_object_no = *m_sync_point->object_number + 1;
+  }
+  m_end_object_no = m_client_meta->sync_object_count;
+
+  dout(20) << ": start_object=" << m_object_no << ", "
+           << "end_object=" << m_end_object_no << dendl;
+
   bool complete;
   {
     Mutex::Locker locker(m_lock);
@@ -166,7 +172,7 @@ void ImageCopyRequest<I>::handle_object_copy(int r) {
     --m_current_ops;
 
     if (r < 0) {
-      lderr(cct) << "object copy failed: " << cpp_strerror(r) << dendl;
+      lderr(cct) << ": object copy failed: " << cpp_strerror(r) << dendl;
       if (m_ret_val == 0) {
         m_ret_val = r;
       }
@@ -216,7 +222,8 @@ void ImageCopyRequest<I>::handle_flush_sync_point(int r) {
   if (r < 0) {
     *m_client_meta = m_client_meta_copy;
 
-    lderr(cct) << "failed to update client data: " << cpp_strerror(r) << dendl;
+    lderr(cct) << ": failed to update client data: " << cpp_strerror(r)
+               << dendl;
     finish(r);
     return;
   }
@@ -243,7 +250,7 @@ int ImageCopyRequest<I>::compute_snap_map() {
     RWLock::RLocker snap_locker(m_remote_image_ctx->snap_lock);
     snap_id_end = m_remote_image_ctx->get_snap_id(m_sync_point->snap_name);
     if (snap_id_end == CEPH_NOSNAP) {
-      lderr(cct) << "failed to locate snapshot: "
+      lderr(cct) << ": failed to locate snapshot: "
                  << m_sync_point->snap_name << dendl;
       return -ENOENT;
     }
@@ -252,7 +259,7 @@ int ImageCopyRequest<I>::compute_snap_map() {
       snap_id_start = m_remote_image_ctx->get_snap_id(
         m_sync_point->from_snap_name);
       if (snap_id_start == CEPH_NOSNAP) {
-        lderr(cct) << "failed to locate from snapshot: "
+        lderr(cct) << ": failed to locate from snapshot: "
                    << m_sync_point->from_snap_name << dendl;
         return -ENOENT;
       }
@@ -273,7 +280,7 @@ int ImageCopyRequest<I>::compute_snap_map() {
   }
 
   if (m_snap_map.empty()) {
-    lderr(cct) << "failed to map snapshots within boundary" << dendl;
+    lderr(cct) << ": failed to map snapshots within boundary" << dendl;
     return -EINVAL;
   }
 
diff --git a/src/tools/rbd_mirror/image_sync/ImageCopyRequest.h b/src/tools/rbd_mirror/image_sync/ImageCopyRequest.h
index f2d1396..0d1f5e3 100644
--- a/src/tools/rbd_mirror/image_sync/ImageCopyRequest.h
+++ b/src/tools/rbd_mirror/image_sync/ImageCopyRequest.h
@@ -7,12 +7,13 @@
 #include "include/int_types.h"
 #include "include/rados/librados.hpp"
 #include "common/Mutex.h"
-#include "librbd/Journal.h"
 #include "librbd/journal/Types.h"
+#include "librbd/journal/TypeTraits.h"
 #include <map>
 #include <vector>
 
 class Context;
+class SafeTimer;
 namespace journal { class Journaler; }
 namespace librbd { struct ImageCtx; }
 
diff --git a/src/tools/rbd_mirror/image_sync/ObjectCopyRequest.cc b/src/tools/rbd_mirror/image_sync/ObjectCopyRequest.cc
index c49beed..7cf1292 100644
--- a/src/tools/rbd_mirror/image_sync/ObjectCopyRequest.cc
+++ b/src/tools/rbd_mirror/image_sync/ObjectCopyRequest.cc
@@ -74,7 +74,7 @@ void ObjectCopyRequest<I>::handle_list_snaps(int r) {
     return;
   }
   if (r < 0) {
-    lderr(cct) << "failed to list snaps: " << cpp_strerror(r) << dendl;
+    lderr(cct) << ": failed to list snaps: " << cpp_strerror(r) << dendl;
     finish(r);
     return;
   }
@@ -142,7 +142,7 @@ void ObjectCopyRequest<I>::handle_read_object(int r) {
   ldout(cct, 20) << ": r=" << r << dendl;
 
   if (r < 0) {
-    lderr(cct) << "failed to read from remote object: " << cpp_strerror(r)
+    lderr(cct) << ": failed to read from remote object: " << cpp_strerror(r)
                << dendl;
     finish(r);
     return;
@@ -208,7 +208,7 @@ void ObjectCopyRequest<I>::handle_write_object(int r) {
     r = 0;
   }
   if (r < 0) {
-    lderr(cct) << "failed to write to local object: " << cpp_strerror(r)
+    lderr(cct) << ": failed to write to local object: " << cpp_strerror(r)
                << dendl;
     finish(r);
     return;
diff --git a/src/tools/rbd_mirror/image_sync/SnapshotCopyRequest.cc b/src/tools/rbd_mirror/image_sync/SnapshotCopyRequest.cc
index 63202bc..53dc1e3 100644
--- a/src/tools/rbd_mirror/image_sync/SnapshotCopyRequest.cc
+++ b/src/tools/rbd_mirror/image_sync/SnapshotCopyRequest.cc
@@ -73,7 +73,7 @@ void SnapshotCopyRequest<I>::send_snap_remove() {
                      [](const std::pair<librados::snap_t, librbd::SnapInfo>& pair) {
             return pair.second.parent.spec.pool_id != -1;
           }) != m_remote_image_ctx->snap_info.end()) {
-      lderr(cct) << "cloned images are not currentl supported" << dendl;
+      lderr(cct) << ": cloned images are not currently supported" << dendl;
       finish(-EINVAL);
       return;
     }
@@ -118,7 +118,7 @@ void SnapshotCopyRequest<I>::handle_snap_remove(int r) {
   ldout(cct, 20) << ": r=" << r << dendl;
 
   if (r < 0) {
-    lderr(cct) << "failed to remove snapshot '" << m_snap_name << "': "
+    lderr(cct) << ": failed to remove snapshot '" << m_snap_name << "': "
                << cpp_strerror(r) << dendl;
     finish(r);
     return;
@@ -168,7 +168,7 @@ void SnapshotCopyRequest<I>::handle_snap_create(int r) {
   ldout(cct, 20) << ": r=" << r << dendl;
 
   if (r < 0) {
-    lderr(cct) << "failed to create snapshot '" << m_snap_name << "': "
+    lderr(cct) << ": failed to create snapshot '" << m_snap_name << "': "
                << cpp_strerror(r) << dendl;
     finish(r);
     return;
@@ -215,7 +215,8 @@ void SnapshotCopyRequest<I>::handle_update_client(int r) {
   ldout(cct, 20) << ": r=" << r << dendl;
 
   if (r < 0) {
-    lderr(cct) << "failed to update client data: " << cpp_strerror(r) << dendl;
+    lderr(cct) << ": failed to update client data: " << cpp_strerror(r)
+               << dendl;
     finish(r);
     return;
   }
diff --git a/src/tools/rbd_mirror/image_sync/SnapshotCopyRequest.h b/src/tools/rbd_mirror/image_sync/SnapshotCopyRequest.h
index 44368f2..b94612b 100644
--- a/src/tools/rbd_mirror/image_sync/SnapshotCopyRequest.h
+++ b/src/tools/rbd_mirror/image_sync/SnapshotCopyRequest.h
@@ -8,7 +8,7 @@
 #include "include/rados/librados.hpp"
 #include "common/snap_types.h"
 #include "librbd/ImageCtx.h"
-#include "librbd/Journal.h"
+#include "librbd/journal/TypeTraits.h"
 #include <map>
 #include <set>
 #include <string>
diff --git a/src/tools/rbd_mirror/image_sync/SyncPointCreateRequest.cc b/src/tools/rbd_mirror/image_sync/SyncPointCreateRequest.cc
index 7446049..589f532 100644
--- a/src/tools/rbd_mirror/image_sync/SyncPointCreateRequest.cc
+++ b/src/tools/rbd_mirror/image_sync/SyncPointCreateRequest.cc
@@ -79,7 +79,8 @@ void SyncPointCreateRequest<I>::handle_update_client(int r) {
   ldout(cct, 20) << ": r=" << r << dendl;
 
   if (r < 0) {
-    lderr(cct) << "failed to update client data: " << cpp_strerror(r) << dendl;
+    lderr(cct) << ": failed to update client data: " << cpp_strerror(r)
+               << dendl;
     finish(r);
     return;
   }
@@ -107,7 +108,7 @@ void SyncPointCreateRequest<I>::handle_refresh_image(int r) {
   ldout(cct, 20) << ": r=" << r << dendl;
 
   if (r < 0) {
-    lderr(cct) << "remote image refresh failed: " << cpp_strerror(r) << dendl;
+    lderr(cct) << ": remote image refresh failed: " << cpp_strerror(r) << dendl;
     finish(r);
     return;
   }
@@ -138,7 +139,7 @@ void SyncPointCreateRequest<I>::handle_create_snap(int r) {
     send_update_client();
     return;
   } else if (r < 0) {
-    lderr(cct) << "failed to create snapshot: " << cpp_strerror(r) << dendl;
+    lderr(cct) << ": failed to create snapshot: " << cpp_strerror(r) << dendl;
     finish(r);
     return;
   }
diff --git a/src/tools/rbd_mirror/image_sync/SyncPointCreateRequest.h b/src/tools/rbd_mirror/image_sync/SyncPointCreateRequest.h
index ce09eda..0aef0f8 100644
--- a/src/tools/rbd_mirror/image_sync/SyncPointCreateRequest.h
+++ b/src/tools/rbd_mirror/image_sync/SyncPointCreateRequest.h
@@ -4,8 +4,8 @@
 #ifndef RBD_MIRROR_IMAGE_SYNC_SYNC_POINT_CREATE_REQUEST_H
 #define RBD_MIRROR_IMAGE_SYNC_SYNC_POINT_CREATE_REQUEST_H
 
-#include "librbd/Journal.h"
 #include "librbd/journal/Types.h"
+#include "librbd/journal/TypeTraits.h"
 #include <string>
 
 class Context;
diff --git a/src/tools/rbd_mirror/image_sync/SyncPointPruneRequest.cc b/src/tools/rbd_mirror/image_sync/SyncPointPruneRequest.cc
index 6c653e4..332cb00 100644
--- a/src/tools/rbd_mirror/image_sync/SyncPointPruneRequest.cc
+++ b/src/tools/rbd_mirror/image_sync/SyncPointPruneRequest.cc
@@ -113,7 +113,7 @@ void SyncPointPruneRequest<I>::handle_remove_snap(int r) {
     r = 0;
   }
   if (r < 0) {
-    lderr(cct) << "failed to remove snapshot '" << snap_name << "': "
+    lderr(cct) << ": failed to remove snapshot '" << snap_name << "': "
                << cpp_strerror(r) << dendl;
     finish(r);
     return;
@@ -139,7 +139,7 @@ void SyncPointPruneRequest<I>::handle_refresh_image(int r) {
   ldout(cct, 20) << ": r=" << r << dendl;
 
   if (r < 0) {
-    lderr(cct) << "remote image refresh failed: " << cpp_strerror(r) << dendl;
+    lderr(cct) << ": remote image refresh failed: " << cpp_strerror(r) << dendl;
     finish(r);
     return;
   }
@@ -154,6 +154,9 @@ void SyncPointPruneRequest<I>::send_update_client() {
 
   if (m_sync_complete) {
     m_client_meta_copy.sync_points.pop_front();
+    if (m_client_meta_copy.sync_points.empty()) {
+      m_client_meta_copy.state = librbd::journal::MIRROR_PEER_STATE_REPLAYING;
+    }
   } else {
     while (m_client_meta_copy.sync_points.size() > 1) {
       m_client_meta_copy.sync_points.pop_back();
@@ -176,7 +179,8 @@ void SyncPointPruneRequest<I>::handle_update_client(int r) {
   ldout(cct, 20) << ": r=" << r << dendl;
 
   if (r < 0) {
-    lderr(cct) << "failed to update client data: " << cpp_strerror(r) << dendl;
+    lderr(cct) << ": failed to update client data: " << cpp_strerror(r)
+               << dendl;
     finish(r);
     return;
   }
diff --git a/src/tools/rbd_mirror/image_sync/SyncPointPruneRequest.h b/src/tools/rbd_mirror/image_sync/SyncPointPruneRequest.h
index b643fbb..3ef4ab6 100644
--- a/src/tools/rbd_mirror/image_sync/SyncPointPruneRequest.h
+++ b/src/tools/rbd_mirror/image_sync/SyncPointPruneRequest.h
@@ -4,8 +4,8 @@
 #ifndef RBD_MIRROR_IMAGE_SYNC_SYNC_POINT_PRUNE_REQUEST_H
 #define RBD_MIRROR_IMAGE_SYNC_SYNC_POINT_PRUNE_REQUEST_H
 
-#include "librbd/Journal.h"
 #include "librbd/journal/Types.h"
+#include "librbd/journal/TypeTraits.h"
 #include <list>
 #include <string>
 
diff --git a/src/vstart.sh b/src/vstart.sh
index 151ca1b..c81a793 100755
--- a/src/vstart.sh
+++ b/src/vstart.sh
@@ -12,8 +12,8 @@ if [ -n "$VSTART_DEST" ]; then
   CEPH_LIB=$SRC_PATH/.libs
 
   if [ -e CMakeCache.txt ]; then
-      CEPH_BIN=$VSTART_DEST/../../src
-      CEPH_LIB=$CEPH_BIN
+      CEPH_BIN=$VSTART_DEST/../../bin
+      CEPH_LIB=$VSTART_DEST/../../lib
   fi
 
   CEPH_CONF_PATH=$VSTART_DEST
@@ -21,50 +21,34 @@ if [ -n "$VSTART_DEST" ]; then
   CEPH_OUT_DIR=$VSTART_DEST/out
 fi
 
+# for running out of the CMake build directory
 if [ -e CMakeCache.txt ]; then
   # Out of tree build, learn source location from CMakeCache.txt
-  SRC_ROOT=`grep Ceph_SOURCE_DIR CMakeCache.txt | cut -d "=" -f 2`
-  [ -z "$PYBIND" ] && PYBIND=$SRC_ROOT/src/pybind
-  [ -z "$CEPH_ADM" ] && CEPH_ADM=./ceph
-  [ -z "$INIT_CEPH" ] && INIT_CEPH=./init-ceph
-  [ -z "$CEPH_BIN" ] && CEPH_BIN=src
-  [ -z "$CEPH_LIB" ] && CEPH_LIB=src
-  [ -z "$OBJCLASS_PATH" ] && OBJCLASS_PATH=src/cls
-
-  # Gather symlinks to EC plugins in one dir, because with CMake they
-  # are built into multiple locations
-  mkdir -p ec_plugins
-  for file in ./src/erasure-code/*/libec_*.so*;
-  do
-    ln -sf ../${file} ec_plugins/`basename $file`
-  done
-  [ -z "$EC_PATH" ] && EC_PATH=./ec_plugins
-  # check for compression plugins
-  mkdir -p .libs/compressor
-  for file in ./src/compressor/*/libcs_*.so*;
-  do
-    ln -sf ../${file} .libs/compressor/`basename $file`
-  done
-else
-    mkdir -p .libs/compressor
-    for f in `ls -d compressor/*/`; 
-    do 
-        cp .libs/libceph_`basename $f`.so* .libs/compressor/;
-    done
+  CEPH_ROOT=`grep Ceph_SOURCE_DIR CMakeCache.txt | cut -d "=" -f 2`
+  CEPH_BUILD_DIR=`pwd`
 fi
 
-if [ -z "$CEPH_BUILD_ROOT" ]; then
-        [ -z "$CEPH_BIN" ] && CEPH_BIN=.
-        [ -z "$CEPH_LIB" ] && CEPH_LIB=.libs
-        [ -z $EC_PATH ] && EC_PATH=$CEPH_LIB
-        [ -z $CS_PATH ] && CS_PATH=$CEPH_LIB
-        [ -z $OBJCLASS_PATH ] && OBJCLASS_PATH=$CEPH_LIB
-else
+# use CEPH_BUILD_ROOT to vstart from a 'make install' 
+if [ -n "$CEPH_BUILD_ROOT" ]; then
         [ -z $CEPH_BIN ] && CEPH_BIN=$CEPH_BUILD_ROOT/bin
         [ -z $CEPH_LIB ] && CEPH_LIB=$CEPH_BUILD_ROOT/lib
         [ -z $EC_PATH ] && EC_PATH=$CEPH_LIB/erasure-code
         [ -z $CS_PATH ] && CS_PATH=$CEPH_LIB/compressor
         [ -z $OBJCLASS_PATH ] && OBJCLASS_PATH=$CEPH_LIB/rados-classes
+elif [ -n "$CEPH_ROOT" ]; then
+        [ -z "$PYBIND" ] && PYBIND=$CEPH_ROOT/src/pybind
+        [ -z "$CEPH_BIN" ] && CEPH_BIN=$CEPH_BUILD_DIR/bin
+        [ -z "$CEPH_ADM" ] && CEPH_ADM=$CEPH_BIN/ceph
+        [ -z "$INIT_CEPH" ] && INIT_CEPH=$CEPH_BIN/init-ceph
+        [ -z "$CEPH_LIB" ] && CEPH_LIB=$CEPH_BUILD_DIR/lib
+        [ -z "$OBJCLASS_PATH" ] && OBJCLASS_PATH=$CEPH_LIB
+        [ -z "$EC_PATH" ] && EC_PATH=$CEPH_LIB
+else
+        [ -z "$CEPH_BIN" ] && CEPH_BIN=.
+        [ -z "$CEPH_LIB" ] && CEPH_LIB=.libs
+        [ -z $EC_PATH ] && EC_PATH=$CEPH_LIB
+        [ -z $CS_PATH ] && CS_PATH=$CEPH_LIB
+        [ -z $OBJCLASS_PATH ] && OBJCLASS_PATH=$CEPH_LIB
 fi
 
 if [ -z "${CEPH_VSTART_WRAPPER}" ]; then
@@ -73,18 +57,20 @@ fi
 
 [ -z "$PYBIND" ] && PYBIND=./pybind
 
-export PYTHONPATH=$PYBIND
+export PYTHONPATH=$PYBIND:$PYTHONPATH
 export LD_LIBRARY_PATH=$CEPH_LIB:$LD_LIBRARY_PATH
 export DYLD_LIBRARY_PATH=$CEPH_LIB:$DYLD_LIBRARY_PATH
 
 [ -z "$CEPH_NUM_MON" ] && CEPH_NUM_MON="$MON"
 [ -z "$CEPH_NUM_OSD" ] && CEPH_NUM_OSD="$OSD"
 [ -z "$CEPH_NUM_MDS" ] && CEPH_NUM_MDS="$MDS"
+[ -z "$CEPH_NUM_FS"  ] && CEPH_NUM_FS="$FS"
 [ -z "$CEPH_NUM_RGW" ] && CEPH_NUM_RGW="$RGW"
 
 [ -z "$CEPH_NUM_MON" ] && CEPH_NUM_MON=3
 [ -z "$CEPH_NUM_OSD" ] && CEPH_NUM_OSD=3
 [ -z "$CEPH_NUM_MDS" ] && CEPH_NUM_MDS=3
+[ -z "$CEPH_NUM_FS"  ] && CEPH_NUM_FS=1
 [ -z "$CEPH_NUM_RGW" ] && CEPH_NUM_RGW=1
 
 [ -z "$CEPH_DIR" ] && CEPH_DIR="$PWD"
@@ -275,6 +261,13 @@ esac
 shift
 done
 
+if [ "$overwrite_conf" -eq 0 ]; then
+  CEPH_NUM_MON=`awk -F= '/CEPH_NUM_MON/{print $2}' $conf_fn`
+  CEPH_NUM_OSD=`awk -F= '/CEPH_NUM_OSD/{print $2}' $conf_fn`
+  CEPH_NUM_MDS=`awk -F= '/CEPH_NUM_MDS/{print $2}' $conf_fn`
+  CEPH_NUM_RGW=`awk -F= '/CEPH_NUM_RGW/{print $2}' $conf_fn`
+fi
+
 if [ "$start_all" -eq 1 ]; then
 	start_mon=1
 	start_mds=1
@@ -436,6 +429,10 @@ if [ "$start_mon" -eq 1 ]; then
 		if [ $overwrite_conf -eq 1 ]; then
 		        cat <<EOF > $conf_fn
 ; generated by vstart.sh on `date`
+; CEPH_NUM_MON=$CEPH_NUM_MON
+; CEPH_NUM_OSD=$CEPH_NUM_OSD
+; CEPH_NUM_MDS=$CEPH_NUM_MDS
+; CEPH_NUM_RGW=$CEPH_NUM_RGW
 [global]
         fsid = $(uuidgen)
         osd pg bits = 3
@@ -627,17 +624,24 @@ EOF
 fi
 
 if [ "$start_mds" -eq 1 -a "$CEPH_NUM_MDS" -gt 0 ]; then
-    cmd="$CEPH_ADM osd pool create cephfs_data 8"
-    echo $cmd
-    $cmd
+    if [ "$CEPH_NUM_FS" -gt "1" ] ; then
+        $CEPH_ADM fs flag set enable_multiple true
+    fi
 
-    cmd="$CEPH_ADM osd pool create cephfs_metadata 8"
-    echo $cmd
-    $cmd
+    fs=0
+    for name in a b c d e f g h i j k l m n o p
+    do
+        cmd="$CEPH_ADM osd pool create cephfs_data_${name} 8"
+        $cmd
 
-    cmd="$CEPH_ADM fs new cephfs cephfs_metadata cephfs_data"
-    echo $cmd
-    $cmd
+        cmd="$CEPH_ADM osd pool create cephfs_metadata_${name} 8"
+        $cmd
+
+        cmd="$CEPH_ADM fs new cephfs_${name} cephfs_metadata_${name} cephfs_data_${name}"
+        $cmd
+        fs=$(($fs + 1))
+        [ $fs -eq $CEPH_NUM_FS ] && break
+    done
 
     mds=0
     for name in a b c d e f g h i j k l m n o p
@@ -780,7 +784,7 @@ fi
 echo "started.  stop.sh to stop.  see out/* (e.g. 'tail -f out/????') for debug output."
 
 echo ""
-echo "export PYTHONPATH=./pybind"
+echo "export PYTHONPATH=./pybind:$PYTHONPATH"
 echo "export LD_LIBRARY_PATH=$CEPH_LIB"
 
 if [ "$CEPH_DIR" != "$PWD" ]; then
diff --git a/systemd/rbdmap.service b/systemd/rbdmap.service
index 23d8fdb..25c324f 100644
--- a/systemd/rbdmap.service
+++ b/systemd/rbdmap.service
@@ -5,6 +5,8 @@ After=network-online.target local-fs.target
 Wants=network-online.target local-fs.target
 
 [Service]
+EnvironmentFile=-/etc/sysconfig/ceph
+Environment=RBDMAPFILE=/etc/ceph/rbdmap
 Type=oneshot
 RemainAfterExit=yes
 ExecStart=/usr/bin/rbdmap map

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-ceph/ceph.git



More information about the Pkg-ceph-commits mailing list