[Pkg-ceph-commits] [ceph] 02/02: Imported Upstream version 10.0.2

James Downing Page jamespage at moszumanska.debian.org
Tue Feb 9 11:05:12 UTC 2016


This is an automated email from the git hooks/post-receive script.

jamespage pushed a commit to branch upstream
in repository ceph.

commit 6f322e54e8ce98605ffc6c0ff857644e2a5ff2d8
Author: James Page <james.page at ubuntu.com>
Date:   Mon Jan 25 15:12:46 2016 +0000

    Imported Upstream version 10.0.2
---
 AUTHORS                                            |   30 +-
 ChangeLog                                          |  457 +++-
 INSTALL                                            |   25 +-
 Makefile.in                                        |    2 +
 ceph.spec                                          |   39 +-
 ceph.spec.in                                       |   37 +-
 configure                                          |  206 +-
 configure.ac                                       |   48 +-
 doc/Makefile.am                                    |    1 +
 doc/Makefile.in                                    |    3 +
 doc/man/8/rbd-nbd.rst                              |   55 +
 doc/man/8/rbd.rst                                  |   44 +-
 man/Makefile-client.am                             |    1 +
 man/Makefile.in                                    |    3 +
 man/ceph-authtool.8                                |    2 +-
 man/ceph-clsinfo.8                                 |    2 +-
 man/ceph-conf.8                                    |    2 +-
 man/ceph-create-keys.8                             |    2 +-
 man/ceph-debugpack.8                               |    2 +-
 man/ceph-dencoder.8                                |    2 +-
 man/ceph-deploy.8                                  |    2 +-
 man/ceph-detect-init.8                             |    2 +-
 man/ceph-disk.8                                    |    2 +-
 man/ceph-fuse.8                                    |    2 +-
 man/ceph-mds.8                                     |    2 +-
 man/ceph-mon.8                                     |    2 +-
 man/ceph-osd.8                                     |    2 +-
 man/ceph-post-file.8                               |    2 +-
 man/ceph-rbdnamer.8                                |    2 +-
 man/ceph-rest-api.8                                |    2 +-
 man/ceph-run.8                                     |    2 +-
 man/ceph-syn.8                                     |    2 +-
 man/ceph.8                                         |    2 +-
 man/cephfs.8                                       |    2 +-
 man/crushtool.8                                    |    2 +-
 man/librados-config.8                              |    2 +-
 man/monmaptool.8                                   |    2 +-
 man/mount.ceph.8                                   |    2 +-
 man/osdmaptool.8                                   |    2 +-
 man/rados.8                                        |    2 +-
 man/radosgw-admin.8                                |    2 +-
 man/radosgw.8                                      |    2 +-
 man/rbd-fuse.8                                     |    2 +-
 man/{rbd-fuse.8 => rbd-nbd.8}                      |   48 +-
 man/rbd-replay-many.8                              |    2 +-
 man/rbd-replay-prep.8                              |    2 +-
 man/rbd-replay.8                                   |    2 +-
 man/rbd.8                                          |   45 +-
 selinux/Makefile.in                                |    2 +
 src/.git_version                                   |    4 +-
 src/Makefile-client.am                             |    2 -
 src/Makefile-env.am                                |   20 +-
 src/Makefile.am                                    |   19 +-
 src/Makefile.in                                    | 2233 ++++++++++++------
 src/acconfig.h.in                                  |   12 +
 src/auth/Crypto.cc                                 |    3 +-
 src/auth/cephx/CephxServiceHandler.cc              |   13 +-
 src/ceph-detect-init/Makefile.am                   |   10 +-
 src/ceph-detect-init/tests/test_all.py             |    3 +
 src/ceph-disk                                      |    4 +-
 src/ceph.in                                        |    4 +
 src/ceph_fuse.cc                                   |   28 +-
 src/ceph_mds.cc                                    |    6 +-
 src/ceph_osd.cc                                    |   56 +-
 src/client/Client.cc                               |  143 +-
 src/client/Client.h                                |    8 +-
 src/cls/Makefile-client.am                         |    6 +-
 src/cls/Makefile-server.am                         |    4 +-
 src/cls/cephfs/cls_cephfs.cc                       |   69 +
 src/cls/cephfs/cls_cephfs.h                        |   20 +
 src/cls/cephfs/cls_cephfs_client.cc                |   15 +
 src/cls/cephfs/cls_cephfs_client.h                 |    4 +
 src/cls/journal/cls_journal_types.h                |    2 +-
 src/cls/rbd/cls_rbd.cc                             |  362 ++-
 src/cls/rbd/cls_rbd.h                              |    2 +-
 src/cls/rbd/cls_rbd_client.cc                      |  708 ++++--
 src/cls/rbd/cls_rbd_client.h                       |  101 +-
 src/cls/rbd/cls_rbd_types.cc                       |   52 +
 src/cls/rbd/cls_rbd_types.h                        |   50 +
 src/common/BackTrace.h                             |    9 +
 src/common/ConfUtils.h                             |    2 +-
 src/common/Finisher.cc                             |    7 +-
 src/common/Finisher.h                              |    3 +
 src/common/Formatter.cc                            |   49 +-
 src/common/Formatter.h                             |   12 +-
 src/common/Makefile.am                             |   17 +-
 src/common/MemoryModel.cc                          |    1 +
 src/common/PluginRegistry.cc                       |  222 ++
 src/common/PluginRegistry.h                        |   70 +
 src/common/PrebufferedStreambuf.cc                 |   43 +-
 src/common/PrebufferedStreambuf.h                  |    8 +-
 src/common/Readahead.cc                            |   28 +-
 src/common/Readahead.h                             |    6 +-
 src/common/SubProcess.h                            |   95 +-
 src/common/WorkQueue.h                             |    7 +
 src/common/addr_parsing.c                          |    2 +-
 src/common/admin_socket.h                          |    2 +-
 src/common/aix_errno.cc                            |  225 ++
 src/common/buffer.cc                               |   39 +-
 src/common/ceph_context.cc                         |    6 +
 src/common/ceph_context.h                          |    9 +-
 src/common/ceph_crypto_cms.h                       |    2 +-
 src/common/config_opts.h                           |   19 +-
 src/common/dout.h                                  |    3 +-
 src/common/entity_name.h                           |    2 +-
 src/common/event_socket.h                          |   74 +
 src/common/hobject.h                               |    2 +
 src/common/io_priority.cc                          |    2 +
 src/common/lockdep.cc                              |   33 +-
 src/common/perf_counters.h                         |    1 -
 src/crush/CrushCompiler.cc                         |    4 +
 src/crush/CrushTester.cc                           |    2 +-
 src/erasure-code/ErasureCode.cc                    |    1 +
 src/erasure-code/ErasureCodeInterface.h            |    2 +-
 src/global/global_init.cc                          |    4 +
 src/global/signal_handler.cc                       |    7 +-
 src/include/Makefile.am                            |    6 +-
 src/include/assert.h                               |    2 +-
 src/include/buffer.h                               |   72 +-
 src/include/buffer_fwd.h                           |   17 +
 src/include/byteorder.h                            |    4 +
 src/include/compat.h                               |    6 +-
 src/{os/fs/XFS.h => include/event_type.h}          |   23 +-
 src/include/rados/buffer.h                         |   72 +-
 src/include/rados/buffer_fwd.h                     |   17 +
 src/include/rados/librados.h                       |    4 +-
 src/include/rbd/librbd.h                           |   44 +
 src/include/rbd/librbd.hpp                         |   23 +
 src/include/rbd_types.h                            |    7 +
 src/include/stringify.h                            |    5 +
 src/include/types.h                                |    4 +-
 src/include/utime.h                                |   12 +-
 src/init-ceph.in                                   |   17 +-
 src/init-radosgw                                   |    2 +-
 src/java/Makefile.in                               |    2 +
 src/journal/FutureImpl.cc                          |   15 +-
 src/journal/FutureImpl.h                           |    2 +-
 src/journal/JournalMetadata.cc                     |   46 +-
 src/journal/JournalPlayer.cc                       |  247 +-
 src/journal/JournalPlayer.h                        |   50 +-
 src/journal/JournalTrimmer.cc                      |   19 +-
 src/journal/JournalTrimmer.h                       |    2 +-
 src/journal/Journaler.cc                           |   47 +-
 src/journal/Journaler.h                            |   17 +-
 src/journal/ObjectPlayer.cc                        |   22 +-
 src/journal/ObjectRecorder.cc                      |   81 +-
 src/journal/ObjectRecorder.h                       |    6 +-
 src/kv/KineticStore.h                              |    2 +-
 src/kv/LevelDBStore.h                              |    2 +-
 src/kv/RocksDBStore.h                              |    2 +-
 src/librados/librados.cc                           |   17 +-
 src/librbd/AioCompletion.cc                        |   49 +-
 src/librbd/AioCompletion.h                         |   76 +-
 src/librbd/AioImageRequest.cc                      |  127 +-
 src/librbd/AioImageRequest.h                       |    7 +-
 src/librbd/AioImageRequestWQ.cc                    |  276 ++-
 src/librbd/AioImageRequestWQ.h                     |   80 +-
 src/librbd/AioObjectRequest.cc                     |   99 +-
 src/librbd/AioObjectRequest.h                      |    3 +-
 src/librbd/AsyncObjectThrottle.cc                  |    8 +-
 src/librbd/AsyncObjectThrottle.h                   |    1 -
 src/librbd/AsyncOperation.cc                       |    9 +-
 src/librbd/AsyncRequest.cc                         |   39 +-
 src/librbd/AsyncRequest.h                          |   21 +-
 src/librbd/AsyncResizeRequest.cc                   |  284 ---
 src/librbd/AsyncTrimRequest.cc                     |  361 ---
 src/librbd/CopyupRequest.cc                        |   64 +-
 src/librbd/CopyupRequest.h                         |    6 +-
 src/librbd/DiffIterate.cc                          |    4 +-
 src/librbd/ExclusiveLock.cc                        |  531 +++++
 src/librbd/ExclusiveLock.h                         |  162 ++
 src/librbd/ImageCtx.cc                             |  245 +-
 src/librbd/ImageCtx.h                              |   66 +-
 src/librbd/ImageState.cc                           |  389 ++++
 src/librbd/ImageState.h                            |  122 +
 src/librbd/ImageWatcher.cc                         |  840 +++----
 src/librbd/ImageWatcher.h                          |  233 +-
 src/librbd/Journal.cc                              |  558 +++--
 src/librbd/Journal.h                               |  117 +-
 src/librbd/JournalReplay.cc                        |  100 +-
 src/librbd/JournalReplay.h                         |   37 +-
 src/librbd/JournalTypes.cc                         |  161 ++
 src/librbd/JournalTypes.h                          |  191 +-
 src/librbd/LibrbdWriteback.cc                      |   38 +-
 src/librbd/LibrbdWriteback.h                       |    3 -
 src/librbd/Makefile.am                             |   74 +-
 src/librbd/ObjectMap.cc                            |  621 +----
 src/librbd/ObjectMap.h                             |  101 +-
 src/librbd/Utils.cc                                |   31 +
 src/librbd/Utils.h                                 |  139 ++
 src/librbd/WatchNotifyTypes.cc                     |  128 +-
 src/librbd/WatchNotifyTypes.h                      |  126 +-
 src/librbd/exclusive_lock/AcquireRequest.cc        |  456 ++++
 src/librbd/exclusive_lock/AcquireRequest.h         |  123 +
 src/librbd/exclusive_lock/ReleaseRequest.cc        |  218 ++
 src/librbd/exclusive_lock/ReleaseRequest.h         |   89 +
 src/librbd/image/CloseRequest.cc                   |  243 ++
 src/librbd/image/CloseRequest.h                    |  110 +
 src/librbd/image/OpenRequest.cc                    |  375 +++
 src/librbd/image/OpenRequest.h                     |  106 +
 src/librbd/image/RefreshParentRequest.cc           |  235 ++
 src/librbd/image/RefreshParentRequest.h            |   99 +
 src/librbd/image/RefreshRequest.cc                 |  763 +++++++
 src/librbd/image/RefreshRequest.h                  |  189 ++
 src/librbd/image/SetSnapRequest.cc                 |  342 +++
 src/librbd/image/SetSnapRequest.h                  |  121 +
 src/librbd/internal.cc                             | 2405 +++++++-------------
 src/librbd/internal.h                              |   71 +-
 src/librbd/librbd.cc                               |  228 +-
 src/librbd/object_map/InvalidateRequest.cc         |   88 +
 src/librbd/object_map/InvalidateRequest.h          |   49 +
 src/librbd/object_map/LockRequest.cc               |  154 ++
 src/librbd/object_map/LockRequest.h                |   72 +
 src/librbd/object_map/RefreshRequest.cc            |  224 ++
 src/librbd/object_map/RefreshRequest.h             |   76 +
 src/librbd/object_map/Request.cc                   |   73 +
 src/librbd/object_map/Request.h                    |   58 +
 src/librbd/object_map/ResizeRequest.cc             |   58 +
 src/librbd/object_map/ResizeRequest.h              |   48 +
 src/librbd/object_map/SnapshotCreateRequest.cc     |  148 ++
 src/librbd/object_map/SnapshotCreateRequest.h      |   79 +
 src/librbd/object_map/SnapshotRemoveRequest.cc     |  204 ++
 src/librbd/object_map/SnapshotRemoveRequest.h      |   91 +
 src/librbd/object_map/SnapshotRollbackRequest.cc   |  131 ++
 src/librbd/object_map/SnapshotRollbackRequest.h    |   77 +
 src/librbd/object_map/UnlockRequest.cc             |   66 +
 src/librbd/object_map/UnlockRequest.h              |   46 +
 src/librbd/object_map/UpdateRequest.cc             |   70 +
 src/librbd/object_map/UpdateRequest.h              |   49 +
 .../FlattenRequest.cc}                             |  110 +-
 .../FlattenRequest.h}                              |   31 +-
 .../{ => operation}/RebuildObjectMapRequest.cc     |  177 +-
 .../{ => operation}/RebuildObjectMapRequest.h      |   20 +-
 src/librbd/operation/RenameRequest.cc              |  193 ++
 src/librbd/operation/RenameRequest.h               |   90 +
 src/librbd/operation/Request.cc                    |   67 +
 src/librbd/operation/Request.h                     |   52 +
 src/librbd/operation/ResizeRequest.cc              |  310 +++
 .../ResizeRequest.h}                               |   40 +-
 src/librbd/operation/SnapshotCreateRequest.cc      |  323 +++
 src/librbd/operation/SnapshotCreateRequest.h       |  127 ++
 src/librbd/operation/SnapshotProtectRequest.cc     |  115 +
 src/librbd/operation/SnapshotProtectRequest.h      |   67 +
 src/librbd/operation/SnapshotRemoveRequest.cc      |  236 ++
 src/librbd/operation/SnapshotRemoveRequest.h       |   96 +
 src/librbd/operation/SnapshotRenameRequest.cc      |   91 +
 src/librbd/operation/SnapshotRenameRequest.h       |   66 +
 src/librbd/operation/SnapshotRollbackRequest.cc    |  273 +++
 src/librbd/operation/SnapshotRollbackRequest.h     |  101 +
 src/librbd/operation/SnapshotUnprotectRequest.cc   |  350 +++
 src/librbd/operation/SnapshotUnprotectRequest.h    |   93 +
 src/librbd/operation/TrimRequest.cc                |  385 ++++
 .../TrimRequest.h}                                 |   22 +-
 src/librbd/parent_types.h                          |    8 +-
 src/log/Entry.h                                    |   51 +-
 src/log/Log.cc                                     |   55 +-
 src/log/Log.h                                      |    1 +
 src/log/test.cc                                    |    5 +-
 src/mds/CDentry.h                                  |    2 +-
 src/mds/CDir.h                                     |    2 +-
 src/mds/CInode.cc                                  |    6 +-
 src/mds/CInode.h                                   |    2 +-
 src/mds/Capability.h                               |    2 +-
 src/mds/InoTable.h                                 |   21 +
 src/mds/Locker.cc                                  |    5 +
 src/mds/LogEvent.h                                 |    2 +-
 src/mds/MDCache.cc                                 |   13 +-
 src/mds/MDCache.h                                  |    6 +-
 src/mds/MDSAuthCaps.cc                             |   21 +-
 src/mds/MDSAuthCaps.h                              |   16 +-
 src/mds/MDSDaemon.cc                               |   15 +
 src/mds/MDSMap.cc                                  |   52 +
 src/mds/MDSMap.h                                   |   37 +-
 src/mds/MDSTable.h                                 |    2 +-
 src/mds/Server.cc                                  |   48 +-
 src/messages/MOSDOp.h                              |   16 +-
 src/messages/MOSDOpReply.h                         |   27 +-
 src/messages/MOSDRepOp.h                           |   33 +-
 src/messages/MOSDRepOpReply.h                      |   34 +-
 src/messages/MOSDSubOp.h                           |    2 +
 src/messages/MOSDSubOpReply.h                      |    3 +
 src/mon/MDSMonitor.cc                              |    9 +-
 src/mon/MonCommands.h                              |    4 +-
 src/mon/OSDMonitor.cc                              |   97 +-
 src/mount/mtab.c                                   |    2 +
 src/msg/Message.h                                  |    8 +-
 src/msg/Messenger.h                                |    1 +
 src/ocf/Makefile.in                                |    2 +
 src/os/DBObjectMap.h                               |    2 +-
 src/os/FileJournal.cc                              |   17 +-
 src/os/FileStore.cc                                |   20 +-
 src/os/FileStore.h                                 |   12 +-
 src/os/HashIndex.h                                 |    2 +-
 src/os/Journal.h                                   |    2 +-
 src/os/JournalingObjectStore.h                     |    2 +-
 src/os/KeyValueStore.cc                            |   17 +-
 src/os/KeyValueStore.h                             |    4 -
 src/os/MemStore.cc                                 |   10 +-
 src/os/MemStore.h                                  |    2 -
 src/os/ObjectStore.cc                              |   14 +
 src/os/ObjectStore.h                               |   15 +-
 src/os/WBThrottle.cc                               |    3 +-
 src/os/WBThrottle.h                                |    9 +-
 src/os/fs/XFS.h                                    |    2 +-
 src/osd/ECUtil.cc                                  |   16 +
 src/osd/ECUtil.h                                   |   17 +-
 src/osd/OSD.cc                                     |   66 +-
 src/osd/OSD.h                                      |   26 +-
 src/osd/OpRequest.cc                               |    1 +
 src/osd/PG.cc                                      |   82 +-
 src/osd/PG.h                                       |    6 +-
 src/osd/PGBackend.cc                               |    2 +-
 src/osd/PGLog.cc                                   |   19 +-
 src/osd/PGLog.h                                    |    7 +-
 src/osd/ReplicatedBackend.cc                       |    9 +-
 src/osd/ReplicatedPG.cc                            |   20 +-
 src/osd/ReplicatedPG.h                             |    5 +
 src/osd/osd_types.cc                               |  221 +-
 src/osd/osd_types.h                                |  253 +-
 src/osdc/Objecter.h                                |   11 -
 src/pybind/Makefile.am                             |   51 +
 src/pybind/cephfs.py                               |   13 +
 src/pybind/rados.py                                |   56 +-
 src/pybind/rbd.py                                  | 1262 ----------
 src/pybind/rbd.pyx                                 | 1426 ++++++++++++
 src/pybind/setup.py                                |   51 +
 src/rbd_replay/ActionTypes.h                       |    2 +-
 src/rbd_replay/ios.hpp                             |    2 +-
 src/rgw/Makefile.am                                |    4 +-
 src/rgw/rgw_acl.cc                                 |   18 +-
 src/rgw/rgw_acl.h                                  |   50 +-
 src/rgw/rgw_acl_s3.cc                              |   21 +-
 src/rgw/rgw_acl_swift.cc                           |   35 +-
 src/rgw/rgw_acl_swift.h                            |    6 +-
 src/rgw/rgw_admin.cc                               |   79 +-
 src/rgw/rgw_basic_types.cc                         |   14 +
 src/rgw/rgw_basic_types.h                          |  111 +
 src/rgw/rgw_bucket.cc                              |  157 +-
 src/rgw/rgw_bucket.h                               |   39 +-
 src/rgw/rgw_client_io.h                            |   63 +
 src/rgw/rgw_common.cc                              |   56 +-
 src/rgw/rgw_common.h                               |  102 +-
 src/rgw/rgw_dencoder.cc                            |   18 +-
 src/rgw/rgw_formats.cc                             |   49 +-
 src/rgw/rgw_formats.h                              |    3 +-
 src/rgw/rgw_http_errors.h                          |    1 +
 src/rgw/rgw_json_enc.cc                            |   35 +-
 src/rgw/rgw_log.cc                                 |   21 +-
 src/rgw/rgw_log.h                                  |   22 +-
 src/rgw/rgw_main.cc                                |    8 +-
 src/rgw/rgw_metadata.h                             |   15 +
 src/rgw/rgw_object_expirer_core.cc                 |   19 +-
 src/rgw/rgw_object_expirer_core.h                  |    3 +-
 src/rgw/rgw_op.cc                                  |  722 +++++-
 src/rgw/rgw_op.h                                   |  208 +-
 src/rgw/rgw_quota.cc                               |  103 +-
 src/rgw/rgw_quota.h                                |    4 +-
 src/rgw/rgw_rados.cc                               |  161 +-
 src/rgw/rgw_rados.h                                |   68 +-
 src/rgw/rgw_resolve.cc                             |   31 +-
 src/rgw/rgw_rest.cc                                |   98 +-
 src/rgw/rgw_rest.h                                 |   43 +-
 src/rgw/rgw_rest_bucket.cc                         |   19 +-
 src/rgw/rgw_rest_client.cc                         |    4 +-
 src/rgw/rgw_rest_conn.cc                           |   15 +-
 src/rgw/rgw_rest_conn.h                            |    6 +-
 src/rgw/rgw_rest_log.cc                            |   15 +-
 src/rgw/rgw_rest_metadata.cc                       |    4 +-
 src/rgw/rgw_rest_s3.cc                             |  129 +-
 src/rgw/rgw_rest_swift.cc                          |  250 +-
 src/rgw/rgw_rest_swift.h                           |   12 +
 src/rgw/rgw_rest_usage.cc                          |   12 +-
 src/rgw/rgw_rest_user.cc                           |   82 +-
 src/rgw/rgw_swift.cc                               |   12 +-
 src/rgw/rgw_swift.h                                |    2 +-
 src/rgw/rgw_usage.cc                               |    4 +-
 src/rgw/rgw_usage.h                                |    4 +-
 src/rgw/rgw_user.cc                                |   99 +-
 src/rgw/rgw_user.h                                 |   40 +-
 src/test/Makefile-client.am                        |   25 +-
 src/test/Makefile.am                               |    2 +
 src/test/bench/bencher.cc                          |    3 +-
 src/test/bench/bencher.h                           |    2 +-
 src/test/bufferlist.cc                             |   64 +-
 src/test/centos-6/ceph.spec.in                     |   37 +-
 src/test/centos-7/ceph.spec.in                     |   37 +-
 src/test/cli/rbd/help.t                            |  432 +++-
 src/test/cls_rbd/test_cls_rbd.cc                   |  146 +-
 src/test/common/ObjectContents.h                   |    2 +-
 src/test/common/test_tableformatter.cc             |    2 +
 src/test/encoding/readable.sh                      |  179 +-
 src/test/encoding/types.h                          |   11 +-
 src/test/erasure-code/ceph_erasure_code.cc         |    1 -
 .../erasure-code/ceph_erasure_code_benchmark.cc    |    1 -
 .../ceph_erasure_code_non_regression.cc            |    1 -
 src/test/fedora-21/ceph.spec.in                    |   37 +-
 src/test/journal/test_JournalPlayer.cc             |   45 +-
 src/test/journal/test_JournalTrimmer.cc            |    5 +-
 src/test/libcephfs/test.cc                         |   53 +-
 src/test/librados/c_read_operations.cc             |    5 +-
 src/test/librados/cls.cc                           |    1 -
 src/test/librados/cmd.cc                           |    1 -
 src/test/librados/io.cc                            |    8 +-
 src/test/librados/misc.cc                          |    1 -
 src/test/librados/test.cc                          |  136 +-
 src/test/librados/tier.cc                          |    1 -
 src/test/librados_test_stub/LibradosTestStub.cc    |   38 +
 src/test/librados_test_stub/MockTestMemIoCtxImpl.h |   17 +
 .../librados_test_stub/MockTestMemRadosClient.h    |    8 +
 src/test/librados_test_stub/TestRadosClient.h      |    2 +-
 src/test/libradosstriper/io.cc                     |    4 +-
 .../exclusive_lock/test_mock_AcquireRequest.cc     |  571 +++++
 .../exclusive_lock/test_mock_ReleaseRequest.cc     |  197 ++
 src/test/librbd/fsx.cc                             |  138 +-
 src/test/librbd/mock/MockAioImageRequestWQ.h       |   20 +
 src/test/librbd/mock/MockExclusiveLock.h           |   25 +
 src/test/librbd/mock/MockImageCtx.h                |   61 +-
 src/test/librbd/mock/MockImageWatcher.h            |   10 +-
 src/test/librbd/mock/MockJournal.h                 |   28 +
 src/test/librbd/mock/MockObjectMap.h               |    5 +
 src/test/librbd/mock/MockReadahead.h               |   21 +
 .../librbd/object_map/mock/MockInvalidateRequest.h |   42 +
 .../object_map/test_mock_InvalidateRequest.cc      |  153 ++
 .../librbd/object_map/test_mock_LockRequest.cc     |  215 ++
 .../librbd/object_map/test_mock_RefreshRequest.cc  |  251 ++
 .../librbd/object_map/test_mock_ResizeRequest.cc   |  144 ++
 .../object_map/test_mock_SnapshotCreateRequest.cc  |  221 ++
 .../object_map/test_mock_SnapshotRemoveRequest.cc  |  276 +++
 .../test_mock_SnapshotRollbackRequest.cc           |  143 ++
 .../librbd/object_map/test_mock_UnlockRequest.cc   |   67 +
 .../librbd/object_map/test_mock_UpdateRequest.cc   |  199 ++
 .../operation/test_mock_SnapshotCreateRequest.cc   |  270 +++
 .../operation/test_mock_SnapshotProtectRequest.cc  |  191 ++
 .../operation/test_mock_SnapshotRemoveRequest.cc   |  359 +++
 .../test_mock_SnapshotUnprotectRequest.cc          |  276 +++
 src/test/librbd/test_ImageWatcher.cc               |  623 +----
 src/test/librbd/test_JournalEntries.cc             |    9 +-
 src/test/librbd/test_JournalReplay.cc              |   85 +-
 src/test/librbd/test_ObjectMap.cc                  |   44 +-
 src/test/librbd/test_fixture.cc                    |   23 +-
 src/test/librbd/test_fixture.h                     |    2 +
 src/test/librbd/test_internal.cc                   |  118 +-
 src/test/librbd/test_librbd.cc                     |  425 +++-
 src/test/librbd/test_mock_ExclusiveLock.cc         |  556 +++++
 src/test/librbd/test_mock_fixture.cc               |    2 +
 src/test/librbd/test_mock_fixture.h                |   18 +
 src/test/mon/misc.sh                               |   85 +-
 src/test/multi_stress_watch.cc                     |    1 -
 src/test/objectstore_bench.cc                      |   35 +
 src/test/opensuse-13.2/ceph.spec.in                |   37 +-
 src/test/osd/TestPGLog.cc                          |   34 +
 src/test/osd/osd-scrub-repair.sh                   |   11 +-
 src/test/osd/osd-scrub-snaps.sh                    |   14 +-
 src/test/osd/types.cc                              |   59 +
 src/test/pybind/test_ceph_argparse.py              |   10 +-
 src/test/rgw/test_rgw_manifest.cc                  |    6 +-
 src/test/rgw/test_rgw_obj.cc                       |    2 +-
 src/test/test_stress_watch.cc                      |    1 -
 src/test/test_subprocess.cc                        |   28 +-
 src/tools/Makefile-client.am                       |   12 +-
 src/tools/ceph_kvstore_tool.cc                     |    7 +-
 src/tools/ceph_monstore_tool.cc                    |   53 +-
 src/tools/ceph_objectstore_tool.cc                 |    4 +-
 src/tools/cephfs/DataScan.cc                       |  512 ++++-
 src/tools/cephfs/DataScan.h                        |   97 +-
 src/tools/cephfs/TableTool.cc                      |  271 +--
 src/tools/cephfs/TableTool.h                       |   11 +-
 src/tools/rados/RadosImport.h                      |    2 +-
 src/tools/rados/rados.cc                           |    4 +-
 src/tools/rbd/ArgumentTypes.cc                     |  102 +-
 src/tools/rbd/ArgumentTypes.h                      |   39 +-
 src/tools/rbd/Shell.cc                             |    2 +-
 src/tools/rbd/Utils.cc                             |  237 +-
 src/tools/rbd/Utils.h                              |   16 +-
 src/tools/rbd/action/Clone.cc                      |   21 +-
 src/tools/rbd/action/Copy.cc                       |   17 +-
 src/tools/rbd/action/Create.cc                     |   29 +-
 src/tools/rbd/action/DiskUsage.cc                  |   14 +-
 src/tools/rbd/action/Feature.cc                    |   27 +-
 src/tools/rbd/action/Import.cc                     |   47 +-
 src/tools/rbd/action/Info.cc                       |    8 +
 src/tools/rbd/action/Journal.cc                    |  969 ++++++++
 src/tools/rbd/action/List.cc                       |   32 +-
 src/tools/rbd/action/MergeDiff.cc                  |    7 +-
 src/tools/rbd/action/MirrorPool.cc                 |  421 ++++
 src/tools/rbd/action/Nbd.cc                        |  186 ++
 src/tools/rbd_nbd/rbd-nbd.cc                       |  739 ++++++
 src/tools/scratchtool.c                            |    2 +-
 src/tracing/librbd.tp                              |   39 +
 src/vstart.sh                                      |   20 +-
 systemd/Makefile.am                                |    4 +
 systemd/Makefile.in                                |    6 +
 systemd/ceph-disk@.service                         |    2 +-
 systemd/ceph-mds.target                            |    5 +
 systemd/ceph-mds@.service                          |    4 +-
 systemd/ceph-mon.target                            |    5 +
 systemd/ceph-mon@.service                          |    4 +-
 systemd/ceph-osd.target                            |    5 +
 systemd/ceph-osd@.service                          |    4 +-
 systemd/ceph-radosgw.target                        |    5 +
 systemd/ceph-radosgw@.service                      |    4 +-
 501 files changed, 33849 insertions(+), 10032 deletions(-)

diff --git a/AUTHORS b/AUTHORS
index b02281b..22e4399 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -38,6 +38,7 @@ Andrey Kuznetsov <Andrey_Kuznetsov at epam.com>
 Andrey Stepachev <octo at yandex-team.ru>
 Andy Allan <andy at gravitystorm.co.uk>
 Anis Ayari <ayari_anis at live.fr>
+Anthony Alba <ascanio.alba7 at gmail.com>
 Anton Aksola <anton.aksola at nebula.fi>
 Anton Blanchard <anton at samba.org>
 apovzner <apovzner at 29311d96-e01e-0410-9327-a35deaab8ce9>
@@ -77,8 +78,8 @@ Casey Bodley <cbodley at users.noreply.github.com>
 Casey Marshall <csm at soe.ucsc.edu>
 CC Lien <cc_lien at tcloudcomputing.com>
 Ce Gu <guce at h3c.com>
-ceph <zhuang.zeqiang at h3c.com>
 Cesar Mello <cesar at d1.(none)>
+changtao <changtao at hihuron.com>
 Chen Baozi <baozich at gmail.com>
 Chen Dihao <tobeg3oogle at gmail.com>
 Chendi Xue <chendi.xue at intel.com>
@@ -95,8 +96,10 @@ Christophe Courtaut <christophe.courtaut at gmail.com>
 Christopher O'Connell <jwriteclub at gmail.com>
 Christoph Hellwig <hch at infradead.org>
 Christos Stavrakakis <stavr.chris at gmail.com>
+Cilang Zhao <zhao.cilang at h3c.com>
 Claire Massot <claire.massot93 at gmail.com>
 Clement Lebrun <clement.lebrun.31 at gmail.com>
+clever215 <wu.kongming at h3c.com>
 Colin Mattson <colinmattson at gmail.com>
 Colin P. McCabe <colinm at hq.newdream.net>
 Dan Chai <tengweicai at gmail.com>
@@ -126,6 +129,8 @@ Dongmao Zhang <deanraccoon at gmail.com>
 Dongsu Park <dpark1978 at gmail.com>
 Dong Yuan <yuandong1222 at gmail.com>
 Douglas Fuller <dfuller at redhat.com>
+Drunkard Zhang <gongfan193 at gmail.com>
+Dunrong Huang <riegamaths at gmail.com>
 Eleanor Cawthon <eleanor.cawthon at inktank.com>
 Emily Popper <emily.popper at dreamhost.com>
 Eric Mourgaya <eric.mourgaya at arkea.com>
@@ -133,6 +138,7 @@ Erik Logtenberg <erik at logtenberg.eu>
 Erwin, Brock A <Brock.Erwin at pnl.gov>
 Esteban Molina-Estolano <eestolan at lanl.gov>
 Evan Felix <evan.felix at pnnl.gov>
+Evgeniy Firsov <evgeniy.firsov at sandisk.com>
 Fabio Alessandro Locati <fabiolocati at gmail.com>
 fangdong <yp.fangdong at gmail.com>
 Federico Gimenez <fgimenez at coit.es>
@@ -142,6 +148,7 @@ Feng Wang <cyclonew at cs.ucsc.edu>
 Filippos Giannakos <philipgian at grnet.gr>
 Florent Bautista <florent at coppint.com>
 Florent Flament <florent.flament at cloudwatt.com>
+Florent Manens <florent at manens.org>
 Florian Coste <fcoste21 at gmail.com>
 Florian Haas <florian at hastexo.com>
 Florian Marsylle <florian.marsylle at hotmail.fr>
@@ -162,7 +169,6 @@ Greg Farnum <gfarnum at redhat.com>
 Greg Farnum <greg at inktank.com>
 Gregory Meno <gmeno at redhat.com>
 Guangliang Zhao <guangliang at unitedstack.com>
-Guang Yang <yguang at yahoo-inc>
 Guang Yang <yguang at yahoo-inc.com>
 Guilhem Lettron <guilhem at lettron.fr>
 Haifeng Liu <haifeng at yahoo-inc.com>
@@ -171,6 +177,7 @@ Hannu Valtonen <hannu.valtonen at ormod.com>
 Haomai Wang <haomai at xsky.com>
 Harpreet Dhillon <harpreet at ironsystems.com>
 Hazem Amara <hazem.amara at telecom-bretagne.eu>
+Hector Martin <marcan at marcan.st>
 Henry C Chang <henry_c_chang at tcloudcomputing.com>
 Henry Chang <henry at bigtera.com>
 Herb Shiu <herb_shiu at tcloudcomputing.com>
@@ -181,12 +188,16 @@ Huamin Chen <hchen at redhat.com>
 Huang Jun <hjwsm1989 at gmail.com>
 Ian Holsman <lists at holsman.net>
 Ian Kelling <ian at iankelling.org>
+Igor Fedotov <ifedotov at mirantis.com>
+Igor Podoski <igor.podoski at ts.fujitsu.com>
 Ilja Slepnev <islepnev at gmail.com>
 Ilya Dryomov <idryomov at redhat.com>
 Ilya Dryomov <ilya.dryomov at inktank.com>
 Ira Cooper <ira at samba.org>
 Ismael Serrano <ismael.serrano at gmail.com>
 Jacek J. Lakis <jacek.lakis at intel.com>
+Jacek J. Łakis <jacek.lakis at intel.com>
+Jacek J. Łakis <jlakis at gklab-126-033.igk.intel.com>
 James Page <james.page at ubuntu.com>
 James Ryan Cresawn <jrcresawn at gmail.com>
 Jan Harkes <jaharkes at cs.cmu.edu>
@@ -200,6 +211,7 @@ Jeff Epstein <jepst79 at gmail.com>
 Jeff Weber <jweber at cofront.net>
 Jenkins <jenkins at ceph.com>
 Jens-Christian Fischer <jens-christian.fischer at switch.ch>
+Jeremy Qian <vanpire110 at 163.com>
 Jevon Qiao <qiaojianfeng at unitedstack.com>
 Jiang Heng <jiangheng0511 at gmail.com>
 Jianhui Yuan <zuiwanyuan at gmail.com>
@@ -261,6 +273,7 @@ Laszlo Boszormenyi <gcs at debian.hu>
 Laurent Barbe <laurent at ksperis.com>
 Lee Revell <rlrevell at gmail.com>
 Lei Dong <leidong at yahoo-inc.com>
+Lenz Grimmer <lenz at grimmer.com>
 Liam Monahan <liam at umiacs.umd.edu>
 Li Peng <lip at dtdream.com>
 Li Wang <li.wang at kylin-cloud.com>
@@ -273,6 +286,7 @@ Luis Pabón <lpabon at redhat.com>
 Lukasz Jagiello <lukasz at wikia-inc.com>
 Lu Shi <shi.lu at h3c.com>
 Ma Jianpeng <jianpeng.ma at intel.com>
+Marc Koderer <marc at koderer.com>
 Marco Garcês <marco.garces at bci.co.mz>
 Marcus Sorensen <shadowsor at gmail.com>
 Mark Kampe <mark.kampe at dreamhost.com>
@@ -378,6 +392,7 @@ Sean Channel <pentabular at gmail.com>
 Sébastien Han <shan at redhat.com>
 Sebastien Ponce <sebastien.ponce at cern.ch>
 Sergey Arkhipov <nineseconds at yandex.ru>
+Shang Ding <dingshang2013 at 163.com>
 Shanggao Qiu <qiushanggao at qq.com>
 Sharif Olorin <sio at tesser.org>
 Shawn Edwards <lesser.evil at gmail.com>
@@ -442,13 +457,15 @@ Volker Assmann <volker at twisted-nerve.de>
 VRan Liu <gliuwr at gmail.com>
 Vu Pham <vu at mellanox.com>
 Walter Huf <hufman at gmail.com>
+wangchaunhong <root at A22832429.(none)>
 Wang, Yaguang <yaguang.wang at intel.com>
 Warren Usui <warren.usui at inktank.com>
+Wei Feng <feng.wei at h3c.com>
 Weijun Duan <duanweijun at h3c.com>
-weill <weilluo at tencent.com>
 Wei Luo <luowei at yahoo-inc.com>
+Wei Luo <weilluo at tencent.com>
 Wei Qian <weiq at dtdream.com>
-wenjunhuang <wenjunhuang at tencent.com>
+Wenjun Huang <wenjunhuang at tencent.com>
 Wesley Spikes <wesley.spikes at dreamhost.com>
 Wido den Hollander <wido at 42on.com>
 William A. Kennington III <william at wkennington.com>
@@ -475,12 +492,15 @@ Yazen Ghannam <yazen.ghannam at linaro.org>
 Yehua Chen <chen.yehua at h3c.com>
 Yehuda Sadeh <yehuda at inktank.com>
 Yehuda Sadeh <ysadehwe at redhat.com>
+Yongqiang He <he.yongqiang at h3c.com>
 Yongyue Sun <abioy.sun at gmail.com>
-youji <youji at ebay.com>
+You Ji <youji at ebay.com>
 Yuan Zhou <yuan.zhou at intel.com>
 Yunchuan Wen <yunchuan.wen at kylin-cloud.com>
 Yuri Weinstein <yuri.weinstein at inktank.com>
 Zengran Zhang <zhangzengran at h3c.com>
+Zeqiang Zhuang <zhuang.zeqiang at h3c.com>
+Zhang Huan <zhanghuan at ict.ac.cn>
 Zhe Zhang <zzxuanyuan at gmail.com>
 Zhicheng Wei <zhicheng at opensourceforge.net>
 Zhi (David) Zhang <zhangz at yahoo-inc.com>
diff --git a/ChangeLog b/ChangeLog
index 05771ab..3327a42 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,32 +1,384 @@
-9180a92 (HEAD, tag: v10.0.1, origin/jewel) 10.0.1
+86764ea (HEAD, tag: v10.0.2, origin/jewel) 10.0.2
+ebdb0a9 packaging: add build dependency on python devel package
+e59293d librbd: add additional granularity to lock states
+94b7d23 librbd: initialize object map before replaying journal
+4fa53ea librbd: do not accept RPC requests while transitioning lock state
+5216146 librbd: avoid error message when peer owns exclusive lock
+eb120e8 librbd: correct include guard in RenameRequest.h
+1329a09 librbd: disable write_full optimization if image has parent
+c425bf0 tests: new test case for write_full optimization
+df79c3e librbd: directly execute copy-on-read copyup operations
+5f1a530 OSD::ScrubJob: correctly handle small osd_scrub_interval_randomize_ratio
+64cb6e5 Revert "osd: fix arithmetic exception when scrub_min_interval is small"
+67be35c OpRequest: release the message throttle when unregistered
+14fd1c7 librbd: disable copy-on-read when not exclusive lock owner
+8bfd23c tests: fsx should disable journaling feature for krbd
+089673a qa/workunits/rbd: import_export should use clean temp space
+5b065aa librbd: reduce verbosity of common error condition logging
+a3ac03e journal: disconnect watch after watch error
+7bcb744 Revert "rgw: use smart pointer for C_Reinitwatch"
+70a1392 qa/workunits/rbd: rbd-nbd test should use sudo for map/unmap ops
+b4dbaa6 librbd: do not ignore self-managed snapshot release result
+bc309d9 librbd: properly handle replay of snap remove RPC message
+98157ab librbd: ensure librados callbacks are flushed prior to destroying image
+b8d7aa2 librbd: clear error when older OSD doesn't support image flags
+e8911cf tests: flush op work queue prior to destroying MockImageCtx
+bfc9690 librbd: skip journal event if write/discard was no-op
+f6659cc librbd: object map ENOENT optimizations should complete asynchronously
+913339f librbd: update exclusive lock state when shutting down
+200d310 librbd: extra request required if journal enabled w/ caching
+bab16bb librbd: stop the copyup thread during shutdown
+69bcac5 qa/workunits/rbd: use --object-size instead of --order
+94cd34b rbd: process crushed, rbd creating with striping parameters
+8e93f3f log: Log.cc: Assign LOG_DEBUG priority to syslog calls
+35542eb Revert "makefiles: remove bz2-dev from dependencies"
+fec5458 doc: fix typo in developer guide
+d465830 mds: fix setvxattr (broken in a536d114)
+a8b5920 makefiles: remove bz2-dev from dependencies
+9409e96 ceph-fuse: add process to ceph-fuse --help
+eadc771 doc: Update list of admin/build-doc dependencies
+b1429aa mds: tear down connections from `tell` commands
+8d68d02 test/encoding/readable.sh: add non-whole type skip
+34a6c2a rgw: remove unused variable
+a57784f rgw: fix the build failure
+d2d4bb4 librbd: refresh object map after rollback
+db2d495 librbd: refresh image asynchronously from watch/notify path
+16547c9 librbd: the journal state machine is now asynchronous
+2a4aad5 tests: adjust test cases to refactored librbd
+7e3a032 librbd: all object map methods are now async
+8ef37b0 librbd: IO ops will now asynchronously refresh the image if needed
+01bc515 WorkQueue: PointerWQ now supports out-of-band processing
+18f4f69 librbd: flush all in-flight IO on block write and shut down
+bad4641 librbd: add shut down support to the AIO work queue
+f1973b1 librbd: integrate new async image state machines
+8740ddf0 librbd: ImageWatcher should release lock upon request
+c00b28b librbd: implement image state tracker for open/close/refresh/etc
+3fd24eb librbd: rename ImageRefresh to ImageState
+1673332 librbd: object map lifespan now controlled by state machines
+ccdeb15 librbd: decouple ImageWatcher from exclusive lock management
+6e1718c librbd: separate image refresh into async state machines
+22eb25b tests: exclusive lock state machines test cases
+8aabb8c librbd: separate exclusive lock handling into async state machines
+03b79ab librados_test_stub: additional definitions and mock functions
+1912774 tests: mock test cases for new async object map lock/unlock/refresh ops
+7248b93 librbd: async object map lock/unlock/refresh state machines
+700a0de librados_test_stub: additional mocked IoCtx methods
+dd253af librbd: convert object_map::InvalidateRequest to template
+8a4c155 librbd: support updated async cls_rbd API
+fc4370e cls_rbd: async version of class helpers added
+e2eb27f librbd: fix missing header dependencies for parent_types.h
+3f3a040 librados_test_stub: implemented ObjectReadOperation::stat
+8a33a89 tests: restrict rebuild object map test case
+3d25f51 librbd: moved object name helpers to librbd::util namespace
+06f2eb2 tests: integrate cleaned up librbd AIO callback adapter
+004090a librbd: cleanup librbd AIO completion callbacks
+c67725b librbd: cleanup Context and librados AIO completion callbacks
+9180a92 (tag: v10.0.1) 10.0.1
+00cfe4e tests: new integration test for validating new RBD pools
+1fea4da librbd: optionally validate RBD pool configuration (snapshot support)
+664903f journal: avoid race between flush request and in-flight flushes
+3e6c990 librbd: fix test case race condition for maintenance requests
+15c840c rbd: fix build with "--without-rbd"
+d5cee59 doc/cephfs/posix: update
+df0c1f5 journal: add iohint flags for journal replayer.
+84310c5 journal: Add iohint flags for journal write ops.
+5440334 doc: dev: introduction to tests
+09e21b4 init-ceph: do umount when the path exists.
+14e9d29 mount/mtab.cc: memory leaks the free() should be called to free resources, in order to avoid memory leaks
+58bad43 client: modify a word in log
+c36d73e doc: Modified a note section in rbd-snapshot doc.
+ad15b81 vstart.sh: mon osd reporter subtree level = osd
+8cc6301 mstop.sh, mrgw.sh: more scripts to control daemons in dev env
+df92e26 mrun: a tool to run commands against specific mstart cluster
+dfa403c mstart.sh: run multiple clusters in a single source tree
+61aa107 cmake: add run_cmake-check.sh
+9f34737 mailmap: YankunLi affiliation
+b3c28b9 mailmap: Nishtha Rai affiliation
+0828cb4 mailmap: John Coyle affiliation
+0d3605f mailmap: Jashan Kamboj affiliation
+e108b22 mailmap: Adam Kupczyk affiliation
+c65bab1 mailmap: Jacek J. Lakis affiliation
+41a18ff mailmap: runsisi affiliation
+458c59a mailmap: Zeqiang Zhuang affiliation
+e2cd81d mailmap: You Ji affiliation
+5991644 mailmap: Wenjun Huang affiliation
+5ed7d09 mailmap: Wei Luo affiliation
+ea8ad14 mailmap: Sangdi Xu name normalization
+e9aa132 mailmap: Robin H. Johnson name normalization
+fee41ad mailmap: Guang Yang name normalization
+e2f163a tests: fix failure for osd-scrub-snap.sh
+5876829 tests: kill_daemons uses TERM instead of KILL
+68125dd librbd: fix merge-diff for >2GB diff-files
+213cd78 rgw: fix a glaring syntax error
+2b50a75 Implementation of rados_ioctx_snapshot_rollback
+7f5ee33 librbd: fix test case race condition for journaling ops
+9162bd2 Revert "LifeCycle feature"
+99bcc7c tests: --osd-scrub-load-threshold=2000 for more consistency
+26008e8 ceph-detect-init: Ubuntu >= 15.04 uses systemd
+93cdc98 pybind/rbd.pyx: misc typo bugfixes
+469b55a pybind/rbd.pyx: PyString -> PyBytes
+6f1fb16 doc: osd: s/schedued/scheduled/
+7d93cf4 buffer: make usable outside of ceph source again
+72785ee rgw: enforce SLO part's ETag match during GET on SLO of Swift.
+1cf149d rgw: append X-Static-Large-Object attribute during PUT on SLO.
+630eac1 rgw: add support for normalisation of ranged SLO requests.
+31f1b89 rgw: fix wrong ofs handling during GET on Swift's SLO.
+dabf535 rgw: handle errors during GET on Swift's SLO appropriately.
+fa71082 rgw: make response for multipart delete on a SLO compliant with Swift.
+7e06dba rgw: add support for multipart delete on a SLO of Swift.
+0a36b95 rgw: fix iterate_slo_parts() and improve its declaration's formatting.
+4bab3fc rgw: move RGWBulkDelete in rgw/rgw_op.h.
+1df28c8 rgw: iterate over slo parts
+7f5fdce rgw: store slo info when putting object
+d8593b1 rgw: read and parse put slo obj request
+ffb4691 rgw: s/obj_manifest/dlo_manifest
+8eb8048 rgw: support bucket removal in Bulk Delete API implementation.
+60018d9 rgw: improve response code handling for Bulk Delete API.
+c421037 rgw: enable chunking in Bulk Delete API implementation.
+b665b04 rgw: add support for Bulk Delete API of Swift.
+2ff837f rgw: rework authorization parts to accomodate Bulk Delete API.
+4ea0c7c rgw: XMLFormatter may print XML tags lowercased and underscored now.
+3ecc2c3 rgw: RGWFormatter_Plain does support key-value style for Bulk Delete API.
+2b45910 rgw: implement std::streambuf over RGWClientIO.
+619e945 rgw: improve code formatting in Swift's DLO implementation.
+b48f3d7 rgw: fix wrong length in Content-Range HTTP header of Swift's DLO.
+66f47f3 rgw: fix wrong first byte pos in Content-Range HTTP header of Swift's DLO.
+2945bef rgw: add support for putting Swift's X-Object-Manifest through POST.
+1b6a7e3 doc: note that cephfs auth stuff is new in jewel
+2c7c712 tests: osd-scrub-snaps.sh to display full osd logs on error
+71690e7 rbd: add --object-size option
+a69367d ceph-mds:add --help/-h
+07c334a doc:file must be empty when writing layout fields of file use "setfattr"
 247ee60 build/ops: enable CR in CentOS 7
 b47eeae tests: centos7 needs the Continuous Release (CR) Repository enabled for libunwind
+7d48f62 LifeCycle feature As same as amazon S3 interface,"PUT Bucket lifecycle" and "DELETE Bucket lifecycle" have been implemented, "GET Bucket lifecycle" not realized yet as S3cmd has not realize it also. The feature`s main point is to remove expire file per day. Files transfer from hot layer to cold layer is not supported. ToDo:Maybe to transfer from replicate pool to EC pool or from ssd to sata pool will be valuable.
+cd0c13b qa/workunits/cephtool/test.sh: false positive fail on /tmp/obj1.
+a80ff1f common/ceph_context.cc:fix order of initialisers
+0186cdc rgw:dont update entrypoint when removing bucket
+a828a4d osd: fix arithmetic exception when scrub_min_interval is small
+9d88f1a osd: more scrubber fields in pg query output
+15b2405 osd: use hexdump when logging CRC values
+9af8d6f qa/krbd: Expunge generic/247
+2eb0bf3 tests: make sure no segfault occurs when using some bad keyring
+f2f92c8 cmake: add TracepointProvider.cc to libcommon
+3e23e45 doc: Fixed incorrect name of a "List Multipart Upload Parts" Response Entity
+c83d6db cmake: update for recent rbd changes
+25c70cf Update Jiaying Ren affinity
+c11ca42 Update Rongze Zhu affinity
+593c124 doc: rst style fix for pools document
+e62954e deb,rpm: package buffer_fwd.h
 1adf306 SubmittingPatches: there is no next; only jewel
+8160f9e Add common/PluginRegistry.cc to CMakeLists.txt
+c1daf4e doc/dev/index.rst: wholesale refactor
+a62b5ac pybind: support ioctx:exec
+7f81728 common/Makefile: ship common/event_socket.h
+c8f7d44 build/ops: systemd ceph-disk unit must not assume /bin/flock
 73aab5e test: use sequential journal_tid for object cacher test
+6cbf128 tests: wait for mocked requests to complete
+fde9f78 librbd: do not complete AIO callbacks within caller's thread context
+32597ca librbd: include missing header for bool type
+5622d3f rbd: alow to specify options for created journal
+47f5610 librbd: journal options on image create/clone/copy
+89c716f librbd: output operator for image options
+e72fc02 librbd: journal: mark entry committed after replay
+e8a584f rbd: add new journal rbd commands
+7311185 librbd: humanize ImageWatcher and Journal states in debug log
+bb10815 librbd: add image pool ID to journal object names.
+61ad15c librbd: allow alternate pool for journal objects
+ebadfef librbd: debug: log journal metadata
+7248fb5 librbd: new config options to tweak journal settings
+e9317f0 SubProcess: fix multiple definition bug
+02a9a41 osd: Test osd_find_best_info_ignore_history_les config in another assert
+04b4795 test/librados/test.cc: clean up EC pools' crush rules too
+6807b35 doc/dev/index.rst: fix links
+f336640 doc/dev/index.rst: Flesh out the IRC section
+8d281bc doc/dev/index.rst: Make Issue tracker a separate chapter
+91f01bd librbd: partial revert of commit 9b0e359
+4e15c03 pybind/rbd.pyx: Return something useful from str() and repr()
+6e7f9c8 doc/dev/index.rst: start writing Bugfixing chapter
+b328131 cmake: add rgw_basic_types.cc to librgw.a
+002d26b doc/dev/index.rst: rewrite introductory material
+e4218a2 KeyValueStore: Don't queue NULL context under test purpose
+ba346bf qa: add a test for the recovery_priority/recovery_op_priority pool settings.
+f0cc722 mon: support recovery_priority and recovery_op_priority with monitor commands
+cd2bc41 Objecter: clean up Objecter.h/ObjectOperation
+053ee91 PGLog::rewind_divergent_log: fix rollback_info_trimmed_to before index()
+66c7246 TestPGLog: add test for 13965
+7455897 osd: prioritize recovery based on pool's customized priority
+bc21a23 doc/dev/index.rst: fix headings
+b6f9a9b doc/dev/index.rst: begin writing Contributing to Ceph
 fb120d7 osd: call on_new_interval on newly split child PG
+34b3283 common: lockdep now tracks lock ordering when backtraces disabled
+60519e2 librbd: correct lock ordering issues discovered by lockdep
+1997144 doc: remove unnecessary period in headline
+2a0263f journal: correct lock ordering issues discovered by lockdep
+4230504 rgw: remove comments
+eda44cd rgw: a minor cleanup
+bf0a7b4 rgw: don't re-set bucket tenant and name when selecting location
+ddb4caa rgw: fix a typo
+2287ce9 rgw: don't allow cross-tenant bucket creation
+e5bfd94 rgw: avoid calling rgw_make_bucket_entry_name() when not needed
+9425b04 rgw: objexp hint name backward compatibility
+e8de349 rgw: inherit bucket tenant from user if not specified
+acda806 rgw: add a missing cap type
+bc091ed pybind/rbd.pyx: only set self.closed after a successful close
+d10c61a pybind/rbd.pyx: remove redundant RBD.__init__ method
+8931875 ceph.spec: Cython is spelled python-Cython in OpenSuSE
+07ae545 pybind: Improvements to Cython build
+ba449e3 tests: added new rbd mirroring CLI commands
+7e6fd07 rbd: add new 'mirror pool' rbd commands
+bfe295f tests: librbd new pool mirror API methods
+2a3cdbf librbd: new pool mirror API methods
+869bc8c ceph-dencoder: new cls::rbd::MirrorPeer type
+c45a3ff test: new cls_rbd methods for pool mirroring
+54b1c8d cls_rbd: new methods for handling mirroring
+b7b3693 tests: initial set of test cases for op state machines
+f8a9aef librbd: snap create doesn't properly handle race conditions
+9809c3c librbd: use md_ctx for self-managed snapshot management
+835989c librbd: convert op state machines to templates
+5b63666 tests: add gmock unit tests for librbd object map state machines
+dfbcbc7 librbd: ensure object map is invalidated on disk if forced
+dc2b176 common: PluginRegistry modification
+5eb64dd common: PluginRegistry
+e8002c3 configure.ac: macro fix
 e9daed2 rgw: use smart pointer for C_Reinitwatch
 c4fbec7 rgw: fix partial read mime map issue
+e04beab tools/cephfs: enable tag filter in DataScan
+9e71aba tools/cephfs: fix datascan kwarg parsing
+786d111 cls_cephfs: add PGLSCephFSFilter
+9c266e5 common: re-enable backtrace support
+e20ef4b ceph_test_libcephfs: don't check order of dentries in readdir result
+8a80652 client: don't choose dirfrag when opendir
+b0deee4 Update man page for new rbd nbd command
+dcb78b2 update test_librbd_fsx to support rbd-nbd
+2c7cc53 add rbd-nbd man page reference
+330c7f8 add rbd-nbd wrap to rbd CLI
+bff706f cls/cls_rbd.cc: no need to skip key == after.
+685fc1c journal: fire replay complete event after reading last object
 f914b8d rgw: fix rgw_admin partial read issue
+61f61a2 osd: add recovery_priority and recovery_op_priority to pool_opts_t
+13925c7 init-ceph.in: Allow custom cluster names during startup.
+708ec2b tests and tools/scratchtool: Don't attempt to use NULL xattr
+4ec0def librados.cc: rados_getxattrs_next: don't try to use malloc(0)
+db9ec69 osd: mark osd backend type in osd_data dir
+880a59d osd: make block device fsid probing generic
+999b24f Implemented log message size predictor. It tracks size of log messages. Initial allocation size is derived from last log message from the same line of code. Fixed bug in test.
+9cca28a pybind: convert librbd bindings to Cython
+8655cca debian/control: make python-rbd depend on minimum librbd version
+0f5ce5e Makefile: break out local targets into variables
+5113c96 configure/packaging: introduce Cython dependency
+2f36909 init-ceph: fix systemd-run cant't start ceph daemon sometimes
+2ab79d7 tools/cephfs: fix layout handling in injection
+184914b cmake: define STRERROR_R_CHAR_P for GNU-specific strerror_r
+e27109b doc: INSTALL redirect to online documentation
+35403da qa/workunits/cephtool/test.sh: no ./
 88e6694 osd: fix ClassHandler::ClassData::get_filter()
+5b35eb8 PG: fix message refcounter in replica_scrub
+9fba402 doc: typo fix in cephfs/quota
+fd684ca librbd/JournalReplay: Only Send signal when aio completions queue empty.
+2548ad8 test: fix osd-scrub-snaps.sh
+ce2d3c7 test: add pg_scrub for ceph-helpers.sh
+c57ceff librbd: Fix rebase with new io flow
+5cc9ff9 librbd: fix lttng tracing argument mismatch
+1da3093 librbd: Add set_event_notify to AioImageRequestWQ
+bddab3e librbd: Remove unneeded set_event_notify
+d625b89 Makefile: Add noinst headr files
+97507d5 librbd: Make rbd header file uses independent enum definition
+c42044a Librbd: Make AioCompletion complete doesn't unlock if callback
+c470e0b Librbd: fix return code of EventSocket init and notify
+d19deac librbd: Add ictx check to avoid AIO_TYPE_NONE completion
+3932714 librbd: normalize notify return code
+6d11474 librbd: Fix incorrect api declaration
+ca32fc5 librbd: check event_notify to avoid extra logic
+c7703db EventSocket: Add new event type pipe support
+0dc9321 test: Add tests for getting arg of completion
+1ab727d librbd: Add interface to let user can get private data from comp
+e7a1506 tests: Add tests for user io event notify
+c3a1edb librbd: Add event notify interfaces
+169cd4d osd: dump number of missing objects for each peer with pg query
+9b0e359 librbd: automatically flush IO after blocking write operations
+3c6e692 include/rados/librados.h: fix typo
+6f6fd2f librbd: utilize common flush helper when closing parent images
+39c5b70 librbd: only enqueue flush completion if a flush is pending
+ccf7c29 Gentoo: _FORTIFY_SOURCE fix.
+206b8e2  common/buffer: forward declare buffer
+aa38700 common/buffer: changed buffer from class to namespace
+083fdbf librbd: check for presence of journal before attempting to remove
+ac35e84 journal: helper method to determine if journal exists
+644d600 librbd: not necessary to hold owner_lock while releasing snap id
 9d06041 rbd: bail if too many arguments provided
 d133f42 rbd: don't append an extra newline after some errors
 1c84681 tests: update unmap.t CLI test
 5ce663f cmake: librbd needs libjournal and libcls_journal_client
+77aef0d rgw: remove unused variable in RGWPutMetadataBucket::execute.
+117e630 Correct typo 'restared' to 'restarted'
+691199b CodingStyle: fix broken URLs
+6b402f5 ceph::buffer, Add cached_crc and cached_crc_adjust count in perf dump in order to track the hit rate and efficiency of crc_cache
+bcb8f36 mon: support min_down_reporter conuted by subtree level
+43c1ba7 tools: fix cephfs-data-scan scan_frags vs. nlink
 06b3b47 mon/PGMonitor: MAX AVAIL is 0 if some OSDs' weight is 0
 b2eefca os: FileStore::_destroy_collection may hide the real mistake.
 04c0360 Fix mon routed_request_tids leak
 baf9da3 pybind: decode empty string in conf_parse_argv() correctly
 903350c ceph_test_keyvaluedb_iterators: Fix broken test
+f79e289 tests: fix race condition testing auto scrub
+26752f9 cmake: update for recent librbd changes
+a7f520c auth: fix a crash issue due to CryptoHandler::create() failed
+e9e0533 auth: fix double PK11_DestroyContext() if PK11_DigestFinal() failed
+574e319 Test:bencher ops counter doesn't increase Signed-off-by: Tao Chang <changtao at hihuron.com>
 0b474c5 mon: don't require OSD W for MRemoveSnaps
+e6dcf14 osd: store per pool scrub intervals in pool options
+2b252d2 tests: workunits should not have ./ (assume it in $PATH)
+73077dd osd: pg_pool_t: add dictionary for pool options
+09c0d8d librbd: fix tracepoint parameter
+014e2f0 ceph.spec.in: use %tmpfiles_create macro
+8a9db37 test/encoding/readable.sh fix
+f76d5d6 pybind: decode empty string in conf_parse_argv() correctly
+9331e03 test: use sequential journal_tid for object cacher test
+720ac2b EventSocket: Add EventSocket structure used for event notification
+f5e0cce osd: don't update rollback_info for replicated pool rollback_info is just needed for ec-pool to rollback the patial committed chunks to previous version. Avoid recording rollback_info in replicated pool to save cpu cost and disk bandwidth
+788077f List: close formatter session on error exit.
+2b390fc osd: don't update unneccessary epoch for pg epoch always remains unless state of cluster changes. Therefore, avoid update epoch for every Op in order to same cpu cost and disk bandwidth.
+75f1412 DiskUsage: close formatter session on error exit
+573151f doc: Fixes a spelling error
+b96c7e6 aix shared library build
 3680dc3 mon/OSDMonitor: block 'ceph osd pg-temp ...' if update is pending
+7a8fd0e tools: add cephfs-table-tool 'take_inos'
+9bd0b11 rgw: fix tenant/bucket parsing
+f7ca00a rgw: make APIs to work with tenants
+1f19b60 rgw: buckets within tenant namespace
+788477a rgw: user has a tenant property
+13a12a5 rgw: add an inspection to the field of type when assigning user caps
+3369a83 librbd: simplify IO method signatures for 32bit environments
+ba3c64c Fix mon routed_request_tids leak
+e242d84 mds: remove MDCache::cap_import_paths
+15b2aca tests: ceph-helpers assert success getting backfills
+3c87598 MOSDOp/MOSDOpReply: Move MOSDOp and MOSDOpReply newest version decoding to the front of decoding function.
+889158a WBThrottle: fix incorrect throttle
+922fea7 client: s/close_sessions/_close_sessions/
+a4924d4 ceph.spec.in: add BuildRequires: systemd
+07a7483 client: close mds sessions in shutdown()
 095c29c ceph.spec.in: make --with lowmem_builder limit _smp_mflags
+56da106 Test:bencher wrong test margin casuses writes over object_size
+917d85f osbench: Adds handling for the lack of required folders ( data & journal ) and adds checking for previous data presence to avoid assertion
+2902030 osbench: Fix race condition that may cause Sequencer::dtor assertion on benchmark completion
+daae180 Doubled marking from line 1151
+ada6e32 osd: slightly reduce actual size of pg_log_entry_t
+d1c9bf6 journal: support replay passed skipped splay objects
+56100ef tests: verify that journal player can handle skipped journal objects
 1509ada mailmap: Jenkins affiliation
 d92f611 mailmap: Burkhard Linke affiliation
 27f81d4 mailmap: Chen Dihao affiliation
 8dc6748 mailmap: Wei Qian affiliation Signed-off-by: Yann Dupont <yann at objoo.org>
+39032ba qa: erasure-code-benchmark technique and plugin selection
+a6433cc qa: erasure-code has --erasure-code-dir
+8789eb9 add aix compile warning
+5b1f962 initialized backtrace variables
 f86eb3f mds: fix scrub_path
 4025f75 doc/release-notes: fix typo
 efbcd12 doc/release-notes: final v10.0.0 notes
+48a71a7 mon: do not ignore a failure report cancellation form osd
+c0c5a6e mon: fix osd failure info in mon
 5972a44 doc: fix message typos in systemd
 9aabc8a test/mon/osd-crush.sh: escape ceph tell mon.*
 72edab2 osd: make some of the pg_temp methods/fields private
@@ -83,32 +435,89 @@ b3ca828 osd: fix send_failures() locking
 12c7e54 osd: no need for regular send_pg_temps
 19b714f osd: just send alive when it is queue
 d5a2f9a osd: fix pg stat reporting
+ea41e48 tools/cephfs: add scan_frags to DataScan
+0151fec tools: refactor DataScan injection
+b7faf67 tools:support printing the crushmap in readable fashion.
+a3a0e1c aix gcc librados port
 f74e310 osd: Only add random deep scrubs when NOT user initiated scrub
 4c19abd Revert "test: osd-scrub-snaps.sh: Randomized deep-scrubs can now happen during a scrub"
 0fe26c2 test: osd-scrub-snaps.sh: Randomized deep-scrubs can now happen during a scrub
+328c663 SubProcess: update to use new constructor
+000306e SubProcess: include iostream
+40b5bcb SubProcess: allow CLOSE/PIPE/KEEP parent std fd
+fda3f7e add rbd-nbd test case
+46a06eb modify stuff follow rbd-fuse
+7bbd54a add rbd-nbd package
+5ac1cbf add rbd-nbd doc
+37f1e84 add rbd-nbd tool
+8e2831b rgw: Remove unused code in PutMetadataAccount:execute
 07f68b5 Typo in the apt-get command. Signed-off-by: Chris Holcombe <xfactor973 at gmail.com>
 3193ee1 scripts: ceph-release-notes for development versions
 c44ab62 release-notes: draft v10.0.0 release notes
+9359847 librbd: commit journal op events immediately
+18713e6 mon/PGMonitor: MAX AVAIL is 0 if some OSDs' weight is 0
 1420a1f doc: add v0.80.11 to the release timeline
 9e9b03e doc/releases: add v0.80.11 to release table
 4b5afe5 doc/release-notes: final v0.80.11 notes
 6316ff8 10.0.0
 99ba661 13207: Rados Gateway: Anonymous user is able to read bucket with authenticated read ACL
 1536cb0 osd: note down the number of missing clones
+9109f14 common/hobject.h: don't reverse bits in zero
 3b146f5 RadosClient: reimplement the pool alignment methods using the new ones
 1633d3e doc: Update ceph-disk manual page to remove some option description.
+ac84faa librbd: improve debug output for object map state machines
+aa02a07 librbd: owner_lock should be held while opening parent image
+6f94bde librbd: ImageWatcher shouldn't block the notification thread
+da473a7 tests: add missing test cases for rbd exclusive-lock RPC
+2d340da librbd: possible recursive reader lock during resize
+3d6cde0 librbd: possible unit test race condition
+63f9ae3 tests: updated librbd::Journal API
+cd3d056 librbd: journal op requests
+c4247f5 librbd: new journal types for maintenance commands
+c1bca63 tests: added snap protect/unprotect and rename to client update test
+59a7615 tests: new watch/notify rename image tests
+8351ed0 librbd: added async image rename op
+dceb6cf librados_test_stub: added ObjectWriteOperation::tmap_update
+8c8a917 tests: update cls_rbd test for async dir_rename_image method
+5e20ffe cls_rbd: dir_rename_image is now async
+cea6a23 tests: new tests for watch/notify snap protect/unprotect ops
+983d12b librbd: snap protect/unprotect now connected to watch/notify
+8311504 librbd: async ops should return status via the Context
+8a15fa6 tests: new ImageWatcher tests for snap protect/unprotect and rename
+01abb20 librbd: new watch/notify types for snap protect/unprotect and rename
+f02e446 librbd: rename librbd::WatchNotify namespace to librbd::watch_notify
+daf36f0 librbd: do not send duplicate lock event
+9df7e39 librbd: drain op work queue after shutting down journal
+d1541d6 librbd: migrate object map snapshot ops to async state machines
+6326d07 cls_rbd: add async version of object_map_load
+36032f8 librbd: consolidate object map invalidation to new state machine
+7b1170a librbd: move object map async ops to standalone classes
+4944f20 librbd: initial conversion of snapshot ops to async versions
 d3d139b doc: Update ceph-disk manual page with new feature deactivate/destroy.
 9cbe132 pep8 changes
 cb18a10 Add test cases to validate symlinks pointing to devs
 b3c7cb0 Compare parted output with the dereferenced path
 7d6002b Cephfs will crash if enabling async msg because of an assertion
+d075628 tests: update cls_rbd tests for async snapshot methods
 04e3810 osd: partial revert of "ReplicatedPG: result code not correctly set in some cases."
+22a0824 cls_rbd: convert snapshot methods to async versions
+6a12b95 librbd: new intermediate request class for handling journal ops
+ff92427 librbd: move all image operation state machines to new namespace
+a701d5e librbd: remove AsyncRequest::safely_cancel
+2e17bdc librbd: simplify state machine lock assumptions
+daa7594 librbd: async version of AioImageRequestWQ::block_writes
+c62a07a librbd: removed Async* prefix from request state machines
 f92f741 librbd: copy operation needs to use AIO work queue for writes
 ee7c6f7 librbd: simplify IO flush handling
 cb634df librbd: possible deadlock attempting to drain parent image WQs
 b118d7d WorkQueue: PointerWQ drain no longer waits for other queues
 5875345 tools/cephfs: use snprintf in Dumper
 5c2815e tools/cephfs: enlarge dump header
+5bfe05a Speed optimizations. Merged 3 writes into 1. Got rid of std::string construction. More unification on syslog,stderr,fd.
+ce9a596 mds: consider client's flushing caps when choosing lock states
+655ae79 mds: choose EXCL state for filelock when client has Fb capability
+9ab61b2 client: cancel revoking caps when reconnecting the mds
+9e9770c os: FileStore::_destroy_collection may hide the real mistake.
 2a3040b ceph-disk: remove the redundant try except and minor nits
 b954c51 tests: ceph-disk: add wait_for_osd_down() in ceph-disk-test.py of qa
 0f892e6 tests: ceph-disk: modify the ceph-disk qa test cases
@@ -125,6 +534,7 @@ d490fe9 tests: ceph-disk: Make unit test coverage all ceph-disk destroy/deactiva
 be471a2 ceph-disk: use `ceph osd dump` to check osd status
 f064622 ceph-disk: add destroy feature
 3fcdf41 ceph-disk: add deactivate feature
+1a62667 admin/build-doc: add lxml dependencies on debian
 00a9ce7 tests: fix typo in TestClsRbd.snapshots test case
 2622993 (tag: v10.0.0) 10.0.0
 5aa840a rbd: support negative boolean command-line optionals
@@ -144,6 +554,8 @@ f58ffdc tests: new rbd CLI command aliases
 cf408a3 ceph_test_msgr: Use send_message instead of keepalive to wakeup connection
 b7df772 osd: randomize deep scrubbing
 511435f client: avoid creating orphan object in Client::check_pool_perm()
+64b104c client: fix deadlock related to async pagecache invalidation
+ce03694 ceph_test_keyvaluedb_iterators: Fix broken test
 f02a51f scrub: do not assign value if read error for ECBackend
 8bb61d3 scrub: do not assign value if read error for ReplicatedBackend
 a8b7464 osdservice: state changed to atomic_t to decrease thread context switch.
@@ -183,9 +595,13 @@ c73e96a radosgw-admin: fix cli tests
 4ff0368 osdmaptool: fix cli tests
 a5b0465 crushtool: fix cli tests
 b7bb216 crushtool: fix cli test help
+2829e9d doc: flesh out MDS auth docs
+a536d11 mds: apply MAY_SET_POOL in request handling
+eee4b8f mds: add MAY_SET_POOL in MDSAuthCaps
 0533cf9 osd: fix wrong use of right parenthesis
 ef011da Update .organizationmap
 0fd8de3 msg/async: support of non-block connect in async messenger
+f7f55e3 scrub: compare omap_digest with each other
 785e58e scrub: clarify the result report
 a3aa565 journal: avoid holding lock while marking ops are complete
 4719696 cmake: updates for refactored librbd IO path
@@ -249,6 +665,7 @@ b85a5fe librbd: rename AioRequest classes to AioObjectRequest
 3a9b869 pybind/rbd.py: add new journaling feature code
 750771c librbd: add new RBD_FEATURE_JOURNALING feature code
 eb020b6 os: write file journal optimezation
+7318384 stringify: Enable optimization for GCC only
 102539e librbd: API: options on image create: update tests
 c3be44e librbd: API: options on image create
 4052282 cmake: add nss as a suffix for pk11pub.h
@@ -261,8 +678,10 @@ d911641 journal: update allocated tid when skipping committed entry in player
 0669cba use new api and fix some wrong flag caller
 628f69f save init flags to CephContext
 925596a osd: check do_shutdown before do_restart
+44b1488 stringify: Reduce CPU usage by reusing stringstream in stringify function
 016ed34 rados: Minor output changes for consistency across operations
 3ea903e cmake: fix files list
+5ed8cdc tools:print the map infomation in human readable format.
 a1b690d cls::journal: fixup: constify dump functions
 0b261e2 journal: call metadata shutdown on journal remove
 0dd6e0f journal: don't use object_number when comparing positions
@@ -302,6 +721,7 @@ d548b5f mon: revert MonitorDBStore's WholeStoreIteratorImpl::get
 f018928 revise organization
 d290b27 osd: trivial optimization
 d28698b osd: fix trivial bug
+1ace4d0 auth: keyring without mon entity type should return -EACCES             test:                          see test.sh:test_mon_caps                          before modify:                          when we first exec ../qa/workunits/cephtool/test.sh -t mon_caps --asok-does-not-need-root , it stuck.                          after modify:                          exec again, return Permission denied.
 f7f5a08 internal: remove unused local variables
 c8fe5ae librados: cast oid to object explicitly before call ioctx methods Cast oid to object explicitly before call ioctx methods. Signed-off-by: xie xingguo <xie.xingguo at zte.com.cn>
 e986ade IoCtxImpl: remove unused variable sName
@@ -310,9 +730,11 @@ a5651b8 Revert 0374bb4a2f5054d606e4aba2d97b5e6765e781b0
 9689fe0 kv: fix string ctor usage
 bfeb90e librbd: fixed deadlock while attempting to flush AIO requests
 a9729d9 tests: new test case to catch deadlock on RBD image refresh
+22d2732 MOSDRepOpReply: Simple Messenger optimization
 d33842d tests: librbd: admin socket commands to flush and invalidate cache
 0996f9d librbd: flush and invalidate cache via admin socket
 39503f5 librbd: perf counter for cache invalidates
+c6ee5fa MOSDRepOp: Simple Messenger optimization
 3b39226 tests: fix typo in TestClsRbd.snapshots test case
 8ad594f tracing: fix librados signed/unsigned warnings
 057d39a os/osd: disable extra iterator validation
@@ -332,6 +754,7 @@ db85bdd FileStore: support multiple ondisk finish and apply finisher
 f33282e doc/releases-notes: fix build error
 9224ac2 rbdmap: systemd support
 1b000ab rgw: fix reload on non Debian systems.
+7e11ef7 mds:the ceph-mds command option "--hot-standby" is useless.
 78dbd13 ceph.spec.in: add new cls_journal RADOS class
 ea4971c journal: FutureImpl shouldn't hold lock while invoking callbacks
 ea275cc tests: journal updates to support C++11
@@ -396,6 +819,7 @@ ba39d33 rbd: move rbd to tools/rbd subdirectory
 c0980af rbdmap: Move do_map and do_unmap shell functions to rbdmap script
 88e0b2c AsyncConnection: Let receiver ack message ASAP
 508bd87 librados: wrongly passed in argument for stat command
+102f0b1 auth/cephx: large amounts of log are produced by osd if the auth of osd is deleted when the osd is running, the osd will produce large amounts of log.
 619d804 FileStore::_check_replay_guard avoids double check on replaying and can_checkpoint() Already checked in _check_replay_guard, avoid double check in the inner function _check_global_replay_guard
 c228bd2 [mailmap] add member info. Signed-off-by: Xiaowei Chen <chen.xiaowei at h3c.com>
 b0536eb librbd : fix enable objectmap feature issue
@@ -427,7 +851,12 @@ f8b2fb9 mds: inode_t: add scrub stamp and version for latest complete scrub
 1627b45 MDSContinuation: remove expectation that it's using an MDR
 b789edd mdstypes: dentry_key_t: add an is_valid() function
 e09e548 mds: CDir: rearrange constructor
+c4f68b0 common/buffer.h: removed unneeded list destructor
 bb2ecea (tag: v9.2.0, origin/infernalis) 9.2.0
+a77bfd0 mds: refactor availability check
+8f0d796 mds: don't use g_conf from MDSMap
+21f5af0 client: a better check for MDS availability
+7db7eff OSD / ShardData: Pass ctx to mutex constructors in sdata and sdata_ordering lock to allow gain perfcounter values.
 da48dbb rbd: fix clone issue when we specify image feature
 a603429 tests: test/librados/test.cc must create profile
 d5be20b librbd: resize should only update image size within header
@@ -437,6 +866,10 @@ ab46d79 tests: add destroy_ec_profile{,_pp} helpers
 e382c67 init-rbdmap: Rewrite to use logger + clean-up
 5a6117e Objecter: remove redundant result-check of _calc_target in _map_session.
 8655416 Objecter: potential null pointer access when do pool_snap_list.
+5def4b7 osd: reoder fields in ObjectContext and ObjectContext::RWState structs
+476a685 osd: reoder fields in object_copy_cursor_t struct
+7401124 osd: reoder fields in ScrubMap::object struct
+48d424c osd: reoder fields in pg_stat_t struct
 b9ac90d osd/PG: tolerate missing epoch key
 c9681fd osd: merge local_t and op_t tnx to single one
 43ba820 mon:honour last seen election epoch in win_standalone_election()
@@ -464,6 +897,8 @@ e0b3965 osd: Don't crash if OI_ATTR attribute is missing or corrupt
 9e48e18 osd: Additional _scrub() check for snapset inconsistency
 3b381ca osd: Better SnapSet scrub checking (find issues instead of asserting)
 a23036c osd: Make the _scrub routine produce good output and detect errors properly
+b3f8d56 osd: reoder fields in ObjectRecoveryProgress struct
+bf3c30c osd: reorder and trim fields SnapSetContext
 e0fd540 rgw:swift use Civetweb ssl can not get right url
 b698a76 rgw: Fix typo in RGWHTTPClient::process error message
 173bfd0 rgw: link against system openssl (instead of dlopen at runtime)
@@ -478,6 +913,7 @@ f22f4ac mailmap: Xie Xingguo affiliation
 976a24a crush/mapper: ensure bucket id is valid before indexing buckets array
 4300f2a krbd: remove deprecated --quiet param from udevadm
 f46f7dc run_cmd: close parent process console file descriptors
+cb2d454 rgw/rgw_resolve: musl libc does not implement res_nquery. Added fallback to res_query.
 6f960fd rgw: add x-amz-request-charged header
 7fcd423 osd: check OSDSuperblock in mkfs() when it already have superblock
 e684e42 osd: test mkfs failure when the osd try to do mkfs again.
@@ -489,10 +925,12 @@ ace7dd0 FileStore: potential memory leak if _fgetattrs fails
 f119fb5 doc: download GPG key from download.ceph.com
 ed88d88 doc/release-notes: v0.94.5
 284f4df release-notes: draft v0.94.5 release notes
+73beb7f client: don't invalidate page cache when inode is no longer used
 dd31d4a rocksdb: remove rdb source files from dist tarball
 1e4b37d release-notes: draft v0.80.11 release notes
 ce95ce1 tools: ceph-release-notes support multiple issues
 a704c5d vstart.sh: grant full access to Swift testing account
+cfa2d0a fine-grained control systemd to start/stop/restart ceph services at once
 56d6929 KeyValueStore: fix the name's typo of keyvaluestore_default_strip_size
 545e4b2 osd: Fix log message name of ceph-objectstore-tool
 631469c Revert "Speed optimizations. Merged 3 writes into 1."
@@ -567,6 +1005,7 @@ f824c93 osd/ReplicatePG: skip flush/evict pinned objects
 5cd10e4 osd: force promotion when pin an object in cache tier
 af8d6ec osd: add support of pin/unpin objects in cache tier
 28b7205 rados: add the support of pin/unpin object in cache tier
+fa3822c pybind/cephfs: add symlink and its unit test
 19d0a59 Fix Makefile in example/librados file.
 904c0e9 pybind: Use basestring as string type for Python 2
 ab6b923 pybind: Add Python 3 support for rados and rbd modules
@@ -599,6 +1038,8 @@ cc0fcba test_rgw_admin: musl libc defines stdout as read-only. Use freopen for o
 c7d96a5 osd: init objecter after we authenticate
 d6803b8 drop envz.h includes
 5e81140 client: sys/file.h includes for flock operations
+c418265 os/filestore: The usage of __SWORD_TYPE hinders portability because it's not standardized. Removed __SWORD_TYPE dependency.
+1bab773 os/fs: The usage of __SWORD_TYPE hinders portability because it's not standardized. Removed __SWORD_TYPE dependency.
 e138e78 common/MemoryModel: Alpine is a linux variant but does not implement mallinfo(). Added explicit feature check.
 c40754b compat: use prefixed typeof to support stricter environments
 4f7bcab assert: __STRING is not defined by musl libc. Define __STRING when it is missing.
@@ -670,6 +1111,7 @@ b4c5620 doc: remove toctree items under Create CephFS
 8855e60 ReplicatedPG::maybe_handle_cache_detail: always populate missing_oid
 da4803e ReplicatedPG::_rollback_to: handle block on full correctly
 be35ea9 release-notes: draft v0.94.4 release notes
+c1d48ff osd: use pg id (without shard) when referring the PG
 2b7ddde osd: Correct the object_info_t::decode() version
 03078ba rgw: location constraints should return api name
 a077301 mon/OSDMonitor: put crushtool error in log
@@ -701,6 +1143,8 @@ c2a83d0 ceph-dencoder: new rbd_replay trace file types
 e692773 rgw: add support for skipping manifest parsing during GET on Swift object.
 a52383d client: don't mark_down on command reply
 1e57e6d mds/Session: use projected parent for auth path check
+a1f19678 tests: add test for history alloc counter in bufferlist
+4014e31 common: perf counter for bufferlist history total alloc
 116bc83 ceph_test_libcephfs: parse env properly (access)
 9489359 ceph_test_libcephfs: parse CEPH_ARGS properly
 21236ac release-notes: draft v0.94.4 release notes
@@ -752,6 +1196,8 @@ a6a6923 osdc/Objecter: send FULL_TRY and FULL_FORCE ops despite full flag
 6cf34a3 mon: drop any ops from closed sessions in dispatch_op
 a875826 mon: always set up session; move waitlist logic
 e2e1bd9 mds: avoid emitting cap warnings before evicting session
+392ca8b filestore: add objectstore finisher name
+2c04989 common: add latency perf counter for finisher
 8e930e3 messages/MOSDOp: avoid uninit/undecoded fields in print()
 362b18a mon: fix msg leak in resend_routed_requests
 c9dad52 Mon: Fix decoded message leak when this monitor is leader
@@ -909,6 +1355,7 @@ e52204c client: fix quote enforcement on subdir mounts
 15e19a4 client: refactor quota check functions
 e7f277b rgw/rgw_admin: Checking the legality of the params There is no messages When some params are invalid. so the Program should be added the function which checks params, if the params are invalid, the program will give some messages.
 f1d8a8f Objecter: repeated free op->ontimeout.
+4a0e56f tools/ceph-kvstore-tool: handle wrong command line argv
 0635b13 Objecter: maybe access wild pointer(op) in _op_submit_with_budget.
 482d4e5 AsyncConnection: Add new debug log
 a1eb380 osd/ReplicatedPG: fix ENOSPC checking
@@ -1207,6 +1654,8 @@ ab4232b rgw: init_rados failed leads to repeated delete
 e48cec3 mon: disable gmt_hitset if not supported
 02f4461 test: mon: mon-scrub.sh: test 'mon scrub'
 8c2dfad osd: force promote for ops which ec base pool can't handle
+8c4323c PerfCounter: Make l_os_queue_lat contains the complete queue latency
+834842c OSD: Add perf counter to count osd thread prepare latency
 70d3108 mon: MonitorDBStore: make get_next_key() work properly
 07a64b9 ceph.spec.in: refrain from duplicating %{_sbindir}/rcceph
 e808904 tests: drop docker-tests.sh root and /dev support
@@ -1272,6 +1721,7 @@ bf82c65 cmake: check_TESTPROGRAMS tests running
 d506bf1 vstart: add -c argument to radosgw-admin commands
 e54f896 ceph.spec.in: drop redundant centos from conditionals
 75f2a98 ceph.spec.in: clean up suse_version conditionals
+af8b3da Messenger: Make fast dispatch message set dispatch timestamp
 929ca5b ceph.spec.in: drop lsb-release dependency from ceph-common
 557e581   mon/MonClient: fix error in 'ceph ping mon.id'   Fixes: #12442
 f65267c rgw : setting max number of buckets for users via ceph.conf option
@@ -1667,6 +2117,7 @@ b610588 ceph.spec.in: remove obsolete SUSE-specific code
 df21a6e osd: expose PGLSFilter in objclass interface
 c318129 ceph.spec.in: Restart services only if they are running
 55cec07 Messenger: Fix rand() generate the same sequence numbers
+ef1434a Narrow journal aio_lock locking scope in write_aio_bl
 15e5ebe common: fix code format
 2d2f0eb test: add test case for insert empty ptr when buffer rebuild
 fb1b6dd common: fix insert empty ptr when bufferlist rebuild
@@ -19591,7 +20042,7 @@ a1cfe74 client: Mods to fix #3184 for messenger shutdown
 eb27f9a Add howto for changing man pages
 d37ca79 mon: update 'auth' help/usage
 1d552a4 rados: fix man page
-8740ddf doc: fix rpm url (part deux)
+8740ddf9 doc: fix rpm url (part deux)
 6c5c939 librbd: fix includes for portability
 c9266d6 rgw: check that realloc succeeded
 4513397 ReplicatedPG: track incoming pushes with perf counters
@@ -23329,7 +23780,7 @@ cf279a8 workunits: print tests pjd runs
 798ef38 osd: delay pg list on a snapid until missing is empty
 e2a9450 obsync: add swift support to obsync
 d21f4ab msgr: turn up socket debug printouts
-891025e udev: drop device number from name
+891025e5 udev: drop device number from name
 a5606ca pybind: trivial fix of missing argument
 e4db129 crush: whitespace
 808763e osdmap: initialize cluster_snapshot_epoch
diff --git a/INSTALL b/INSTALL
index 089ef83..6cabcda 100644
--- a/INSTALL
+++ b/INSTALL
@@ -1,25 +1,12 @@
 Installation Instructions
 *************************
 
-When pulling from git, use the --recursive option to include sub-modules:
+To use Ceph
 
-$ git clone --recursive https://github.com/ceph/ceph.git
+  Read online        http://docs.ceph.com/docs/master/start/
+  Read from sources  doc/start
 
-And then build the configure script with:
+To build from sources
 
-$ ./autogen.sh
-
-Then the usual:
-
-$ ./configure
-$ make
-
-Note that if the FUSE library is not found, the user-space fuse client
-will not be built.
-
-If you are doing development, you may want to do
-
-$ CXXFLAGS="-g -pg" ./configure
-
-or similar to avoid the default (-g -O2), which includes optimizations
-(-O2).
+  Read online        http://docs.ceph.com/docs/master/install/build-ceph/
+  Read from sources  doc/install/build-ceph.rst
diff --git a/Makefile.in b/Makefile.in
index 4419415..ccf6967 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -442,6 +442,7 @@ CXXCPP = @CXXCPP@
 CXXDEPMODE = @CXXDEPMODE@
 CXXFLAGS = @CXXFLAGS@
 CYGPATH_W = @CYGPATH_W@
+CYTHON_CHECK = @CYTHON_CHECK@
 DEFS = @DEFS@
 DEPDIR = @DEPDIR@
 DLLTOOL = @DLLTOOL@
@@ -528,6 +529,7 @@ PYTHON_PLATFORM = @PYTHON_PLATFORM@
 PYTHON_PREFIX = @PYTHON_PREFIX@
 PYTHON_VERSION = @PYTHON_VERSION@
 RANLIB = @RANLIB@
+RDYNAMIC_FLAG = @RDYNAMIC_FLAG@
 RESOLV_LIBS = @RESOLV_LIBS@
 RPM_RELEASE = @RPM_RELEASE@
 SED = @SED@
diff --git a/ceph.spec b/ceph.spec
index c6c5e9d..57092d4 100644
--- a/ceph.spec
+++ b/ceph.spec
@@ -43,6 +43,7 @@ restorecon -R /var/log/ceph > /dev/null 2>&1;
 # /var/run/ceph.
 %if 0%{?fedora} || 0%{?rhel} >= 7 || 0%{?suse_version} >= 1210
 %global _with_systemd 1
+%{!?tmpfiles_create: %global tmpfiles_create systemd-tmpfiles --create}
 %endif
 
 # LTTng-UST enabled on Fedora, RHEL 6, and SLES 12
@@ -54,7 +55,7 @@ restorecon -R /var/log/ceph > /dev/null 2>&1;
 # common
 #################################################################################
 Name:		ceph
-Version:	10.0.1
+Version:	10.0.2
 Release:	0%{?dist}
 Epoch:		1
 Summary:	User space components of the Ceph file system
@@ -106,6 +107,11 @@ BuildRequires:	boost-devel
 BuildRequires:  cmake
 BuildRequires:	cryptsetup
 BuildRequires:	fuse-devel
+%if 0%{?suse_version}
+BuildRequires:	python-Cython
+%else
+BuildRequires:	Cython
+%endif
 BuildRequires:	gdbm
 BuildRequires:	hdparm
 BuildRequires:	leveldb-devel > 1.2
@@ -121,6 +127,7 @@ BuildRequires:	parted
 BuildRequires:	perl
 BuildRequires:	pkgconfig
 BuildRequires:	python
+BuildRequires:	python-devel
 BuildRequires:	python-nose
 BuildRequires:	python-requests
 BuildRequires:	python-virtualenv
@@ -138,6 +145,7 @@ BuildRequires:	yasm
 %if 0%{?_with_systemd}
 BuildRequires:  pkgconfig(systemd)
 BuildRequires:	systemd-rpm-macros
+BuildRequires:	systemd
 %{?systemd_requires}
 %endif
 PreReq:		%fillup_prereq
@@ -253,6 +261,15 @@ Requires:	librbd1 = %{epoch}:%{version}-%{release}
 %description -n rbd-fuse
 FUSE based client to map Ceph rbd images to files
 
+%package -n rbd-nbd
+Summary:	Ceph RBD client base on NBD
+Group:		System Environment/Base
+Requires:	%{name}
+Requires:	librados2 = %{epoch}:%{version}-%{release}
+Requires:	librbd1 = %{epoch}:%{version}-%{release}
+%description -n rbd-nbd
+NBD based client to map Ceph rbd images to local device
+
 %package radosgw
 Summary:	Rados REST gateway
 Group:		Development/Libraries
@@ -628,6 +645,10 @@ install -m 0644 -D etc/sysconfig/ceph $RPM_BUILD_ROOT%{_localstatedir}/adm/fillu
   install -m 0644 -D systemd/ceph-mds@.service $RPM_BUILD_ROOT%{_unitdir}/ceph-mds@.service
   install -m 0644 -D systemd/ceph-radosgw@.service $RPM_BUILD_ROOT%{_unitdir}/ceph-radosgw@.service
   install -m 0644 -D systemd/ceph.target $RPM_BUILD_ROOT%{_unitdir}/ceph.target
+  install -m 0644 -D systemd/ceph-osd.target $RPM_BUILD_ROOT%{_unitdir}/ceph-osd.target
+  install -m 0644 -D systemd/ceph-mon.target $RPM_BUILD_ROOT%{_unitdir}/ceph-mon.target
+  install -m 0644 -D systemd/ceph-mds.target $RPM_BUILD_ROOT%{_unitdir}/ceph-mds.target
+  install -m 0644 -D systemd/ceph-radosgw.target $RPM_BUILD_ROOT%{_unitdir}/ceph-radosgw.target
   install -m 0644 -D systemd/ceph-disk@.service $RPM_BUILD_ROOT%{_unitdir}/ceph-disk@.service
   install -m 0755 -D systemd/ceph $RPM_BUILD_ROOT%{_sbindir}/rcceph
 %else
@@ -786,6 +807,10 @@ rm -rf $RPM_BUILD_ROOT
 %{_unitdir}/ceph-radosgw@.service
 %{_unitdir}/ceph-disk@.service
 %{_unitdir}/ceph.target
+%{_unitdir}/ceph-osd.target
+%{_unitdir}/ceph-mon.target
+%{_unitdir}/ceph-mds.target
+%{_unitdir}/ceph-radosgw.target
 %else
 %{_initrddir}/ceph
 %endif
@@ -939,7 +964,7 @@ exit 0
 
 %post -n ceph-common
 %if 0%{?_with_systemd}
-systemd-tmpfiles --create --prefix=/run/ceph
+%tmpfiles_create %{_tmpfilesdir}/ceph-common.conf
 %endif
 
 %postun -n ceph-common
@@ -967,6 +992,12 @@ fi
 %{_mandir}/man8/rbd-fuse.8*
 
 #################################################################################
+%files -n rbd-nbd
+%defattr(-,root,root,-)
+%{_bindir}/rbd-nbd
+%{_mandir}/man8/rbd-nbd.8*
+
+#################################################################################
 %files radosgw
 %defattr(-,root,root,-)
 %{_bindir}/radosgw
@@ -1057,6 +1088,7 @@ fi
 %{_includedir}/rados/librados.h
 %{_includedir}/rados/librados.hpp
 %{_includedir}/rados/buffer.h
+%{_includedir}/rados/buffer_fwd.h
 %{_includedir}/rados/page.h
 %{_includedir}/rados/crc32c.h
 %{_includedir}/rados/rados_types.h
@@ -1122,7 +1154,8 @@ ln -sf %{_libdir}/librbd.so.1 /usr/lib64/qemu/librbd.so.1
 #################################################################################
 %files -n python-rbd
 %defattr(-,root,root,-)
-%{python_sitelib}/rbd.py*
+%{python_sitearch}/rbd.so
+%{python_sitearch}/rbd-*.egg-info
 
 #################################################################################
 %files -n libcephfs1
diff --git a/ceph.spec.in b/ceph.spec.in
index 2939fef..52c5c1d 100644
--- a/ceph.spec.in
+++ b/ceph.spec.in
@@ -43,6 +43,7 @@ restorecon -R /var/log/ceph > /dev/null 2>&1;
 # /var/run/ceph.
 %if 0%{?fedora} || 0%{?rhel} >= 7 || 0%{?suse_version} >= 1210
 %global _with_systemd 1
+%{!?tmpfiles_create: %global tmpfiles_create systemd-tmpfiles --create}
 %endif
 
 # LTTng-UST enabled on Fedora, RHEL 6, and SLES 12
@@ -106,6 +107,11 @@ BuildRequires:	boost-devel
 BuildRequires:  cmake
 BuildRequires:	cryptsetup
 BuildRequires:	fuse-devel
+%if 0%{?suse_version}
+BuildRequires:	python-Cython
+%else
+BuildRequires:	Cython
+%endif
 BuildRequires:	gdbm
 BuildRequires:	hdparm
 BuildRequires:	leveldb-devel > 1.2
@@ -121,6 +127,7 @@ BuildRequires:	parted
 BuildRequires:	perl
 BuildRequires:	pkgconfig
 BuildRequires:	python
+BuildRequires:	python-devel
 BuildRequires:	python-nose
 BuildRequires:	python-requests
 BuildRequires:	python-virtualenv
@@ -138,6 +145,7 @@ BuildRequires:	yasm
 %if 0%{?_with_systemd}
 BuildRequires:  pkgconfig(systemd)
 BuildRequires:	systemd-rpm-macros
+BuildRequires:	systemd
 %{?systemd_requires}
 %endif
 PreReq:		%fillup_prereq
@@ -253,6 +261,15 @@ Requires:	librbd1 = %{epoch}:%{version}-%{release}
 %description -n rbd-fuse
 FUSE based client to map Ceph rbd images to files
 
+%package -n rbd-nbd
+Summary:	Ceph RBD client base on NBD
+Group:		System Environment/Base
+Requires:	%{name}
+Requires:	librados2 = %{epoch}:%{version}-%{release}
+Requires:	librbd1 = %{epoch}:%{version}-%{release}
+%description -n rbd-nbd
+NBD based client to map Ceph rbd images to local device
+
 %package radosgw
 Summary:	Rados REST gateway
 Group:		Development/Libraries
@@ -628,6 +645,10 @@ install -m 0644 -D etc/sysconfig/ceph $RPM_BUILD_ROOT%{_localstatedir}/adm/fillu
   install -m 0644 -D systemd/ceph-mds@.service $RPM_BUILD_ROOT%{_unitdir}/ceph-mds@.service
   install -m 0644 -D systemd/ceph-radosgw@.service $RPM_BUILD_ROOT%{_unitdir}/ceph-radosgw@.service
   install -m 0644 -D systemd/ceph.target $RPM_BUILD_ROOT%{_unitdir}/ceph.target
+  install -m 0644 -D systemd/ceph-osd.target $RPM_BUILD_ROOT%{_unitdir}/ceph-osd.target
+  install -m 0644 -D systemd/ceph-mon.target $RPM_BUILD_ROOT%{_unitdir}/ceph-mon.target
+  install -m 0644 -D systemd/ceph-mds.target $RPM_BUILD_ROOT%{_unitdir}/ceph-mds.target
+  install -m 0644 -D systemd/ceph-radosgw.target $RPM_BUILD_ROOT%{_unitdir}/ceph-radosgw.target
   install -m 0644 -D systemd/ceph-disk@.service $RPM_BUILD_ROOT%{_unitdir}/ceph-disk@.service
   install -m 0755 -D systemd/ceph $RPM_BUILD_ROOT%{_sbindir}/rcceph
 %else
@@ -786,6 +807,10 @@ rm -rf $RPM_BUILD_ROOT
 %{_unitdir}/ceph-radosgw@.service
 %{_unitdir}/ceph-disk@.service
 %{_unitdir}/ceph.target
+%{_unitdir}/ceph-osd.target
+%{_unitdir}/ceph-mon.target
+%{_unitdir}/ceph-mds.target
+%{_unitdir}/ceph-radosgw.target
 %else
 %{_initrddir}/ceph
 %endif
@@ -939,7 +964,7 @@ exit 0
 
 %post -n ceph-common
 %if 0%{?_with_systemd}
-systemd-tmpfiles --create --prefix=/run/ceph
+%tmpfiles_create %{_tmpfilesdir}/ceph-common.conf
 %endif
 
 %postun -n ceph-common
@@ -967,6 +992,12 @@ fi
 %{_mandir}/man8/rbd-fuse.8*
 
 #################################################################################
+%files -n rbd-nbd
+%defattr(-,root,root,-)
+%{_bindir}/rbd-nbd
+%{_mandir}/man8/rbd-nbd.8*
+
+#################################################################################
 %files radosgw
 %defattr(-,root,root,-)
 %{_bindir}/radosgw
@@ -1057,6 +1088,7 @@ fi
 %{_includedir}/rados/librados.h
 %{_includedir}/rados/librados.hpp
 %{_includedir}/rados/buffer.h
+%{_includedir}/rados/buffer_fwd.h
 %{_includedir}/rados/page.h
 %{_includedir}/rados/crc32c.h
 %{_includedir}/rados/rados_types.h
@@ -1122,7 +1154,8 @@ ln -sf %{_libdir}/librbd.so.1 /usr/lib64/qemu/librbd.so.1
 #################################################################################
 %files -n python-rbd
 %defattr(-,root,root,-)
-%{python_sitelib}/rbd.py*
+%{python_sitearch}/rbd.so
+%{python_sitearch}/rbd-*.egg-info
 
 #################################################################################
 %files -n libcephfs1
diff --git a/configure b/configure
index 00b462c..b1c790f 100755
--- a/configure
+++ b/configure
@@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.69 for ceph 10.0.1.
+# Generated by GNU Autoconf 2.69 for ceph 10.0.2.
 #
 # Report bugs to <ceph-devel at vger.kernel.org>.
 #
@@ -590,8 +590,8 @@ MAKEFLAGS=
 # Identity of this package.
 PACKAGE_NAME='ceph'
 PACKAGE_TARNAME='ceph'
-PACKAGE_VERSION='10.0.1'
-PACKAGE_STRING='ceph 10.0.1'
+PACKAGE_VERSION='10.0.2'
+PACKAGE_STRING='ceph 10.0.2'
 PACKAGE_BUGREPORT='ceph-devel at vger.kernel.org'
 PACKAGE_URL=''
 
@@ -647,6 +647,8 @@ PYTHON_VERSION
 PYTHON
 WITH_BUILD_TESTS_FALSE
 WITH_BUILD_TESTS_TRUE
+WITH_EVENTFD_FALSE
+WITH_EVENTFD_TRUE
 systemd_unit_dir
 group_rgw
 user_rgw
@@ -761,6 +763,7 @@ PTHREAD_CC
 acx_pthread_config
 COMPILER_HAS_VTA_FALSE
 COMPILER_HAS_VTA_TRUE
+RDYNAMIC_FLAG
 WARN_ERROR_FORMAT_SECURITY
 WARN_IGNORED_QUALIFIERS
 WARN_TYPE_LIMITS
@@ -769,6 +772,7 @@ WITH_BETTER_YASM_ELF64_TRUE
 WITH_GOOD_YASM_ELF64_FALSE
 WITH_GOOD_YASM_ELF64_TRUE
 YASM_CHECK
+CYTHON_CHECK
 ENABLE_SERVER_FALSE
 ENABLE_SERVER_TRUE
 ENABLE_CLIENT_FALSE
@@ -785,6 +789,8 @@ WITH_SELINUX_FALSE
 WITH_SELINUX_TRUE
 WITH_CEPHFS_FALSE
 WITH_CEPHFS_TRUE
+WITH_CYTHON_FALSE
+WITH_CYTHON_TRUE
 WITH_RBD_FALSE
 WITH_RBD_TRUE
 WITH_RADOS_FALSE
@@ -792,6 +798,8 @@ WITH_RADOS_TRUE
 AM_CXXFLAGS
 CLANG_FALSE
 CLANG_TRUE
+AIX_FALSE
+AIX_TRUE
 SOLARIS_FALSE
 SOLARIS_TRUE
 DARWIN_FALSE
@@ -950,6 +958,7 @@ enable_dependency_tracking
 enable_silent_rules
 with_rados
 with_rbd
+with_cython
 with_cephfs
 with_radosgw
 with_selinux
@@ -988,6 +997,7 @@ with_systemd_libexec_dir
 with_rgw_user
 with_rgw_group
 with_systemd_unit_dir
+with_eventfd
 '
       ac_precious_vars='build_alias
 host_alias
@@ -1560,7 +1570,7 @@ if test "$ac_init_help" = "long"; then
   # Omit some internal or obsolete options to make the list less imposing.
   # This message is too long to be a string in the A/UX 3.1 sh.
   cat <<_ACEOF
-\`configure' configures ceph 10.0.1 to adapt to many kinds of systems.
+\`configure' configures ceph 10.0.2 to adapt to many kinds of systems.
 
 Usage: $0 [OPTION]... [VAR=VALUE]...
 
@@ -1631,7 +1641,7 @@ fi
 
 if test -n "$ac_init_help"; then
   case $ac_init_help in
-     short | recursive ) echo "Configuration of ceph 10.0.1:";;
+     short | recursive ) echo "Configuration of ceph 10.0.2:";;
    esac
   cat <<\_ACEOF
 
@@ -1670,6 +1680,7 @@ Optional Packages:
                         (or the compiler's sysroot if not specified).
   --with-rados            build with librados support
   --with-rbd              build rbd files
+  --with-cython           build python bindings for librbd
   --with-cephfs           build cephfs files
   --with-radosgw          build RADOS gateway
   --with-selinux          build SELinux policy
@@ -1708,6 +1719,7 @@ Optional Packages:
   --with-systemdsystemunitdir=DIR
                           systemd unit directory [SYSTEMD_UNIT_DIR] Defaults
                           to the correct value for debian /etc/systemd/system/
+  --without-eventfd       disable eventfd [default=no]
 
 Some influential environment variables:
   CXX         C++ compiler command
@@ -1817,7 +1829,7 @@ fi
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
   cat <<\_ACEOF
-ceph configure 10.0.1
+ceph configure 10.0.2
 generated by GNU Autoconf 2.69
 
 Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2893,7 +2905,7 @@ cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.
 
-It was created by ceph $as_me 10.0.1, which was
+It was created by ceph $as_me 10.0.2, which was
 generated by GNU Autoconf 2.69.  Invocation command line was
 
   $ $0 $@
@@ -16388,7 +16400,7 @@ fi
 
 # Define the identity of the package.
  PACKAGE='ceph'
- VERSION='10.0.1'
+ VERSION='10.0.2'
 
 
 cat >>confdefs.h <<_ACEOF
@@ -16910,6 +16922,9 @@ freebsd*)
 solaris*)
 	solaris="yes"
 	;;
+aix*)
+	aix="yes"
+	;;
 esac
  if test x"$linux" = x"yes"; then
   LINUX_TRUE=
@@ -16943,6 +16958,14 @@ else
   SOLARIS_FALSE=
 fi
 
+ if test x"$aix" = x"yes"; then
+  AIX_TRUE=
+  AIX_FALSE='#'
+else
+  AIX_TRUE='#'
+  AIX_FALSE=
+fi
+
 
 # Checks for programs.
 ac_ext=cpp
@@ -17321,6 +17344,23 @@ fi
 
 #AS_IF([test "$with_rbd" = "yes"], [AC_DEFINE([WITH_RADOS, WITH_RBD])])
 
+
+# Check whether --with-cython was given.
+if test "${with_cython+set}" = set; then :
+  withval=$with_cython;
+else
+  with_cython=yes
+fi
+
+ if test "$with_cython" = "yes"; then
+  WITH_CYTHON_TRUE=
+  WITH_CYTHON_FALSE='#'
+else
+  WITH_CYTHON_TRUE='#'
+  WITH_CYTHON_FALSE=
+fi
+
+
 # cephfs?
 # cephfs requires rados
 
@@ -17533,6 +17573,53 @@ fi
 
 #AS_IF([test "$enable_server" = "yes"], [AC_DEFINE([WITH_MON, WITH_OSD, WITH_MDS, ENABLE_SERVER])])
 
+# cython is required to build librbd python bindings
+if test x"$with_cython" = xyes; then
+    # Extract the first word of "cython", so it can be a program name with args.
+set dummy cython; ac_word=$2
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if ${ac_cv_prog_CYTHON_CHECK+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if test -n "$CYTHON_CHECK"; then
+  ac_cv_prog_CYTHON_CHECK="$CYTHON_CHECK" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+    for ac_exec_ext in '' $ac_executable_extensions; do
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
+    ac_cv_prog_CYTHON_CHECK="yes"
+    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+  done
+IFS=$as_save_IFS
+
+fi
+fi
+CYTHON_CHECK=$ac_cv_prog_CYTHON_CHECK
+if test -n "$CYTHON_CHECK"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CYTHON_CHECK" >&5
+$as_echo "$CYTHON_CHECK" >&6; }
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+
+    if test x"$CYTHON_CHECK" != xyes; then
+        { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
+$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+as_fn_error $? "cython not found
+See \`config.log' for more details" "$LINENO" 5; }
+    fi
+fi
+
 # cond-check if snappy-devel is installed, needed by leveldb that is need by server parts of the project
 if test "$enable_server" = "yes" -a \( "$with_osd" = "yes" -o "$with_mon" = "yes" \); then :
   { $as_echo "$as_me:${as_lineno-$LINENO}: checking for snappy_compress in -lsnappy" >&5
@@ -17867,6 +17954,47 @@ ac_compiler_gnu=$ac_cv_c_compiler_gnu
 
 }
 
+{
+	ac_ext=c
+ac_cpp='$CPP $CPPFLAGS'
+ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_c_compiler_gnu
+
+	my_cflags_save="$CFLAGS"
+	CFLAGS="$my_cflags_save -rdynamic"
+	{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether $CC accepts -rdynamic" >&5
+$as_echo_n "checking whether $CC accepts -rdynamic... " >&6; }
+	cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+int
+main ()
+{
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }; RDYNAMIC_FLAG="-rdynamic"
+
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+	CFLAGS="$my_cflags_save"
+	ac_ext=c
+ac_cpp='$CPP $CPPFLAGS'
+ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_c_compiler_gnu
+
+}
+
 # Check for compiler VTA support
 { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether C compiler accepts -fvar-tracking-assignments" >&5
 $as_echo_n "checking whether C compiler accepts -fvar-tracking-assignments... " >&6; }
@@ -19018,6 +19146,9 @@ See \`config.log' for more details" "$LINENO" 5; }
 fi
 
 
+$as_echo "#define HAVE_RES_NQUERY 1" >>confdefs.h
+
+
 KEYUTILS_LIB=""
 if test x"$linux" = x"yes"; then :
 
@@ -20565,7 +20696,7 @@ else
 JAVA_TEST=Test.java
 CLASS_TEST=Test.class
 cat << \EOF > $JAVA_TEST
-/* #line 20568 "configure" */
+/* #line 20699 "configure" */
 public class Test {
 }
 EOF
@@ -23488,6 +23619,7 @@ for ac_header in  \
 	sys/cdefs.h \
 	syslog.h \
 	utime.h \
+	execinfo.h
 
 do :
   as_ac_Header=`$as_echo "ac_cv_header_$ac_header" | $as_tr_sh`
@@ -23572,13 +23704,20 @@ fi
 
 
 # splice/tee
-ac_fn_c_check_func "$LINENO" "splice" "ac_cv_func_splice"
+case "${target_os}" in
+aix*)
+	# AIX splice() is something else
+	;;
+*)
+	ac_fn_c_check_func "$LINENO" "splice" "ac_cv_func_splice"
 if test "x$ac_cv_func_splice" = xyes; then :
 
 $as_echo "#define CEPH_HAVE_SPLICE /**/" >>confdefs.h
 
 fi
 
+	;;
+esac
 
 # F_SETPIPE_SZ in fcntl.h
 { $as_echo "$as_me:${as_lineno-$LINENO}: checking for F_SETPIPE_SZ in fcntl.h" >&5
@@ -24434,7 +24573,38 @@ else
 fi
 
 
+# Force not to use eventfd
+
+# Check whether --with-eventfd was given.
+if test "${with_eventfd+set}" = set; then :
+  withval=$with_eventfd;
+else
+  with_eventfd=yes
+fi
 
+if test "x$with_eventfd" != xno; then :
+  for ac_header in sys/eventfd.h
+do :
+  ac_fn_c_check_header_mongrel "$LINENO" "sys/eventfd.h" "ac_cv_header_sys_eventfd_h" "$ac_includes_default"
+if test "x$ac_cv_header_sys_eventfd_h" = xyes; then :
+  cat >>confdefs.h <<_ACEOF
+#define HAVE_SYS_EVENTFD_H 1
+_ACEOF
+
+$as_echo "#define HAVE_EVENTFD 1" >>confdefs.h
+
+fi
+
+done
+
+fi
+ if  test "$with_eventfd" = "yes" ; then
+  WITH_EVENTFD_TRUE=
+  WITH_EVENTFD_FALSE='#'
+else
+  WITH_EVENTFD_TRUE='#'
+  WITH_EVENTFD_FALSE=
+fi
 
 
 # Checks for typedefs, structures, and compiler characteristics.
@@ -25024,6 +25194,10 @@ if test -z "${SOLARIS_TRUE}" && test -z "${SOLARIS_FALSE}"; then
   as_fn_error $? "conditional \"SOLARIS\" was never defined.
 Usually this means the macro was only invoked conditionally." "$LINENO" 5
 fi
+if test -z "${AIX_TRUE}" && test -z "${AIX_FALSE}"; then
+  as_fn_error $? "conditional \"AIX\" was never defined.
+Usually this means the macro was only invoked conditionally." "$LINENO" 5
+fi
 if test -z "${CLANG_TRUE}" && test -z "${CLANG_FALSE}"; then
   as_fn_error $? "conditional \"CLANG\" was never defined.
 Usually this means the macro was only invoked conditionally." "$LINENO" 5
@@ -25036,6 +25210,10 @@ if test -z "${WITH_RBD_TRUE}" && test -z "${WITH_RBD_FALSE}"; then
   as_fn_error $? "conditional \"WITH_RBD\" was never defined.
 Usually this means the macro was only invoked conditionally." "$LINENO" 5
 fi
+if test -z "${WITH_CYTHON_TRUE}" && test -z "${WITH_CYTHON_FALSE}"; then
+  as_fn_error $? "conditional \"WITH_CYTHON\" was never defined.
+Usually this means the macro was only invoked conditionally." "$LINENO" 5
+fi
 if test -z "${WITH_CEPHFS_TRUE}" && test -z "${WITH_CEPHFS_FALSE}"; then
   as_fn_error $? "conditional \"WITH_CEPHFS\" was never defined.
 Usually this means the macro was only invoked conditionally." "$LINENO" 5
@@ -25196,6 +25374,10 @@ if test -z "${VALGRIND_ENABLED_TRUE}" && test -z "${VALGRIND_ENABLED_FALSE}"; th
   as_fn_error $? "conditional \"VALGRIND_ENABLED\" was never defined.
 Usually this means the macro was only invoked conditionally." "$LINENO" 5
 fi
+if test -z "${WITH_EVENTFD_TRUE}" && test -z "${WITH_EVENTFD_FALSE}"; then
+  as_fn_error $? "conditional \"WITH_EVENTFD\" was never defined.
+Usually this means the macro was only invoked conditionally." "$LINENO" 5
+fi
 if test -z "${WITH_BUILD_TESTS_TRUE}" && test -z "${WITH_BUILD_TESTS_FALSE}"; then
   as_fn_error $? "conditional \"WITH_BUILD_TESTS\" was never defined.
 Usually this means the macro was only invoked conditionally." "$LINENO" 5
@@ -25597,7 +25779,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by ceph $as_me 10.0.1, which was
+This file was extended by ceph $as_me 10.0.2, which was
 generated by GNU Autoconf 2.69.  Invocation command line was
 
   CONFIG_FILES    = $CONFIG_FILES
@@ -25663,7 +25845,7 @@ _ACEOF
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
 ac_cs_version="\\
-ceph config.status 10.0.1
+ceph config.status 10.0.2
 configured by $0, generated by GNU Autoconf 2.69,
   with options \\"\$ac_cs_config\\"
 
diff --git a/configure.ac b/configure.ac
index 5ba7d42..f391a4d 100644
--- a/configure.ac
+++ b/configure.ac
@@ -8,7 +8,7 @@ AC_PREREQ(2.59)
 # VERSION define is not used by the code.  It gets a version string
 # from 'git describe'; see src/ceph_ver.[ch]
 
-AC_INIT([ceph], [10.0.1], [ceph-devel at vger.kernel.org])
+AC_INIT([ceph], [10.0.2], [ceph-devel at vger.kernel.org])
 
 AX_CXX_COMPILE_STDCXX_11(, mandatory)
 
@@ -70,11 +70,15 @@ freebsd*)
 solaris*)
 	solaris="yes"
 	;;
+aix*)
+	aix="yes"
+	;;
 esac
 AM_CONDITIONAL(LINUX, test x"$linux" = x"yes")
 AM_CONDITIONAL(FREEBSD, test x"$freebsd" = x"yes")
 AM_CONDITIONAL(DARWIN, test x"$darwin" = x"yes")
 AM_CONDITIONAL(SOLARIS, test x"$solaris" = x"yes")
+AM_CONDITIONAL(AIX, test x"$aix" = x"yes")
 
 # Checks for programs.
 AC_PROG_CXX
@@ -121,6 +125,12 @@ AC_ARG_WITH([rbd],
 AM_CONDITIONAL(WITH_RBD, test "$with_rbd" = "yes")
 #AS_IF([test "$with_rbd" = "yes"], [AC_DEFINE([WITH_RADOS, WITH_RBD])])
 
+AC_ARG_WITH([cython],
+	[AS_HELP_STRING([--with-cython], [build python bindings for librbd])],
+	[],
+	[with_cython=yes])
+AM_CONDITIONAL(WITH_CYTHON, test "$with_cython" = "yes")
+
 # cephfs?
 # cephfs requires rados
 AC_ARG_WITH([cephfs],
@@ -197,6 +207,14 @@ AC_ARG_ENABLE([server],
 AM_CONDITIONAL(ENABLE_SERVER, test "$enable_server" = "yes")
 #AS_IF([test "$enable_server" = "yes"], [AC_DEFINE([WITH_MON, WITH_OSD, WITH_MDS, ENABLE_SERVER])])
 
+# cython is required to build librbd python bindings
+if test x"$with_cython" = xyes; then
+    AC_CHECK_PROG(CYTHON_CHECK, cython, yes)
+    if test x"$CYTHON_CHECK" != xyes; then
+        AC_MSG_FAILURE([cython not found])
+    fi
+fi
+
 # cond-check if snappy-devel is installed, needed by leveldb that is need by server parts of the project
 AS_IF([test "$enable_server" = "yes" -a \( "$with_osd" = "yes" -o "$with_mon" = "yes" \)],
 	[AC_CHECK_LIB([snappy], [snappy_compress], [true], [AC_MSG_FAILURE([libsnappy not found])])])
@@ -266,6 +284,8 @@ AC_CHECK_CC_FLAG([-Wtype-limits], [WARN_TYPE_LIMITS])
 AC_CHECK_CC_FLAG([-Wignored-qualifiers], [WARN_IGNORED_QUALIFIERS])
 AC_CHECK_CC_FLAG([-Werror=format-security], [WARN_ERROR_FORMAT_SECURITY])
 
+AC_CHECK_CC_FLAG([-rdynamic], [RDYNAMIC_FLAG])
+
 # Check for compiler VTA support
 AX_CHECK_COMPILE_FLAG([-fvar-tracking-assignments], [HAS_VTA_SUPPORT=1], [HAS_VTA_SUPPORT=0])
 AM_CONDITIONAL(COMPILER_HAS_VTA, [test "$HAS_VTA_SUPPORT" = 1])
@@ -342,6 +362,7 @@ if test x"$resolv_libs" != "xok"; then
   fi
 fi
 AC_SUBST([RESOLV_LIBS])
+AC_DEFINE(HAVE_RES_NQUERY, 1, [Define if you have res_nquery])
 
 dnl check for libkeyutils on linux
 KEYUTILS_LIB=""
@@ -963,6 +984,7 @@ AC_CHECK_HEADERS([ \
 	sys/cdefs.h \
 	syslog.h \
 	utime.h \
+	execinfo.h
 ])
 
 # name_to_handle_at
@@ -1000,9 +1022,16 @@ AC_CHECK_MEMBER([struct stat.st_mtimespec.tv_nsec],
     [Define if you have struct stat.st_mtimespec.tv_nsec])])
 
 # splice/tee
-AC_CHECK_FUNC([splice],
-	[AC_DEFINE([CEPH_HAVE_SPLICE], [], [splice(2) is supported])],
-	[])
+case "${target_os}" in
+aix*)
+	# AIX splice() is something else
+	;;
+*)
+	AC_CHECK_FUNC([splice],
+		[AC_DEFINE([CEPH_HAVE_SPLICE], [], [splice(2) is supported])],
+		[])
+	;;
+esac
 
 # F_SETPIPE_SZ in fcntl.h
 AC_MSG_CHECKING([for F_SETPIPE_SZ in fcntl.h])
@@ -1294,8 +1323,15 @@ AC_ARG_WITH(
     ]
 )
 
-
-
+# Force not to use eventfd
+AC_ARG_WITH([eventfd],
+            [AS_HELP_STRING([--without-eventfd], [disable eventfd [default=no]])],
+            ,
+            [with_eventfd=yes])
+AS_IF([test "x$with_eventfd" != xno],
+    [AC_CHECK_HEADERS(sys/eventfd.h,
+                     [AC_DEFINE(HAVE_EVENTFD, 1, [Have eventfd extension.])])])
+AM_CONDITIONAL(WITH_EVENTFD, [ test "$with_eventfd" = "yes" ])
 
 # Checks for typedefs, structures, and compiler characteristics.
 #AC_HEADER_STDBOOL
diff --git a/doc/Makefile.am b/doc/Makefile.am
index 344bd89..4b15b9d 100644
--- a/doc/Makefile.am
+++ b/doc/Makefile.am
@@ -27,6 +27,7 @@ EXTRA_DIST = \
 	man/8/radosgw.rst	\
 	man/8/rados.rst	\
 	man/8/rbd-fuse.rst	\
+	man/8/rbd-nbd.rst	\
 	man/8/rbd-replay-many.rst	\
 	man/8/rbd-replay-prep.rst	\
 	man/8/rbd-replay.rst	\
diff --git a/doc/Makefile.in b/doc/Makefile.in
index 950177c..13fb9a0 100644
--- a/doc/Makefile.in
+++ b/doc/Makefile.in
@@ -156,6 +156,7 @@ CXXCPP = @CXXCPP@
 CXXDEPMODE = @CXXDEPMODE@
 CXXFLAGS = @CXXFLAGS@
 CYGPATH_W = @CYGPATH_W@
+CYTHON_CHECK = @CYTHON_CHECK@
 DEFS = @DEFS@
 DEPDIR = @DEPDIR@
 DLLTOOL = @DLLTOOL@
@@ -242,6 +243,7 @@ PYTHON_PLATFORM = @PYTHON_PLATFORM@
 PYTHON_PREFIX = @PYTHON_PREFIX@
 PYTHON_VERSION = @PYTHON_VERSION@
 RANLIB = @RANLIB@
+RDYNAMIC_FLAG = @RDYNAMIC_FLAG@
 RESOLV_LIBS = @RESOLV_LIBS@
 RPM_RELEASE = @RPM_RELEASE@
 SED = @SED@
@@ -351,6 +353,7 @@ EXTRA_DIST = \
 	man/8/radosgw.rst	\
 	man/8/rados.rst	\
 	man/8/rbd-fuse.rst	\
+	man/8/rbd-nbd.rst	\
 	man/8/rbd-replay-many.rst	\
 	man/8/rbd-replay-prep.rst	\
 	man/8/rbd-replay.rst	\
diff --git a/doc/man/8/rbd-nbd.rst b/doc/man/8/rbd-nbd.rst
new file mode 100644
index 0000000..635b343
--- /dev/null
+++ b/doc/man/8/rbd-nbd.rst
@@ -0,0 +1,55 @@
+:orphan:
+
+=========================================
+ rbd-nbd -- map rbd images to nbd device
+=========================================
+
+.. program:: rbd-nbd
+
+Synopsis
+========
+
+| **rbd-nbd** [-c conf] [--nbds_max *limit*] [--read-only] [--device *nbd device*] map *image-spec* | *snap-spec*
+| **rbd-nbd** unmap *nbd device*
+| **rbd-nbd** list-mapped
+
+Description
+===========
+
+**rbd-nbd** is a client for RADOS block device (rbd) images, similar to the rbd kernel module.
+It maps an rbd image to an nbd (Network Block Device) device, allowing it to be accessed
+as a regular local block device.
+
+Options
+=======
+
+.. option:: -c ceph.conf
+
+   Use *ceph.conf* configuration file instead of the default
+   ``/etc/ceph/ceph.conf`` to determine monitor addresses during startup.
+
+.. option:: --nbds_max *limit*
+
+   Override the corresponding parameter of the NBD kernel module when it is
+   loaded via modprobe; used to limit the number of nbd devices.
+
+Image and snap specs
+====================
+
+| *image-spec* is [*pool-name*]/*image-name*
+| *snap-spec*  is [*pool-name*]/*image-name*\ @\ *snap-name*
+
+The default for *pool-name* is "rbd".  If an image name contains a slash
+character ('/'), *pool-name* is required.
+
+Availability
+============
+
+**rbd-nbd** is part of Ceph, a massively scalable, open-source, distributed storage system. Please refer to
+the Ceph documentation at http://ceph.com/docs for more information.
+
+
+See also
+========
+
+:doc:`rbd <rbd>`\(8)
diff --git a/doc/man/8/rbd.rst b/doc/man/8/rbd.rst
index cd14180..633d89a 100644
--- a/doc/man/8/rbd.rst
+++ b/doc/man/8/rbd.rst
@@ -10,7 +10,7 @@ Synopsis
 ========
 
 | **rbd** [ -c *ceph.conf* ] [ -m *monaddr* ] [--cluster *cluster name*]
-  [ -p | --pool *pool* ] [--size *size* ] [ --order *bits* ] [ *command* ... ] 
+  [ -p | --pool *pool* ] [--size *size* ] [ --object-size *B/K/M* ] [ *command* ... ] 
 
 
 Description
@@ -69,10 +69,11 @@ Parameters
 
    Specifies the size (in M/G/T) of the new rbd image.
 
-.. option:: --order bits
+.. option:: --object-size B/K/M
+
+   Specifies the object size in B/K/M; it will be rounded up to the nearest power of two.
+   The default object size is 4 MB, the smallest is 4K and the maximum is 32M.
 
-   Specifies the object size expressed as a number of bits, such that
-   the object size is ``1 << order``. The default is 22 (4 MB).
 
 .. option:: --stripe-unit size-in-B/K/M
 
@@ -177,17 +178,17 @@ Commands
   require querying the OSDs for every potential object within the image.
 
 :command:`info` *image-spec* | *snap-spec*
-  Will dump information (such as size and order) about a specific rbd image.
+  Will dump information (such as size and object size) about a specific rbd image.
   If image is a clone, information about its parent is also displayed.
   If a snapshot is specified, whether it is protected is shown as well.
 
-:command:`create` (-s | --size *size-in-M/G/T*) [--image-format *format-id*] [--order *bits*] [--stripe-unit *size-in-B/K/M* --stripe-count *num*] [--image-feature *feature-name*]... [--image-shared] *image-spec*
+:command:`create` (-s | --size *size-in-M/G/T*) [--image-format *format-id*] [--object-size *B/K/M*] [--stripe-unit *size-in-B/K/M* --stripe-count *num*] [--image-feature *feature-name*]... [--image-shared] *image-spec*
   Will create a new rbd image. You must also specify the size via --size.  The
   --stripe-unit and --stripe-count arguments are optional, but must be used together.
 
-:command:`clone` [--order *bits*] [--stripe-unit *size-in-B/K/M* --stripe-count *num*] [--image-feature *feature-name*] [--image-shared] *parent-snap-spec* *child-image-spec*
+:command:`clone` [--object-size *B/K/M*] [--stripe-unit *size-in-B/K/M* --stripe-count *num*] [--image-feature *feature-name*] [--image-shared] *parent-snap-spec* *child-image-spec*
   Will create a clone (copy-on-write child) of the parent snapshot.
-  Object order will be identical to that of the parent image unless
+  Object size will be identical to that of the parent image unless
   specified. Size will be the same as the parent snapshot. The --stripe-unit
   and --stripe-count arguments are optional, but must be used together.
 
@@ -219,11 +220,11 @@ Commands
 :command:`export` (*image-spec* | *snap-spec*) [*dest-path*]
   Exports image to dest path (use - for stdout).
 
-:command:`import` [--image-format *format-id*] [--order *bits*] [--stripe-unit *size-in-B/K/M* --stripe-count *num*] [--image-feature *feature-name*]... [--image-shared] *src-path* [*image-spec*]
+:command:`import` [--image-format *format-id*] [--object-size *B/K/M*] [--stripe-unit *size-in-B/K/M* --stripe-count *num*] [--image-feature *feature-name*]... [--image-shared] *src-path* [*image-spec*]
   Creates a new image and imports its data from path (use - for
   stdin).  The import operation will try to create sparse rbd images 
   if possible.  For import from stdin, the sparsification unit is
-  the data block size of the destination image (1 << order).
+  the data block size of the destination image (object size).
 
   The --stripe-unit and --stripe-count arguments are optional, but must be
   used together.
@@ -258,7 +259,7 @@ Commands
 
 :command:`cp` (*src-image-spec* | *src-snap-spec*) *dest-image-spec*
   Copies the content of a src-image into the newly created dest-image.
-  dest-image will have the same size, order, and image format as src-image.
+  dest-image will have the same size, object size, and image format as src-image.
 
 :command:`mv` *src-image-spec* *dest-image-spec*
   Renames an image.  Note: rename across pools is not supported.
@@ -321,6 +322,15 @@ Commands
 :command:`showmapped`
   Show the rbd images that are mapped via the rbd kernel module.
 
+:command:`nbd map` [--device *device-path*] [--read-only] *image-spec* | *snap-spec*
+  Maps the specified image to a block device via the rbd-nbd tool.
+
+:command:`nbd unmap` *device-path*
+  Unmaps the block device that was mapped via the rbd-nbd tool.
+
+:command:`nbd list`
+  Show the list of used nbd devices via the rbd-nbd tool.
+
 :command:`status` *image-spec*
   Show the status of the image, including which clients have it open.
 
@@ -376,10 +386,10 @@ bottleneck when individual images get large or busy.
 
 The striping is controlled by three parameters:
 
-.. option:: order
+.. option:: object-size
 
-  The size of objects we stripe over is a power of two, specifically 2^[*order*] bytes.  The default
-  is 22, or 4 MB.
+  The size of the objects we stripe over is a power of two; the given value is rounded up to the nearest power of two.
+  The default object size is 4 MB, the smallest is 4K and the maximum is 32M.
 
 .. option:: stripe_unit
 
@@ -389,8 +399,8 @@ The striping is controlled by three parameters:
 .. option:: stripe_count
 
   After we write [*stripe_unit*] bytes to [*stripe_count*] objects, we loop back to the initial object
-  and write another stripe, until the object reaches its maximum size (as specified by [*order*].  At that
-  point, we move on to the next [*stripe_count*] objects.
+  and write another stripe, until the object reaches its maximum size.  At that point,
+  we move on to the next [*stripe_count*] objects.
 
 By default, [*stripe_unit*] is the same as the object size and [*stripe_count*] is 1.  Specifying a different
 [*stripe_unit*] requires that the STRIPINGV2 feature be supported (added in Ceph v0.53) and format 2 images be
@@ -461,7 +471,7 @@ To create a new rbd image that is 100 GB::
 
 To use a non-default object size (8 MB)::
 
-       rbd create mypool/myimage --size 102400 --order 23
+       rbd create mypool/myimage --size 102400 --object-size 8M
 
 To delete an rbd image (be careful!)::
 
diff --git a/man/Makefile-client.am b/man/Makefile-client.am
index 14200f5..f5e9063 100644
--- a/man/Makefile-client.am
+++ b/man/Makefile-client.am
@@ -16,6 +16,7 @@ endif
 if WITH_RBD
 dist_man_MANS += \
 	ceph-rbdnamer.8 \
+	rbd-nbd.8 \
 	rbd-replay.8 \
 	rbd-replay-many.8 \
 	rbd-replay-prep.8
diff --git a/man/Makefile.in b/man/Makefile.in
index 577ef31..f4568cb 100644
--- a/man/Makefile.in
+++ b/man/Makefile.in
@@ -96,6 +96,7 @@ DIST_COMMON = $(srcdir)/Makefile-client.am \
 
@ENABLE_CLIENT_TRUE@@WITH_MAN_PAGES_TRUE@@WITH_RBD_TRUE@am__append_3 = \
 @ENABLE_CLIENT_TRUE@@WITH_MAN_PAGES_TRUE@@WITH_RBD_TRUE@	ceph-rbdnamer.8 \
+@ENABLE_CLIENT_TRUE@@WITH_MAN_PAGES_TRUE@@WITH_RBD_TRUE@	rbd-nbd.8 \
 @ENABLE_CLIENT_TRUE@@WITH_MAN_PAGES_TRUE@@WITH_RBD_TRUE@	rbd-replay.8 \
 @ENABLE_CLIENT_TRUE@@WITH_MAN_PAGES_TRUE@@WITH_RBD_TRUE@	rbd-replay-many.8 \
 @ENABLE_CLIENT_TRUE@@WITH_MAN_PAGES_TRUE@@WITH_RBD_TRUE@	rbd-replay-prep.8
@@ -245,6 +246,7 @@ CXXCPP = @CXXCPP@
 CXXDEPMODE = @CXXDEPMODE@
 CXXFLAGS = @CXXFLAGS@
 CYGPATH_W = @CYGPATH_W@
+CYTHON_CHECK = @CYTHON_CHECK@
 DEFS = @DEFS@
 DEPDIR = @DEPDIR@
 DLLTOOL = @DLLTOOL@
@@ -331,6 +333,7 @@ PYTHON_PLATFORM = @PYTHON_PLATFORM@
 PYTHON_PREFIX = @PYTHON_PREFIX@
 PYTHON_VERSION = @PYTHON_VERSION@
 RANLIB = @RANLIB@
+RDYNAMIC_FLAG = @RDYNAMIC_FLAG@
 RESOLV_LIBS = @RESOLV_LIBS@
 RPM_RELEASE = @RPM_RELEASE@
 SED = @SED@
diff --git a/man/ceph-authtool.8 b/man/ceph-authtool.8
index e9e09e8..90b1a42 100644
--- a/man/ceph-authtool.8
+++ b/man/ceph-authtool.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CEPH-AUTHTOOL" "8" "December 14, 2015" "dev" "Ceph"
+.TH "CEPH-AUTHTOOL" "8" "January 13, 2016" "dev" "Ceph"
 .SH NAME
 ceph-authtool \- ceph keyring manipulation tool
 .
diff --git a/man/ceph-clsinfo.8 b/man/ceph-clsinfo.8
index 8791e10..feb5fa3 100644
--- a/man/ceph-clsinfo.8
+++ b/man/ceph-clsinfo.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CEPH-CLSINFO" "8" "December 14, 2015" "dev" "Ceph"
+.TH "CEPH-CLSINFO" "8" "January 13, 2016" "dev" "Ceph"
 .SH NAME
 ceph-clsinfo \- show class object information
 .
diff --git a/man/ceph-conf.8 b/man/ceph-conf.8
index e4675ae..ad73e90 100644
--- a/man/ceph-conf.8
+++ b/man/ceph-conf.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CEPH-CONF" "8" "December 14, 2015" "dev" "Ceph"
+.TH "CEPH-CONF" "8" "January 13, 2016" "dev" "Ceph"
 .SH NAME
 ceph-conf \- ceph conf file tool
 .
diff --git a/man/ceph-create-keys.8 b/man/ceph-create-keys.8
index 82ec005..a2a03b6 100644
--- a/man/ceph-create-keys.8
+++ b/man/ceph-create-keys.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CEPH-CREATE-KEYS" "8" "December 14, 2015" "dev" "Ceph"
+.TH "CEPH-CREATE-KEYS" "8" "January 13, 2016" "dev" "Ceph"
 .SH NAME
 ceph-create-keys \- ceph keyring generate tool
 .
diff --git a/man/ceph-debugpack.8 b/man/ceph-debugpack.8
index 6bb32d4..741ecf8 100644
--- a/man/ceph-debugpack.8
+++ b/man/ceph-debugpack.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CEPH-DEBUGPACK" "8" "December 14, 2015" "dev" "Ceph"
+.TH "CEPH-DEBUGPACK" "8" "January 13, 2016" "dev" "Ceph"
 .SH NAME
 ceph-debugpack \- ceph debug packer utility
 .
diff --git a/man/ceph-dencoder.8 b/man/ceph-dencoder.8
index 0becaa1..0db2fe8 100644
--- a/man/ceph-dencoder.8
+++ b/man/ceph-dencoder.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CEPH-DENCODER" "8" "December 14, 2015" "dev" "Ceph"
+.TH "CEPH-DENCODER" "8" "January 13, 2016" "dev" "Ceph"
 .SH NAME
 ceph-dencoder \- ceph encoder/decoder utility
 .
diff --git a/man/ceph-deploy.8 b/man/ceph-deploy.8
index 389d2b1..be37a19 100644
--- a/man/ceph-deploy.8
+++ b/man/ceph-deploy.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CEPH-DEPLOY" "8" "December 14, 2015" "dev" "Ceph"
+.TH "CEPH-DEPLOY" "8" "January 13, 2016" "dev" "Ceph"
 .SH NAME
 ceph-deploy \- Ceph deployment tool
 .
diff --git a/man/ceph-detect-init.8 b/man/ceph-detect-init.8
index e643f16..cba555f 100644
--- a/man/ceph-detect-init.8
+++ b/man/ceph-detect-init.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CEPH-DETECT-INIT" "8" "December 14, 2015" "dev" "Ceph"
+.TH "CEPH-DETECT-INIT" "8" "January 13, 2016" "dev" "Ceph"
 .SH NAME
 ceph-detect-init \- display the init system Ceph should use
 .
diff --git a/man/ceph-disk.8 b/man/ceph-disk.8
index 267237e..cac4ef2 100644
--- a/man/ceph-disk.8
+++ b/man/ceph-disk.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CEPH-DISK" "8" "December 14, 2015" "dev" "Ceph"
+.TH "CEPH-DISK" "8" "January 13, 2016" "dev" "Ceph"
 .SH NAME
 ceph-disk \- Ceph disk utility for OSD
 .
diff --git a/man/ceph-fuse.8 b/man/ceph-fuse.8
index cdaaa3c..db7aa5f 100644
--- a/man/ceph-fuse.8
+++ b/man/ceph-fuse.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CEPH-FUSE" "8" "December 14, 2015" "dev" "Ceph"
+.TH "CEPH-FUSE" "8" "January 13, 2016" "dev" "Ceph"
 .SH NAME
 ceph-fuse \- FUSE-based client for ceph
 .
diff --git a/man/ceph-mds.8 b/man/ceph-mds.8
index 1a644e1..3409071 100644
--- a/man/ceph-mds.8
+++ b/man/ceph-mds.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CEPH-MDS" "8" "December 14, 2015" "dev" "Ceph"
+.TH "CEPH-MDS" "8" "January 13, 2016" "dev" "Ceph"
 .SH NAME
 ceph-mds \- ceph metadata server daemon
 .
diff --git a/man/ceph-mon.8 b/man/ceph-mon.8
index 1c67e88..96db1d5 100644
--- a/man/ceph-mon.8
+++ b/man/ceph-mon.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CEPH-MON" "8" "December 14, 2015" "dev" "Ceph"
+.TH "CEPH-MON" "8" "January 13, 2016" "dev" "Ceph"
 .SH NAME
 ceph-mon \- ceph monitor daemon
 .
diff --git a/man/ceph-osd.8 b/man/ceph-osd.8
index 8265af1..b71217e 100644
--- a/man/ceph-osd.8
+++ b/man/ceph-osd.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CEPH-OSD" "8" "December 14, 2015" "dev" "Ceph"
+.TH "CEPH-OSD" "8" "January 13, 2016" "dev" "Ceph"
 .SH NAME
 ceph-osd \- ceph object storage daemon
 .
diff --git a/man/ceph-post-file.8 b/man/ceph-post-file.8
index 7031b47..b96c348 100644
--- a/man/ceph-post-file.8
+++ b/man/ceph-post-file.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CEPH-POST-FILE" "8" "December 14, 2015" "dev" "Ceph"
+.TH "CEPH-POST-FILE" "8" "January 13, 2016" "dev" "Ceph"
 .SH NAME
 ceph-post-file \- post files for ceph developers
 .
diff --git a/man/ceph-rbdnamer.8 b/man/ceph-rbdnamer.8
index b190aef..6f4e507 100644
--- a/man/ceph-rbdnamer.8
+++ b/man/ceph-rbdnamer.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CEPH-RBDNAMER" "8" "December 14, 2015" "dev" "Ceph"
+.TH "CEPH-RBDNAMER" "8" "January 13, 2016" "dev" "Ceph"
 .SH NAME
 ceph-rbdnamer \- udev helper to name RBD devices
 .
diff --git a/man/ceph-rest-api.8 b/man/ceph-rest-api.8
index db7f2f5..04483b7 100644
--- a/man/ceph-rest-api.8
+++ b/man/ceph-rest-api.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CEPH-REST-API" "8" "December 14, 2015" "dev" "Ceph"
+.TH "CEPH-REST-API" "8" "January 13, 2016" "dev" "Ceph"
 .SH NAME
 ceph-rest-api \- ceph RESTlike administration server
 .
diff --git a/man/ceph-run.8 b/man/ceph-run.8
index a8bbd7d..9d08f59 100644
--- a/man/ceph-run.8
+++ b/man/ceph-run.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CEPH-RUN" "8" "December 14, 2015" "dev" "Ceph"
+.TH "CEPH-RUN" "8" "January 13, 2016" "dev" "Ceph"
 .SH NAME
 ceph-run \- restart daemon on core dump
 .
diff --git a/man/ceph-syn.8 b/man/ceph-syn.8
index b9e53d3..c544864 100644
--- a/man/ceph-syn.8
+++ b/man/ceph-syn.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CEPH-SYN" "8" "December 14, 2015" "dev" "Ceph"
+.TH "CEPH-SYN" "8" "January 13, 2016" "dev" "Ceph"
 .SH NAME
 ceph-syn \- ceph synthetic workload generator
 .
diff --git a/man/ceph.8 b/man/ceph.8
index 4c15931..09b14a7 100644
--- a/man/ceph.8
+++ b/man/ceph.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CEPH" "8" "December 14, 2015" "dev" "Ceph"
+.TH "CEPH" "8" "January 13, 2016" "dev" "Ceph"
 .SH NAME
 ceph \- ceph administration tool
 .
diff --git a/man/cephfs.8 b/man/cephfs.8
index c0624ec..36c14d7 100644
--- a/man/cephfs.8
+++ b/man/cephfs.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CEPHFS" "8" "December 14, 2015" "dev" "Ceph"
+.TH "CEPHFS" "8" "January 13, 2016" "dev" "Ceph"
 .SH NAME
 cephfs \- ceph file system options utility
 .
diff --git a/man/crushtool.8 b/man/crushtool.8
index a60e430..9544c49 100644
--- a/man/crushtool.8
+++ b/man/crushtool.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CRUSHTOOL" "8" "December 14, 2015" "dev" "Ceph"
+.TH "CRUSHTOOL" "8" "January 13, 2016" "dev" "Ceph"
 .SH NAME
 crushtool \- CRUSH map manipulation tool
 .
diff --git a/man/librados-config.8 b/man/librados-config.8
index fb153bb..c016749 100644
--- a/man/librados-config.8
+++ b/man/librados-config.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "LIBRADOS-CONFIG" "8" "December 14, 2015" "dev" "Ceph"
+.TH "LIBRADOS-CONFIG" "8" "January 13, 2016" "dev" "Ceph"
 .SH NAME
 librados-config \- display information about librados
 .
diff --git a/man/monmaptool.8 b/man/monmaptool.8
index b6a8334..36bf3e4 100644
--- a/man/monmaptool.8
+++ b/man/monmaptool.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "MONMAPTOOL" "8" "December 14, 2015" "dev" "Ceph"
+.TH "MONMAPTOOL" "8" "January 13, 2016" "dev" "Ceph"
 .SH NAME
 monmaptool \- ceph monitor cluster map manipulation tool
 .
diff --git a/man/mount.ceph.8 b/man/mount.ceph.8
index 3a96d7a..c31ec9a 100644
--- a/man/mount.ceph.8
+++ b/man/mount.ceph.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "MOUNT.CEPH" "8" "December 14, 2015" "dev" "Ceph"
+.TH "MOUNT.CEPH" "8" "January 13, 2016" "dev" "Ceph"
 .SH NAME
 mount.ceph \- mount a ceph file system
 .
diff --git a/man/osdmaptool.8 b/man/osdmaptool.8
index a6708bc..0c866e2 100644
--- a/man/osdmaptool.8
+++ b/man/osdmaptool.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "OSDMAPTOOL" "8" "December 14, 2015" "dev" "Ceph"
+.TH "OSDMAPTOOL" "8" "January 13, 2016" "dev" "Ceph"
 .SH NAME
 osdmaptool \- ceph osd cluster map manipulation tool
 .
diff --git a/man/rados.8 b/man/rados.8
index 83fb1f0..cdf3ec0 100644
--- a/man/rados.8
+++ b/man/rados.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "RADOS" "8" "December 14, 2015" "dev" "Ceph"
+.TH "RADOS" "8" "January 13, 2016" "dev" "Ceph"
 .SH NAME
 rados \- rados object storage utility
 .
diff --git a/man/radosgw-admin.8 b/man/radosgw-admin.8
index b50171d..a9d2dcf 100644
--- a/man/radosgw-admin.8
+++ b/man/radosgw-admin.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "RADOSGW-ADMIN" "8" "December 14, 2015" "dev" "Ceph"
+.TH "RADOSGW-ADMIN" "8" "January 13, 2016" "dev" "Ceph"
 .SH NAME
 radosgw-admin \- rados REST gateway user administration utility
 .
diff --git a/man/radosgw.8 b/man/radosgw.8
index a02e218..b7763cd 100644
--- a/man/radosgw.8
+++ b/man/radosgw.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "RADOSGW" "8" "December 14, 2015" "dev" "Ceph"
+.TH "RADOSGW" "8" "January 13, 2016" "dev" "Ceph"
 .SH NAME
 radosgw \- rados REST gateway
 .
diff --git a/man/rbd-fuse.8 b/man/rbd-fuse.8
index 4cab0d3..142c1f4 100644
--- a/man/rbd-fuse.8
+++ b/man/rbd-fuse.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "RBD-FUSE" "8" "December 14, 2015" "dev" "Ceph"
+.TH "RBD-FUSE" "8" "January 13, 2016" "dev" "Ceph"
 .SH NAME
 rbd-fuse \- expose rbd images as files
 .
diff --git a/man/rbd-fuse.8 b/man/rbd-nbd.8
similarity index 54%
copy from man/rbd-fuse.8
copy to man/rbd-nbd.8
index 4cab0d3..b87e75f 100644
--- a/man/rbd-fuse.8
+++ b/man/rbd-nbd.8
@@ -1,8 +1,8 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "RBD-FUSE" "8" "December 14, 2015" "dev" "Ceph"
+.TH "RBD-NBD" "8" "January 13, 2016" "dev" "Ceph"
 .SH NAME
-rbd-fuse \- expose rbd images as files
+rbd-nbd \- map rbd images to nbd device
 .
 .nr rst2man-indent-level 0
 .
@@ -32,32 +32,17 @@ level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
 ..
 .SH SYNOPSIS
 .nf
-\fBrbd\-fuse\fP [ \-p pool ] [\-c conffile] \fImountpoint\fP [ \fIfuse options\fP ]
+\fBrbd\-nbd\fP [\-c conf] [\-\-nbds_max \fIlimit\fP] [\-\-read\-only] [\-\-device \fInbd device\fP] map \fIimage\-spec\fP | \fIsnap\-spec\fP
+\fBrbd\-nbd\fP unmap \fInbd device\fP
+\fBrbd\-nbd\fP list\-mapped
 .fi
 .sp
 .SH DESCRIPTION
 .sp
-\fBrbd\-fuse\fP is a FUSE (File system in USErspace) client for RADOS
-block device (rbd) images.  Given a pool containing rbd images,
-it will mount a userspace filesystem allowing access to those images
-as regular files at \fBmountpoint\fP\&.
-.sp
-The file system can be unmounted with:
-.INDENT 0.0
-.INDENT 3.5
-.sp
-.nf
-.ft C
-fusermount \-u mountpoint
-.ft P
-.fi
-.UNINDENT
-.UNINDENT
-.sp
-or by sending \fBSIGINT\fP to the \fBrbd\-fuse\fP process.
+\fBrbd\-nbd\fP is a client for RADOS block device (rbd) images like rbd kernel module.
+It will map a rbd image to a nbd (Network Block Device) device, allowing access it
+as regular local block device.
 .SH OPTIONS
-.sp
-Any options not recognized by rbd\-fuse will be passed on to libfuse.
 .INDENT 0.0
 .TP
 .B \-c ceph.conf
@@ -66,16 +51,25 @@ Use \fIceph.conf\fP configuration file instead of the default
 .UNINDENT
 .INDENT 0.0
 .TP
-.B \-p pool
-Use \fIpool\fP as the pool to search for rbd images.  Default is \fBrbd\fP\&.
+.B \-\-nbds_max *limit*
+Override the parameter of NBD kernel module when modprobe, used to
+limit the count of nbd device.
 .UNINDENT
+.SH IMAGE AND SNAP SPECS
+.nf
+\fIimage\-spec\fP is [\fIpool\-name\fP]/\fIimage\-name\fP
+\fIsnap\-spec\fP  is [\fIpool\-name\fP]/\fIimage\-name\fP@\fIsnap\-name\fP
+.fi
+.sp
+.sp
+The default for \fIpool\-name\fP is "rbd".  If an image name contains a slash
+character (\(aq/\(aq), \fIpool\-name\fP is required.
 .SH AVAILABILITY
 .sp
-\fBrbd\-fuse\fP is part of Ceph, a massively scalable, open\-source, distributed storage system. Please refer to
+\fBrbd\-nbd\fP is part of Ceph, a massively scalable, open\-source, distributed storage system. Please refer to
 the Ceph documentation at \fI\%http://ceph.com/docs\fP for more information.
 .SH SEE ALSO
 .sp
-fusermount(8),
 \fBrbd\fP(8)
 .SH COPYRIGHT
 2010-2014, Inktank Storage, Inc. and contributors. Licensed under Creative Commons BY-SA
diff --git a/man/rbd-replay-many.8 b/man/rbd-replay-many.8
index 840831f..7a8cf40 100644
--- a/man/rbd-replay-many.8
+++ b/man/rbd-replay-many.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "RBD-REPLAY-MANY" "8" "December 14, 2015" "dev" "Ceph"
+.TH "RBD-REPLAY-MANY" "8" "January 13, 2016" "dev" "Ceph"
 .SH NAME
 rbd-replay-many \- replay a rados block device (RBD) workload on several clients
 .
diff --git a/man/rbd-replay-prep.8 b/man/rbd-replay-prep.8
index 91e63ab..46bab70 100644
--- a/man/rbd-replay-prep.8
+++ b/man/rbd-replay-prep.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "RBD-REPLAY-PREP" "8" "December 14, 2015" "dev" "Ceph"
+.TH "RBD-REPLAY-PREP" "8" "January 13, 2016" "dev" "Ceph"
 .SH NAME
 rbd-replay-prep \- prepare captured rados block device (RBD) workloads for replay
 .
diff --git a/man/rbd-replay.8 b/man/rbd-replay.8
index c007cfa..64b17dc 100644
--- a/man/rbd-replay.8
+++ b/man/rbd-replay.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "RBD-REPLAY" "8" "December 14, 2015" "dev" "Ceph"
+.TH "RBD-REPLAY" "8" "January 13, 2016" "dev" "Ceph"
 .SH NAME
 rbd-replay \- replay rados block device (RBD) workloads
 .
diff --git a/man/rbd.8 b/man/rbd.8
index 6258df6..7b3d9c4 100644
--- a/man/rbd.8
+++ b/man/rbd.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "RBD" "8" "December 14, 2015" "dev" "Ceph"
+.TH "RBD" "8" "January 13, 2016" "dev" "Ceph"
 .SH NAME
 rbd \- manage rados block device (RBD) images
 .
@@ -33,7 +33,7 @@ level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
 .SH SYNOPSIS
 .nf
 \fBrbd\fP [ \-c \fIceph.conf\fP ] [ \-m \fImonaddr\fP ] [\-\-cluster \fIcluster name\fP]
-[ \-p | \-\-pool \fIpool\fP ] [\-\-size \fIsize\fP ] [ \-\-order \fIbits\fP ] [ \fIcommand\fP ... ]
+[ \-p | \-\-pool \fIpool\fP ] [\-\-size \fIsize\fP ] [ \-\-object\-size \fIB/K/M\fP ] [ \fIcommand\fP ... ]
 .fi
 .sp
 .SH DESCRIPTION
@@ -95,9 +95,9 @@ Specifies the size (in M/G/T) of the new rbd image.
 .UNINDENT
 .INDENT 0.0
 .TP
-.B \-\-order bits
-Specifies the object size expressed as a number of bits, such that
-the object size is \fB1 << order\fP\&. The default is 22 (4 MB).
+.B \-\-object\-size B/K/M
+Specifies the object size in B/K/M, it will be rounded up the nearest power of two.
+The default object size is 4 MB, smallest is 4K and maximum is 32M.
 .UNINDENT
 .INDENT 0.0
 .TP
@@ -222,17 +222,17 @@ If the RBD fast\-diff feature isn\(aqt enabled on images, this operation will
 require querying the OSDs for every potential object within the image.
 .TP
 .B \fBinfo\fP \fIimage\-spec\fP | \fIsnap\-spec\fP
-Will dump information (such as size and order) about a specific rbd image.
+Will dump information (such as size and object size) about a specific rbd image.
 If image is a clone, information about its parent is also displayed.
 If a snapshot is specified, whether it is protected is shown as well.
 .TP
-.B \fBcreate\fP (\-s | \-\-size \fIsize\-in\-M/G/T\fP) [\-\-image\-format \fIformat\-id\fP] [\-\-order \fIbits\fP] [\-\-stripe\-unit \fIsize\-in\-B/K/M\fP \-\-stripe\-count \fInum\fP] [\-\-image\-feature \fIfeature\-name\fP]... [\-\-image\-shared] \fIimage\-spec\fP
+.B \fBcreate\fP (\-s | \-\-size \fIsize\-in\-M/G/T\fP) [\-\-image\-format \fIformat\-id\fP] [\-\-object\-size \fIB/K/M\fP] [\-\-stripe\-unit \fIsize\-in\-B/K/M\fP \-\-stripe\-count \fInum\fP] [\-\-image\-feature \fIfeature\-name\fP]... [\-\-image\-shared] \fIimage\-spec\fP
 Will create a new rbd image. You must also specify the size via \-\-size.  The
 \-\-stripe\-unit and \-\-stripe\-count arguments are optional, but must be used together.
 .TP
-.B \fBclone\fP [\-\-order \fIbits\fP] [\-\-stripe\-unit \fIsize\-in\-B/K/M\fP \-\-stripe\-count \fInum\fP] [\-\-image\-feature \fIfeature\-name\fP] [\-\-image\-shared] \fIparent\-snap\-spec\fP \fIchild\-image\-spec\fP
+.B \fBclone\fP [\-\-object\-size \fIB/K/M\fP] [\-\-stripe\-unit \fIsize\-in\-B/K/M\fP \-\-stripe\-count \fInum\fP] [\-\-image\-feature \fIfeature\-name\fP] [\-\-image\-shared] \fIparent\-snap\-spec\fP \fIchild\-image\-spec\fP
 Will create a clone (copy\-on\-write child) of the parent snapshot.
-Object order will be identical to that of the parent image unless
+Object size will be identical to that of the parent image unless
 specified. Size will be the same as the parent snapshot. The \-\-stripe\-unit
 and \-\-stripe\-count arguments are optional, but must be used together.
 .sp
@@ -264,11 +264,11 @@ snapshots, this fails and nothing is deleted.
 .B \fBexport\fP (\fIimage\-spec\fP | \fIsnap\-spec\fP) [\fIdest\-path\fP]
 Exports image to dest path (use \- for stdout).
 .TP
-.B \fBimport\fP [\-\-image\-format \fIformat\-id\fP] [\-\-order \fIbits\fP] [\-\-stripe\-unit \fIsize\-in\-B/K/M\fP \-\-stripe\-count \fInum\fP] [\-\-image\-feature \fIfeature\-name\fP]... [\-\-image\-shared] \fIsrc\-path\fP [\fIimage\-spec\fP]
+.B \fBimport\fP [\-\-image\-format \fIformat\-id\fP] [\-\-object\-size \fIB/K/M\fP] [\-\-stripe\-unit \fIsize\-in\-B/K/M\fP \-\-stripe\-count \fInum\fP] [\-\-image\-feature \fIfeature\-name\fP]... [\-\-image\-shared] \fIsrc\-path\fP [\fIimage\-spec\fP]
 Creates a new image and imports its data from path (use \- for
 stdin).  The import operation will try to create sparse rbd images
 if possible.  For import from stdin, the sparsification unit is
-the data block size of the destination image (1 << order).
+the data block size of the destination image (object size).
 .sp
 The \-\-stripe\-unit and \-\-stripe\-count arguments are optional, but must be
 used together.
@@ -303,7 +303,7 @@ whether the region is known to be zeros or may contain other data.
 .TP
 .B \fBcp\fP (\fIsrc\-image\-spec\fP | \fIsrc\-snap\-spec\fP) \fIdest\-image\-spec\fP
 Copies the content of a src\-image into the newly created dest\-image.
-dest\-image will have the same size, order, and image format as src\-image.
+dest\-image will have the same size, object size, and image format as src\-image.
 .TP
 .B \fBmv\fP \fIsrc\-image\-spec\fP \fIdest\-image\-spec\fP
 Renames an image.  Note: rename across pools is not supported.
@@ -366,6 +366,15 @@ Unmaps the block device that was mapped via the rbd kernel module.
 .B \fBshowmapped\fP
 Show the rbd images that are mapped via the rbd kernel module.
 .TP
+.B \fBnbd map\fP [\-\-device \fIdevice\-path\fP] [\-\-read\-only] \fIimage\-spec\fP | \fIsnap\-spec\fP
+Maps the specified image to a block device via the rbd\-nbd tool.
+.TP
+.B \fBnbd unmap\fP \fIdevice\-path\fP
+Unmaps the block device that was mapped via the rbd\-nbd tool.
+.TP
+.B \fBnbd list\fP
+Show the list of used nbd devices via the rbd\-nbd tool.
+.TP
 .B \fBstatus\fP \fIimage\-spec\fP
 Show the status of the image, including which clients have it open.
 .TP
@@ -421,9 +430,9 @@ bottleneck when individual images get large or busy.
 The striping is controlled by three parameters:
 .INDENT 0.0
 .TP
-.B order
-The size of objects we stripe over is a power of two, specifically 2^[\fIorder\fP] bytes.  The default
-is 22, or 4 MB.
+.B object\-size
+The size of objects we stripe over is a power of two. It will be rounded up the nearest power of two.
+The default object size is 4 MB, smallest is 4K and maximum is 32M.
 .UNINDENT
 .INDENT 0.0
 .TP
@@ -435,8 +444,8 @@ to the next object.
 .TP
 .B stripe_count
 After we write [\fIstripe_unit\fP] bytes to [\fIstripe_count\fP] objects, we loop back to the initial object
-and write another stripe, until the object reaches its maximum size (as specified by [\fIorder\fP].  At that
-point, we move on to the next [\fIstripe_count\fP] objects.
+and write another stripe, until the object reaches its maximum size.  At that point,
+we move on to the next [\fIstripe_count\fP] objects.
 .UNINDENT
 .sp
 By default, [\fIstripe_unit\fP] is the same as the object size and [\fIstripe_count\fP] is 1.  Specifying a different
@@ -518,7 +527,7 @@ To use a non\-default object size (8 MB):
 .sp
 .nf
 .ft C
-rbd create mypool/myimage \-\-size 102400 \-\-order 23
+rbd create mypool/myimage \-\-size 102400 \-\-object\-size 8M
 .ft P
 .fi
 .UNINDENT
diff --git a/selinux/Makefile.in b/selinux/Makefile.in
index 36a629e..e628a89 100644
--- a/selinux/Makefile.in
+++ b/selinux/Makefile.in
@@ -156,6 +156,7 @@ CXXCPP = @CXXCPP@
 CXXDEPMODE = @CXXDEPMODE@
 CXXFLAGS = @CXXFLAGS@
 CYGPATH_W = @CYGPATH_W@
+CYTHON_CHECK = @CYTHON_CHECK@
 DEFS = @DEFS@
 DEPDIR = @DEPDIR@
 DLLTOOL = @DLLTOOL@
@@ -242,6 +243,7 @@ PYTHON_PLATFORM = @PYTHON_PLATFORM@
 PYTHON_PREFIX = @PYTHON_PREFIX@
 PYTHON_VERSION = @PYTHON_VERSION@
 RANLIB = @RANLIB@
+RDYNAMIC_FLAG = @RDYNAMIC_FLAG@
 RESOLV_LIBS = @RESOLV_LIBS@
 RPM_RELEASE = @RPM_RELEASE@
 SED = @SED@
diff --git a/src/.git_version b/src/.git_version
index f39fca9..3cc1b6c 100644
--- a/src/.git_version
+++ b/src/.git_version
@@ -1,2 +1,2 @@
-9180a926a4450179534bc419d306f423670174c9
-v10.0.1
+86764eaebe1eda943c59d7d784b893ec8b0c6ff9
+v10.0.2
diff --git a/src/Makefile-client.am b/src/Makefile-client.am
index ff7638b..aff21ae 100644
--- a/src/Makefile-client.am
+++ b/src/Makefile-client.am
@@ -50,8 +50,6 @@ bin_SCRIPTS += \
 	rbd-replay-many \
         rbdmap
 
-python_PYTHON += pybind/rbd.py
-
 libkrbd_la_SOURCES = krbd.cc
 libkrbd_la_LIBADD = $(LIBSECRET) $(LIBCOMMON) -lblkid -ludev
 if LINUX
diff --git a/src/Makefile-env.am b/src/Makefile-env.am
index e3b6935..3d8a252 100644
--- a/src/Makefile-env.am
+++ b/src/Makefile-env.am
@@ -57,6 +57,7 @@ HARDENING_CFLAGS = \
                    -g \
                    -pipe \
                    -Wall \
+                   -Wp,-U_FORTIFY_SOURCE \
                    -Wp,-D_FORTIFY_SOURCE=2 \
                    -fexceptions \
                    --param=ssp-buffer-size=4 \
@@ -118,28 +119,30 @@ AM_COMMON_CFLAGS = \
 	-fno-strict-aliasing \
 	-fsigned-char
 if !CLANG
-	AM_COMMON_CFLAGS += -rdynamic
+	AM_COMMON_CFLAGS += ${RDYNAMIC_FLAG}
 endif
 if SOLARIS
 	AM_COMMON_CFLAGS += -Wno-unused-local-typedefs
 endif
 
-AM_CFLAGS = $(AM_COMMON_CFLAGS) $(HARDENING_CFLAGS)
+AM_CFLAGS = $(AM_COMMON_CFLAGS)
+if LINUX
+AM_CFLAGS += $(HARDENING_CFLAGS)
+endif
 AM_CPPFLAGS = $(AM_COMMON_CPPFLAGS)
 AM_CXXFLAGS = \
 	@AM_CXXFLAGS@ \
 	$(AM_COMMON_CFLAGS) \
 	-ftemplate-depth-1024 \
 	-Wnon-virtual-dtor \
-	-Wno-invalid-offsetof $(HARDENING_CFLAGS)
+	-Wno-invalid-offsetof 
+if LINUX
+AM_CXXFLAGS += $(HARDENING_CFLAGS) 
+endif
 if !CLANG
 	AM_CXXFLAGS += -Wstrict-null-sentinel
 endif
 
-# solaris harding
-if SOLARIS
-	AM_CXXFLAGS += -lssp_nonshared
-endif
 
 # note: this is position dependant, it affects the -l options that
 # come after it on the command line. when you use ${AM_LDFLAGS} in
@@ -153,6 +156,9 @@ AM_LDFLAGS =
 if LINUX
 AM_LDFLAGS += -Wl,--as-needed $(HARDENING_LDFLAGS)
 endif
+if AIX
+AM_LDFLAGS += -Wl,-brtl 
+endif
 
 if USE_BOOST_SPIRIT_OLD_HDR
 AM_CXXFLAGS += -DUSE_BOOST_SPIRIT_OLD_HDR
diff --git a/src/Makefile.am b/src/Makefile.am
index 8085dce..cd24915 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -3,6 +3,11 @@ include Makefile-env.am
 SUBDIRS += ocf java
 DIST_SUBDIRS += gmock ocf java
 
+LOCAL_ALL =
+LOCAL_CLEAN =
+LOCAL_INSTALLDATA =
+LOCAL_INSTALLEXEC =
+
 if NO_GIT_VERSION
 export NO_VERSION="yes"
 endif
@@ -43,6 +48,7 @@ include tools/Makefile.am
 include Makefile-rocksdb.am
 include compressor/Makefile.am
 include tracing/Makefile.am
+include pybind/Makefile.am
 
 
 # shell scripts
@@ -192,11 +198,12 @@ CLEANFILES += ceph_ver.h sample.fetch_config
 
 # cleaning
 
-clean-local::
+base-clean-local::
 	rm -f *.so 
 	find . -name '*.gcno' -o -name '*.gcda' -o -name '*.lcov' -o -name "*.o" -o -name "*.lo" | xargs rm -f
 	rm -f ceph java/java/com/ceph/crush/Bucket.class
 
+LOCAL_CLEAN += base-clean-local
 
 # pybind
 
@@ -242,11 +249,13 @@ if ENABLE_COVERAGE
 	-test/coverage.sh -d $(srcdir) -o check-coverage make check
 endif
 
-install-data-local:: install-coverage
+base-install-data-local:: install-coverage
 	-mkdir -p $(DESTDIR)$(sysconfdir)/ceph
 	-mkdir -p $(DESTDIR)$(localstatedir)/log/ceph
 	-mkdir -p $(DESTDIR)$(localstatedir)/lib/ceph/tmp
 
+LOCAL_INSTALLDATA += base-install-data-local
+
 uninstall-local:: uninstall-coverage
 	-rmdir -p $(DESTDIR)$(sysconfdir)/ceph/
 	-rmdir -p $(DESTDIR)$(localstatedir)/log/ceph
@@ -277,3 +286,9 @@ if ENABLE_SERVER
 include Makefile-server.am
 endif
 
+# local targets
+
+all-local: $(LOCAL_ALL)
+clean-local: $(LOCAL_CLEAN)
+install-exec-local: $(LOCAL_INSTALLEXEC)
+install-data-local: $(LOCAL_INSTALLDATA)
diff --git a/src/Makefile.in b/src/Makefile.in
index ad24a20..0f0ab7e 100644
--- a/src/Makefile.in
+++ b/src/Makefile.in
@@ -116,25 +116,26 @@ DIST_COMMON = $(srcdir)/Makefile-env.am $(srcdir)/arch/Makefile.am \
 	$(srcdir)/tools/Makefile-client.am \
 	$(srcdir)/tools/Makefile-server.am \
 	$(srcdir)/Makefile-rocksdb.am $(srcdir)/compressor/Makefile.am \
-	$(srcdir)/tracing/Makefile.am $(srcdir)/Makefile-client.am \
-	$(srcdir)/Makefile-server.am $(srcdir)/Makefile.in \
-	$(srcdir)/Makefile.am $(srcdir)/acconfig.h.in \
-	$(dist_bin_SCRIPTS) $(top_srcdir)/depcomp \
-	$(am__python_PYTHON_DIST) $(top_srcdir)/py-compile \
-	$(dist_noinst_DATA) $(am__noinst_HEADERS_DIST) \
-	$(top_srcdir)/test-driver README TODO
-bin_PROGRAMS = $(am__EXEEXT_27) $(am__EXEEXT_28) $(am__EXEEXT_29) \
-	$(am__EXEEXT_30) $(am__EXEEXT_31) $(am__EXEEXT_32) \
-	$(am__EXEEXT_33) $(am__EXEEXT_34) $(am__EXEEXT_35) \
+	$(srcdir)/tracing/Makefile.am $(srcdir)/pybind/Makefile.am \
+	$(srcdir)/Makefile-client.am $(srcdir)/Makefile-server.am \
+	$(srcdir)/Makefile.in $(srcdir)/Makefile.am \
+	$(srcdir)/acconfig.h.in $(dist_bin_SCRIPTS) \
+	$(top_srcdir)/depcomp $(am__python_PYTHON_DIST) \
+	$(top_srcdir)/py-compile $(dist_noinst_DATA) \
+	$(am__noinst_HEADERS_DIST) $(top_srcdir)/test-driver README \
+	TODO
+bin_PROGRAMS = $(am__EXEEXT_28) $(am__EXEEXT_29) $(am__EXEEXT_30) \
+	$(am__EXEEXT_31) $(am__EXEEXT_32) $(am__EXEEXT_33) \
+	$(am__EXEEXT_34) $(am__EXEEXT_35) $(am__EXEEXT_36) \
 	monmaptool$(EXEEXT) crushtool$(EXEEXT) osdmaptool$(EXEEXT) \
-	ceph-conf$(EXEEXT) ceph-authtool$(EXEEXT) $(am__EXEEXT_36) \
-	$(am__EXEEXT_37) $(am__EXEEXT_38) $(am__EXEEXT_39) \
-	$(am__EXEEXT_40) $(am__EXEEXT_41) $(am__EXEEXT_42) \
-	$(am__EXEEXT_43)
-noinst_PROGRAMS = $(am__EXEEXT_59) $(am__EXEEXT_60) $(am__EXEEXT_61)
+	ceph-conf$(EXEEXT) ceph-authtool$(EXEEXT) $(am__EXEEXT_37) \
+	$(am__EXEEXT_38) $(am__EXEEXT_39) $(am__EXEEXT_40) \
+	$(am__EXEEXT_41) $(am__EXEEXT_42) $(am__EXEEXT_43) \
+	$(am__EXEEXT_44)
+noinst_PROGRAMS = $(am__EXEEXT_60) $(am__EXEEXT_61) $(am__EXEEXT_62)
 sbin_PROGRAMS =
-su_sbin_PROGRAMS = $(am__EXEEXT_62)
-check_PROGRAMS = $(am__EXEEXT_57) $(am__EXEEXT_58) \
+su_sbin_PROGRAMS = $(am__EXEEXT_63)
+check_PROGRAMS = $(am__EXEEXT_58) $(am__EXEEXT_59) \
 	unittest_subprocess$(EXEEXT) \
 	unittest_async_compressor$(EXEEXT)
 
@@ -153,47 +154,50 @@ check_PROGRAMS = $(am__EXEEXT_57) $(am__EXEEXT_58) \
 @SOLARIS_TRUE@       -D_PTHREADS \
 @SOLARIS_TRUE@       -D_POSIX_C_SOURCE
 
-@LINUX_TRUE@am__append_6 = -Wl,--as-needed $(HARDENING_LDFLAGS)
-@USE_BOOST_SPIRIT_OLD_HDR_TRUE@am__append_7 = -DUSE_BOOST_SPIRIT_OLD_HDR
-@WITH_LIBATOMIC_TRUE@am__append_8 = -latomic_ops
-@ENABLE_COVERAGE_TRUE@am__append_9 = -fprofile-arcs -ftest-coverage
-@ENABLE_COVERAGE_TRUE@am__append_10 = -fprofile-arcs -ftest-coverage -O0
-@FREEBSD_TRUE@am__append_11 = -lexecinfo
-@LINUX_TRUE@am__append_12 = -lrt
-@WITH_PROFILER_TRUE@am__append_13 = -lprofiler
-@WITH_LIBAIO_TRUE@am__append_14 = -laio
-@WITH_LIBZFS_TRUE@am__append_15 = libos_zfs.a -lzfs
-@WITH_TCMALLOC_MINIMAL_TRUE@am__append_16 = -ltcmalloc_minimal
-@WITH_TCMALLOC_TRUE@am__append_17 = -ltcmalloc
-@WITH_JEMALLOC_TRUE@am__append_18 = -ljemalloc
-@WITH_JEMALLOC_TRUE@am__append_19 = -ljemalloc
-@WITH_JEMALLOC_TRUE@am__append_20 = -ljemalloc
+@LINUX_TRUE@am__append_6 = $(HARDENING_CFLAGS)
+@LINUX_TRUE@am__append_7 = $(HARDENING_CFLAGS) 
+@LINUX_TRUE@am__append_8 = -Wl,--as-needed $(HARDENING_LDFLAGS)
+@AIX_TRUE@am__append_9 = -Wl,-brtl 
+@USE_BOOST_SPIRIT_OLD_HDR_TRUE@am__append_10 = -DUSE_BOOST_SPIRIT_OLD_HDR
+@WITH_LIBATOMIC_TRUE@am__append_11 = -latomic_ops
+@ENABLE_COVERAGE_TRUE@am__append_12 = -fprofile-arcs -ftest-coverage
+@ENABLE_COVERAGE_TRUE@am__append_13 = -fprofile-arcs -ftest-coverage -O0
+@FREEBSD_TRUE@am__append_14 = -lexecinfo
+@LINUX_TRUE@am__append_15 = -lrt
+@WITH_PROFILER_TRUE@am__append_16 = -lprofiler
+@WITH_LIBAIO_TRUE@am__append_17 = -laio
+@WITH_LIBZFS_TRUE@am__append_18 = libos_zfs.a -lzfs
+@WITH_TCMALLOC_MINIMAL_TRUE@am__append_19 = -ltcmalloc_minimal
+@WITH_TCMALLOC_TRUE@am__append_20 = -ltcmalloc
 @WITH_JEMALLOC_TRUE@am__append_21 = -ljemalloc
-@ENABLE_COVERAGE_TRUE@am__append_22 = -lgcov
+@WITH_JEMALLOC_TRUE@am__append_22 = -ljemalloc
+@WITH_JEMALLOC_TRUE@am__append_23 = -ljemalloc
+@WITH_JEMALLOC_TRUE@am__append_24 = -ljemalloc
+@ENABLE_COVERAGE_TRUE@am__append_25 = -lgcov
 
 # libkv/libos linking order is ornery
-@WITH_SLIBROCKSDB_TRUE@am__append_23 = rocksdb/librocksdb.a
-@ENABLE_CLIENT_TRUE@am__append_24 = brag/client/ceph-brag ceph \
+@WITH_SLIBROCKSDB_TRUE@am__append_26 = rocksdb/librocksdb.a
+@ENABLE_CLIENT_TRUE@am__append_27 = brag/client/ceph-brag ceph \
 @ENABLE_CLIENT_TRUE@	ceph-post-file
-@ENABLE_CLIENT_TRUE@am__append_25 = brag/server brag/README.md brag/client
-@ENABLE_SERVER_TRUE@am__append_26 = libkv.a
-@ENABLE_SERVER_TRUE@am__append_27 = \
+@ENABLE_CLIENT_TRUE@am__append_28 = brag/server brag/README.md brag/client
+@ENABLE_SERVER_TRUE@am__append_29 = libkv.a
+@ENABLE_SERVER_TRUE@am__append_30 = \
 @ENABLE_SERVER_TRUE@	kv/KeyValueDB.h \
 @ENABLE_SERVER_TRUE@	kv/LevelDBStore.h
 
-@ENABLE_SERVER_TRUE@@WITH_SLIBROCKSDB_TRUE@am__append_28 = -I rocksdb/include -fPIC
-@ENABLE_SERVER_TRUE@@WITH_SLIBROCKSDB_TRUE@am__append_29 = kv/RocksDBStore.cc
-@ENABLE_SERVER_TRUE@@WITH_SLIBROCKSDB_TRUE@am__append_30 = rocksdb/librocksdb.a
-@ENABLE_SERVER_TRUE@@WITH_SLIBROCKSDB_TRUE@am__append_31 = kv/RocksDBStore.h
-@ENABLE_SERVER_TRUE@@WITH_DLIBROCKSDB_TRUE@am__append_32 = kv/RocksDBStore.cc
-@ENABLE_SERVER_TRUE@@WITH_DLIBROCKSDB_TRUE@am__append_33 = -lrocksdb
-@ENABLE_SERVER_TRUE@@WITH_DLIBROCKSDB_TRUE@am__append_34 = kv/RocksDBStore.h
-@ENABLE_SERVER_TRUE@@WITH_KINETIC_TRUE@am__append_35 = kv/KineticStore.cc
-@ENABLE_SERVER_TRUE@@WITH_KINETIC_TRUE@am__append_36 = -std=gnu++11
-@ENABLE_SERVER_TRUE@@WITH_KINETIC_TRUE@am__append_37 = -lkinetic_client -lprotobuf -lglog -lgflags libcrypto.a
-@ENABLE_SERVER_TRUE@@WITH_KINETIC_TRUE@am__append_38 = kv/KineticStore.h
-@ENABLE_SERVER_TRUE@@WITH_MON_TRUE@am__append_39 = libmon.a
-@ENABLE_SERVER_TRUE@@WITH_MON_TRUE@am__append_40 = \
+@ENABLE_SERVER_TRUE@@WITH_SLIBROCKSDB_TRUE@am__append_31 = -I rocksdb/include -fPIC
+@ENABLE_SERVER_TRUE@@WITH_SLIBROCKSDB_TRUE@am__append_32 = kv/RocksDBStore.cc
+@ENABLE_SERVER_TRUE@@WITH_SLIBROCKSDB_TRUE@am__append_33 = rocksdb/librocksdb.a
+@ENABLE_SERVER_TRUE@@WITH_SLIBROCKSDB_TRUE@am__append_34 = kv/RocksDBStore.h
+@ENABLE_SERVER_TRUE@@WITH_DLIBROCKSDB_TRUE@am__append_35 = kv/RocksDBStore.cc
+@ENABLE_SERVER_TRUE@@WITH_DLIBROCKSDB_TRUE@am__append_36 = -lrocksdb
+@ENABLE_SERVER_TRUE@@WITH_DLIBROCKSDB_TRUE@am__append_37 = kv/RocksDBStore.h
+@ENABLE_SERVER_TRUE@@WITH_KINETIC_TRUE@am__append_38 = kv/KineticStore.cc
+@ENABLE_SERVER_TRUE@@WITH_KINETIC_TRUE@am__append_39 = -std=gnu++11
+@ENABLE_SERVER_TRUE@@WITH_KINETIC_TRUE@am__append_40 = -lkinetic_client -lprotobuf -lglog -lgflags libcrypto.a
+@ENABLE_SERVER_TRUE@@WITH_KINETIC_TRUE@am__append_41 = kv/KineticStore.h
+@ENABLE_SERVER_TRUE@@WITH_MON_TRUE@am__append_42 = libmon.a
+@ENABLE_SERVER_TRUE@@WITH_MON_TRUE@am__append_43 = \
 @ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	mon/AuthMonitor.h \
 @ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	mon/DataHealthService.h \
 @ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	mon/Elector.h \
@@ -222,10 +226,10 @@ check_PROGRAMS = $(am__EXEEXT_57) $(am__EXEEXT_58) \
 
 
 # There are no libmds_types so use the full mds library for dencoder for now
-@ENABLE_CLIENT_TRUE@am__append_41 = $(LIBMDS_SOURCES)
-@ENABLE_CLIENT_TRUE@am__append_42 = $(LIBMDS_DEPS)
-@ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@am__append_43 = libmds.la
-@ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@am__append_44 =  \
+@ENABLE_CLIENT_TRUE@am__append_44 = $(LIBMDS_SOURCES)
+@ENABLE_CLIENT_TRUE@am__append_45 = $(LIBMDS_DEPS)
+@ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@am__append_46 = libmds.la
+@ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@am__append_47 =  \
 @ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@	mds/inode_backtrace.h \
 @ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@	mds/flock.h mds/locks.c \
 @ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@	mds/locks.h mds/CDentry.h \
@@ -279,17 +283,17 @@ check_PROGRAMS = $(am__EXEEXT_57) $(am__EXEEXT_58) \
 @ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@	mds/events/ETableClient.h \
 @ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@	mds/events/ETableServer.h \
 @ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@	mds/events/EUpdate.h
-@ENABLE_SERVER_TRUE@@LINUX_TRUE@am__append_45 = os/BtrfsFileStoreBackend.cc
-@ENABLE_SERVER_TRUE@@WITH_LIBAIO_TRUE@am__append_46 = os/newstore/newstore_types.cc
-@ENABLE_SERVER_TRUE@@WITH_LIBAIO_TRUE@am__append_47 = os/newstore/NewStore.cc
-@ENABLE_SERVER_TRUE@@WITH_LIBXFS_TRUE@am__append_48 = \
+@ENABLE_SERVER_TRUE@@LINUX_TRUE@am__append_48 = os/BtrfsFileStoreBackend.cc
+@ENABLE_SERVER_TRUE@@WITH_LIBAIO_TRUE@am__append_49 = os/newstore/newstore_types.cc
+@ENABLE_SERVER_TRUE@@WITH_LIBAIO_TRUE@am__append_50 = os/newstore/NewStore.cc
+@ENABLE_SERVER_TRUE@@WITH_LIBXFS_TRUE@am__append_51 = \
 @ENABLE_SERVER_TRUE@@WITH_LIBXFS_TRUE@    os/fs/XFS.cc \
 @ENABLE_SERVER_TRUE@@WITH_LIBXFS_TRUE@    os/XfsFileStoreBackend.cc
 
-@ENABLE_SERVER_TRUE@@WITH_LIBZFS_TRUE@am__append_49 = os/ZFSFileStoreBackend.cc
-@ENABLE_SERVER_TRUE@@WITH_LTTNG_TRUE@am__append_50 = $(LIBOS_TP)
-@ENABLE_SERVER_TRUE@am__append_51 = libos.a
-@ENABLE_SERVER_TRUE@am__append_52 = \
+@ENABLE_SERVER_TRUE@@WITH_LIBZFS_TRUE@am__append_52 = os/ZFSFileStoreBackend.cc
+@ENABLE_SERVER_TRUE@@WITH_LTTNG_TRUE@am__append_53 = $(LIBOS_TP)
+@ENABLE_SERVER_TRUE@am__append_54 = libos.a
+@ENABLE_SERVER_TRUE@am__append_55 = \
 @ENABLE_SERVER_TRUE@	os/btrfs_ioctl.h \
 @ENABLE_SERVER_TRUE@	os/chain_xattr.h \
 @ENABLE_SERVER_TRUE@	os/newstore/newstore_types.h \
@@ -319,10 +323,10 @@ check_PROGRAMS = $(am__EXEEXT_57) $(am__EXEEXT_58) \
 @ENABLE_SERVER_TRUE@	os/XfsFileStoreBackend.h \
 @ENABLE_SERVER_TRUE@	os/ZFSFileStoreBackend.h
 
-@ENABLE_SERVER_TRUE@@WITH_LIBZFS_TRUE@am__append_53 = libos_zfs.a
-@ENABLE_SERVER_TRUE@@WITH_LIBZFS_TRUE@am__append_54 = os/ZFS.h
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@am__append_55 = libosd.a
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@am__append_56 = \
+@ENABLE_SERVER_TRUE@@WITH_LIBZFS_TRUE@am__append_56 = libos_zfs.a
+@ENABLE_SERVER_TRUE@@WITH_LIBZFS_TRUE@am__append_57 = os/ZFS.h
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@am__append_58 = libosd.a
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@am__append_59 = \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	osd/ClassHandler.h \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	osd/HitSet.h \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	osd/OSD.h \
@@ -344,26 +348,26 @@ check_PROGRAMS = $(am__EXEEXT_57) $(am__EXEEXT_58) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	osd/Watch.h \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	osd/osd_types.h
 
-@LINUX_TRUE@am__append_57 = -export-symbols-regex '.*__erasure_code_.*'
-@LINUX_TRUE@am__append_58 = -export-symbols-regex '.*__erasure_code_.*'
-@HAVE_NEON_TRUE@am__append_59 = libec_jerasure_neon.la
 @LINUX_TRUE@am__append_60 = -export-symbols-regex '.*__erasure_code_.*'
-@HAVE_SSSE3_TRUE@am__append_61 = libec_jerasure_sse3.la
-@LINUX_TRUE@am__append_62 = -export-symbols-regex '.*__erasure_code_.*'
-@HAVE_SSE4_PCLMUL_TRUE@am__append_63 = libec_jerasure_sse4.la
-@LINUX_TRUE@am__append_64 = -export-symbols-regex '.*__erasure_code_.*'
+@LINUX_TRUE@am__append_61 = -export-symbols-regex '.*__erasure_code_.*'
+@HAVE_NEON_TRUE@am__append_62 = libec_jerasure_neon.la
+@LINUX_TRUE@am__append_63 = -export-symbols-regex '.*__erasure_code_.*'
+@HAVE_SSSE3_TRUE@am__append_64 = libec_jerasure_sse3.la
 @LINUX_TRUE@am__append_65 = -export-symbols-regex '.*__erasure_code_.*'
-@LINUX_TRUE@am__append_66 = -export-symbols-regex '.*__erasure_code_.*'
+@HAVE_SSE4_PCLMUL_TRUE@am__append_66 = libec_jerasure_sse4.la
 @LINUX_TRUE@am__append_67 = -export-symbols-regex '.*__erasure_code_.*'
-@HAVE_NEON_TRUE@am__append_68 = libec_shec_neon.la
+@LINUX_TRUE@am__append_68 = -export-symbols-regex '.*__erasure_code_.*'
 @LINUX_TRUE@am__append_69 = -export-symbols-regex '.*__erasure_code_.*'
-@HAVE_SSSE3_TRUE@am__append_70 = libec_shec_sse3.la
-@LINUX_TRUE@am__append_71 = -export-symbols-regex '.*__erasure_code_.*'
-@HAVE_SSE4_PCLMUL_TRUE@am__append_72 = libec_shec_sse4.la
-@LINUX_TRUE@am__append_73 = -export-symbols-regex '.*__erasure_code_.*'
+@LINUX_TRUE@am__append_70 = -export-symbols-regex '.*__erasure_code_.*'
+@HAVE_NEON_TRUE@am__append_71 = libec_shec_neon.la
+@LINUX_TRUE@am__append_72 = -export-symbols-regex '.*__erasure_code_.*'
+@HAVE_SSSE3_TRUE@am__append_73 = libec_shec_sse3.la
+@LINUX_TRUE@am__append_74 = -export-symbols-regex '.*__erasure_code_.*'
+@HAVE_SSE4_PCLMUL_TRUE@am__append_75 = libec_shec_sse4.la
+@LINUX_TRUE@am__append_76 = -export-symbols-regex '.*__erasure_code_.*'
 
 # ISA
-@WITH_BETTER_YASM_ELF64_TRUE@am__append_74 = \
+@WITH_BETTER_YASM_ELF64_TRUE@am__append_77 = \
 @WITH_BETTER_YASM_ELF64_TRUE@	erasure-code/isa/ErasureCodeIsa.h \
 @WITH_BETTER_YASM_ELF64_TRUE@	erasure-code/isa/ErasureCodeIsaTableCache.h \
 @WITH_BETTER_YASM_ELF64_TRUE@	erasure-code/isa/xor_op.h \
@@ -374,10 +378,10 @@ check_PROGRAMS = $(am__EXEEXT_57) $(am__EXEEXT_58) \
 @WITH_BETTER_YASM_ELF64_TRUE@	erasure-code/isa/isa-l/include/gf_vect_mul.h \
 @WITH_BETTER_YASM_ELF64_TRUE@	erasure-code/isa/isa-l/include/types.h
 
- at LINUX_TRUE@@WITH_BETTER_YASM_ELF64_TRUE at am__append_75 = -export-symbols-regex '.*__erasure_code_.*'
- at WITH_BETTER_YASM_ELF64_TRUE@am__append_76 = libec_isa.la
- at ENABLE_CLIENT_TRUE@am__append_77 = libclient.la
- at ENABLE_CLIENT_TRUE@am__append_78 = \
+ at LINUX_TRUE@@WITH_BETTER_YASM_ELF64_TRUE at am__append_78 = -export-symbols-regex '.*__erasure_code_.*'
+ at WITH_BETTER_YASM_ELF64_TRUE@am__append_79 = libec_isa.la
+ at ENABLE_CLIENT_TRUE@am__append_80 = libclient.la
+ at ENABLE_CLIENT_TRUE@am__append_81 = \
 @ENABLE_CLIENT_TRUE@	client/Client.h \
 @ENABLE_CLIENT_TRUE@	client/Dentry.h \
 @ENABLE_CLIENT_TRUE@	client/Dir.h \
@@ -392,52 +396,57 @@ check_PROGRAMS = $(am__EXEEXT_57) $(am__EXEEXT_58) \
 @ENABLE_CLIENT_TRUE@	client/ioctl.h \
 @ENABLE_CLIENT_TRUE@	client/ObjecterWriteback.h
 
- at ENABLE_CLIENT_TRUE@@WITH_FUSE_TRUE at am__append_79 = libclient_fuse.la
- at ENABLE_CLIENT_TRUE@@WITH_FUSE_TRUE at am__append_80 = client/fuse_ll.h
- at ENABLE_CLIENT_TRUE@am__append_81 = ceph_test_ioctls
- at WITH_TCMALLOC_TRUE@am__append_82 = perfglue/heap_profiler.cc
- at WITH_TCMALLOC_TRUE@am__append_83 = -fno-builtin-malloc -fno-builtin-calloc -fno-builtin-realloc -fno-builtin-free
- at WITH_TCMALLOC_TRUE@am__append_84 = -fno-builtin-malloc -fno-builtin-calloc -fno-builtin-realloc -fno-builtin-free
- at WITH_TCMALLOC_FALSE@@WITH_TCMALLOC_MINIMAL_TRUE at am__append_85 = perfglue/heap_profiler.cc
- at WITH_TCMALLOC_FALSE@@WITH_TCMALLOC_MINIMAL_TRUE at am__append_86 = -fno-builtin-malloc -fno-builtin-calloc -fno-builtin-realloc -fno-builtin-free
- at WITH_TCMALLOC_FALSE@@WITH_TCMALLOC_MINIMAL_TRUE at am__append_87 = -fno-builtin-malloc -fno-builtin-calloc -fno-builtin-realloc -fno-builtin-free
- at WITH_TCMALLOC_FALSE@@WITH_TCMALLOC_MINIMAL_FALSE at am__append_88 = perfglue/disabled_heap_profiler.cc
- at WITH_PROFILER_TRUE@am__append_89 = perfglue/cpu_profiler.cc
- at WITH_PROFILER_FALSE@am__append_90 = perfglue/disabled_stubs.cc
- at ENABLE_SERVER_TRUE@am__append_91 = \
+ at ENABLE_CLIENT_TRUE@@WITH_FUSE_TRUE at am__append_82 = libclient_fuse.la
+ at ENABLE_CLIENT_TRUE@@WITH_FUSE_TRUE at am__append_83 = client/fuse_ll.h
+ at ENABLE_CLIENT_TRUE@am__append_84 = ceph_test_ioctls
+ at WITH_TCMALLOC_TRUE@am__append_85 = perfglue/heap_profiler.cc
+ at WITH_TCMALLOC_TRUE@am__append_86 = -fno-builtin-malloc -fno-builtin-calloc -fno-builtin-realloc -fno-builtin-free
+ at WITH_TCMALLOC_TRUE@am__append_87 = -fno-builtin-malloc -fno-builtin-calloc -fno-builtin-realloc -fno-builtin-free
+ at WITH_TCMALLOC_FALSE@@WITH_TCMALLOC_MINIMAL_TRUE at am__append_88 = perfglue/heap_profiler.cc
+ at WITH_TCMALLOC_FALSE@@WITH_TCMALLOC_MINIMAL_TRUE at am__append_89 = -fno-builtin-malloc -fno-builtin-calloc -fno-builtin-realloc -fno-builtin-free
+ at WITH_TCMALLOC_FALSE@@WITH_TCMALLOC_MINIMAL_TRUE at am__append_90 = -fno-builtin-malloc -fno-builtin-calloc -fno-builtin-realloc -fno-builtin-free
+ at WITH_TCMALLOC_FALSE@@WITH_TCMALLOC_MINIMAL_FALSE at am__append_91 = perfglue/disabled_heap_profiler.cc
+ at WITH_PROFILER_TRUE@am__append_92 = perfglue/cpu_profiler.cc
+ at WITH_PROFILER_FALSE@am__append_93 = perfglue/disabled_stubs.cc
+ at ENABLE_SERVER_TRUE@am__append_94 = \
 @ENABLE_SERVER_TRUE@	common/xattr.c \
 @ENABLE_SERVER_TRUE@	common/ipaddr.cc \
 @ENABLE_SERVER_TRUE@	common/ceph_json.cc \
 @ENABLE_SERVER_TRUE@	common/util.cc \
 @ENABLE_SERVER_TRUE@	common/pick_address.cc
 
- at LINUX_TRUE@am__append_92 = \
+ at LINUX_TRUE@am__append_95 = \
 @LINUX_TRUE@	common/linux_version.c 
 
- at SOLARIS_TRUE@am__append_93 = \
+ at SOLARIS_TRUE@am__append_96 = \
 @SOLARIS_TRUE@        common/solaris_errno.cc
 
- at LINUX_TRUE@@WITH_RBD_TRUE at am__append_94 = \
- at LINUX_TRUE@@WITH_RBD_TRUE@	common/blkdev.cc
+ at AIX_TRUE@am__append_97 = \
+ at AIX_TRUE@        common/aix_errno.cc
 
- at ENABLE_XIO_TRUE@am__append_95 = \
+
+# used by RBD and FileStore
+ at LINUX_TRUE@am__append_98 = \
+ at LINUX_TRUE@	common/blkdev.cc
+
+ at ENABLE_XIO_TRUE@am__append_99 = \
 @ENABLE_XIO_TRUE@	common/address_helper.cc
 
- at WITH_GOOD_YASM_ELF64_TRUE@am__append_96 = common/crc32c_intel_fast_asm.S common/crc32c_intel_fast_zero_asm.S
- at HAVE_ARMV8_CRC_TRUE@am__append_97 = libcommon_crc_aarch64.la
- at HAVE_ARMV8_CRC_TRUE@am__append_98 = libcommon_crc_aarch64.la
- at LINUX_TRUE@am__append_99 = -lrt -lblkid
- at ENABLE_XIO_TRUE@am__append_100 = \
+ at WITH_GOOD_YASM_ELF64_TRUE@am__append_100 = common/crc32c_intel_fast_asm.S common/crc32c_intel_fast_zero_asm.S
+ at HAVE_ARMV8_CRC_TRUE@am__append_101 = libcommon_crc_aarch64.la
+ at HAVE_ARMV8_CRC_TRUE@am__append_102 = libcommon_crc_aarch64.la
+ at LINUX_TRUE@am__append_103 = -lrt -lblkid
+ at ENABLE_XIO_TRUE@am__append_104 = \
 @ENABLE_XIO_TRUE@	common/address_helper.h
 
- at LINUX_TRUE@am__append_101 = libsecret.la
- at LINUX_TRUE@am__append_102 = msg/async/EventEpoll.cc
- at DARWIN_TRUE@am__append_103 = msg/async/EventKqueue.cc
- at FREEBSD_TRUE@am__append_104 = msg/async/EventKqueue.cc
- at LINUX_TRUE@am__append_105 = msg/async/EventEpoll.h
- at DARWIN_TRUE@am__append_106 = msg/async/EventKqueue.h
- at FREEBSD_TRUE@am__append_107 = msg/async/EventKqueue.h
- at ENABLE_XIO_TRUE@am__append_108 = \
+ at LINUX_TRUE@am__append_105 = libsecret.la
+ at LINUX_TRUE@am__append_106 = msg/async/EventEpoll.cc
+ at DARWIN_TRUE@am__append_107 = msg/async/EventKqueue.cc
+ at FREEBSD_TRUE@am__append_108 = msg/async/EventKqueue.cc
+ at LINUX_TRUE@am__append_109 = msg/async/EventEpoll.h
+ at DARWIN_TRUE@am__append_110 = msg/async/EventKqueue.h
+ at FREEBSD_TRUE@am__append_111 = msg/async/EventKqueue.h
+ at ENABLE_XIO_TRUE@am__append_112 = \
 @ENABLE_XIO_TRUE@	msg/xio/QueueStrategy.cc \
 @ENABLE_XIO_TRUE@	msg/xio/XioConnection.cc \
 @ENABLE_XIO_TRUE@	msg/xio/XioMessenger.cc \
@@ -445,7 +454,7 @@ check_PROGRAMS = $(am__EXEEXT_57) $(am__EXEEXT_58) \
 @ENABLE_XIO_TRUE@	msg/xio/XioPortal.cc \
 @ENABLE_XIO_TRUE@	msg/xio/XioPool.cc
 
- at ENABLE_XIO_TRUE@am__append_109 = \
+ at ENABLE_XIO_TRUE@am__append_113 = \
 @ENABLE_XIO_TRUE@	msg/xio/DispatchStrategy.h \
 @ENABLE_XIO_TRUE@	msg/xio/FastStrategy.h \
 @ENABLE_XIO_TRUE@	msg/xio/QueueStrategy.h \
@@ -457,18 +466,18 @@ check_PROGRAMS = $(am__EXEEXT_57) $(am__EXEEXT_58) \
 @ENABLE_XIO_TRUE@	msg/xio/XioPortal.h \
 @ENABLE_XIO_TRUE@	msg/xio/XioSubmit.h
 
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at am__append_110 =  \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at am__append_114 =  \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	librados_internal.la \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	librados_api.la \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	libjournal.la
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at am__append_111 = \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at am__append_115 = \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	librados_internal.la libcls_lock_client.la \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(LIBOSDC) $(LIBCOMMON_DEPS)
 
- at ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOS_TRUE at am__append_112 = -fvisibility=hidden -fvisibility-inlines-hidden
- at ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOS_TRUE at am__append_113 = -Xcompiler -Xlinker -Xcompiler '--exclude-libs=ALL'
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at am__append_114 = librados.la
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at am__append_115 = \
+ at ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOS_TRUE at am__append_116 = -fvisibility=hidden -fvisibility-inlines-hidden
+ at ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOS_TRUE at am__append_117 = -Xcompiler -Xlinker -Xcompiler '--exclude-libs=ALL'
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at am__append_118 = librados.la
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at am__append_119 = \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	librados/snap_set_diff.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	librados/AioCompletionImpl.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	librados/IoCtxImpl.h \
@@ -477,13 +486,13 @@ check_PROGRAMS = $(am__EXEEXT_57) $(am__EXEEXT_58) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	librados/RadosXattrIter.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	librados/ListObjectImpl.h
 
- at ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE at am__append_116 = -export-symbols-regex '^radosstriper_.*'
- at ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE at am__append_117 = libradosstriper.la
- at ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE at am__append_118 = \
+ at ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE at am__append_120 = -export-symbols-regex '^radosstriper_.*'
+ at ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE at am__append_121 = libradosstriper.la
+ at ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE at am__append_122 = \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE@	libradosstriper/RadosStriperImpl.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE@	libradosstriper/MultiAioCompletionImpl.h
 
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at am__append_119 = \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at am__append_123 = \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	journal/AsyncOpTracker.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	journal/Entry.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	journal/Future.h \
@@ -499,25 +508,24 @@ check_PROGRAMS = $(am__EXEEXT_57) $(am__EXEEXT_58) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	journal/ReplayHandler.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	journal/Utils.h
 
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at am__append_120 = libjournal.la
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__append_121 = librbd_internal.la \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at am__append_124 = libjournal.la
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__append_125 = librbd_internal.la \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd_api.la
- at ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__append_122 = -Xcompiler -Xlinker -Xcompiler '--exclude-libs=ALL'
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__append_123 = librbd.la
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__append_124 = \
+ at ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__append_126 = -Xcompiler -Xlinker -Xcompiler '--exclude-libs=ALL'
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__append_127 = librbd.la
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__append_128 = \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/AioCompletion.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/AioImageRequest.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/AioImageRequestWQ.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/AioObjectRequest.h \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/AsyncFlattenRequest.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/AsyncObjectThrottle.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/AsyncOperation.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/AsyncRequest.h \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/AsyncResizeRequest.h \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/AsyncTrimRequest.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/CopyupRequest.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/DiffIterate.h \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/ExclusiveLock.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/ImageCtx.h \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/ImageState.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/ImageWatcher.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/internal.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/Journal.h \
@@ -527,23 +535,53 @@ check_PROGRAMS = $(am__EXEEXT_57) $(am__EXEEXT_58) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/LibrbdWriteback.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/ObjectMap.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/parent_types.h \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/RebuildObjectMapRequest.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/SnapInfo.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/TaskFinisher.h \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/WatchNotifyTypes.h
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/Utils.h \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/WatchNotifyTypes.h \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/exclusive_lock/AcquireRequest.h \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/exclusive_lock/ReleaseRequest.h \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/image/CloseRequest.h \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/image/OpenRequest.h \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/image/RefreshParentRequest.h \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/image/RefreshRequest.h \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/image/SetSnapRequest.h \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/object_map/InvalidateRequest.h \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/object_map/LockRequest.h \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/object_map/Request.h \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/object_map/RefreshRequest.h \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/object_map/ResizeRequest.h \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/object_map/SnapshotCreateRequest.h \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/object_map/SnapshotRemoveRequest.h \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/object_map/SnapshotRollbackRequest.h \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/object_map/UnlockRequest.h \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/object_map/UpdateRequest.h \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/operation/FlattenRequest.h \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/operation/RebuildObjectMapRequest.h \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/operation/RenameRequest.h \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/operation/Request.h \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/operation/ResizeRequest.h \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/operation/SnapshotCreateRequest.h \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/operation/SnapshotProtectRequest.h \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/operation/SnapshotRemoveRequest.h \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/operation/SnapshotRenameRequest.h \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/operation/SnapshotRollbackRequest.h \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/operation/SnapshotUnprotectRequest.h \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/operation/TrimRequest.h
 
 
 # inject rgw stuff in the decoder testcase
- at ENABLE_CLIENT_TRUE@am__append_125 = \
+ at ENABLE_CLIENT_TRUE@am__append_129 = \
 @ENABLE_CLIENT_TRUE@	rgw/rgw_dencoder.cc \
 @ENABLE_CLIENT_TRUE@	rgw/rgw_acl.cc \
+ at ENABLE_CLIENT_TRUE@	rgw/rgw_basic_types.cc \
 @ENABLE_CLIENT_TRUE@	rgw/rgw_common.cc \
 @ENABLE_CLIENT_TRUE@	rgw/rgw_env.cc \
 @ENABLE_CLIENT_TRUE@	rgw/rgw_json_enc.cc
 
- at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE at am__append_126 = librgw.la \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE at am__append_130 = librgw.la \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	libcivetweb.la
- at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE at am__append_127 = \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE at am__append_131 = \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(LIBRADOS) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	libcls_rgw_client.la \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	libcls_log_client.a \
@@ -560,18 +598,19 @@ check_PROGRAMS = $(am__EXEEXT_57) $(am__EXEEXT_58) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	-lfcgi \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	-ldl
 
- at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE at am__append_128 = radosgw \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE at am__append_132 = radosgw \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	radosgw-admin \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	radosgw-object-expirer
- at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE at am__append_129 = ceph_rgw_multiparser \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE at am__append_133 = ceph_rgw_multiparser \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	ceph_rgw_jsonparser
- at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE at am__append_130 = \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE at am__append_134 = \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	rgw/rgw_acl.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	rgw/rgw_acl_s3.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	rgw/rgw_acl_swift.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	rgw/rgw_client_io.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	rgw/rgw_fcgi.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	rgw/rgw_xml.h \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	rgw/rgw_basic_types.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	rgw/rgw_cache.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	rgw/rgw_common.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	rgw/rgw_cors.h \
@@ -624,31 +663,33 @@ check_PROGRAMS = $(am__EXEEXT_57) $(am__EXEEXT_58) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	civetweb/include/civetweb_conf.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	civetweb/src/md5.h
 
- at ENABLE_CLIENT_TRUE@am__append_131 = libcls_lock_client.la \
+ at ENABLE_CLIENT_TRUE@am__append_135 = libcls_lock_client.la \
 @ENABLE_CLIENT_TRUE@	libcls_refcount_client.la \
 @ENABLE_CLIENT_TRUE@	libcls_rgw_client.la libcls_rbd_client.la \
 @ENABLE_CLIENT_TRUE@	libcls_cephfs_client.la \
 @ENABLE_CLIENT_TRUE@	libcls_numops_client.la \
 @ENABLE_CLIENT_TRUE@	libcls_journal_client.la
- at ENABLE_CLIENT_TRUE@am__append_132 = libcls_lock_client.la \
+ at ENABLE_CLIENT_TRUE@am__append_136 = libcls_lock_client.la \
 @ENABLE_CLIENT_TRUE@	libcls_refcount_client.la \
 @ENABLE_CLIENT_TRUE@	libcls_replica_log_client.a \
- at ENABLE_CLIENT_TRUE@	libcls_rgw_client.la libcls_user_client.a \
+ at ENABLE_CLIENT_TRUE@	libcls_rgw_client.la libcls_rbd_client.la \
+ at ENABLE_CLIENT_TRUE@	libcls_user_client.a \
 @ENABLE_CLIENT_TRUE@	libcls_numops_client.la \
 @ENABLE_CLIENT_TRUE@	libcls_journal_client.la
- at ENABLE_CLIENT_TRUE@am__append_133 = libcls_version_client.a \
+ at ENABLE_CLIENT_TRUE@am__append_137 = libcls_version_client.a \
 @ENABLE_CLIENT_TRUE@	libcls_log_client.a \
 @ENABLE_CLIENT_TRUE@	libcls_statelog_client.a \
 @ENABLE_CLIENT_TRUE@	libcls_timeindex_client.a \
 @ENABLE_CLIENT_TRUE@	libcls_replica_log_client.a \
 @ENABLE_CLIENT_TRUE@	libcls_user_client.a
- at ENABLE_CLIENT_TRUE@am__append_134 = \
+ at ENABLE_CLIENT_TRUE@am__append_138 = \
 @ENABLE_CLIENT_TRUE@	cls/lock/cls_lock_types.h \
 @ENABLE_CLIENT_TRUE@	cls/lock/cls_lock_ops.h \
 @ENABLE_CLIENT_TRUE@	cls/lock/cls_lock_client.h \
 @ENABLE_CLIENT_TRUE@	cls/numops/cls_numops_client.h \
 @ENABLE_CLIENT_TRUE@	cls/rbd/cls_rbd.h \
 @ENABLE_CLIENT_TRUE@	cls/rbd/cls_rbd_client.h \
+ at ENABLE_CLIENT_TRUE@	cls/rbd/cls_rbd_types.h \
 @ENABLE_CLIENT_TRUE@	cls/refcount/cls_refcount_ops.h \
 @ENABLE_CLIENT_TRUE@	cls/refcount/cls_refcount_client.h \
 @ENABLE_CLIENT_TRUE@	cls/version/cls_version_types.h \
@@ -677,7 +718,7 @@ check_PROGRAMS = $(am__EXEEXT_57) $(am__EXEEXT_58) \
 @ENABLE_CLIENT_TRUE@	cls/journal/cls_journal_client.h \
 @ENABLE_CLIENT_TRUE@	cls/journal/cls_journal_types.h
 
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am__append_135 = libcls_hello.la \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am__append_139 = libcls_hello.la \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	libcls_numops.la \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	libcls_rbd.la \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	libcls_lock.la \
@@ -691,13 +732,13 @@ check_PROGRAMS = $(am__EXEEXT_57) $(am__EXEEXT_58) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	libcls_rgw.la \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	libcls_cephfs.la \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	libcls_journal.la
- at ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE at am__append_136 = libcls_kvs.la
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am__append_137 = \
+ at ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE at am__append_140 = libcls_kvs.la
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am__append_141 = \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	key_value_store/key_value_structure.h \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	key_value_store/kv_flat_btree_async.h \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	key_value_store/kvs_arg_types.h
 
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__append_138 = rbd_replay/ActionTypes.h \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__append_142 = rbd_replay/ActionTypes.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	rbd_replay/actions.hpp \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	rbd_replay/BoundedBuffer.hpp \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	rbd_replay/BufferReader.h \
@@ -707,26 +748,26 @@ check_PROGRAMS = $(am__EXEEXT_57) $(am__EXEEXT_58) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	rbd_replay/rbd_loc.hpp \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	rbd_replay/rbd_replay_debug.hpp \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	rbd_replay/Replayer.hpp
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__append_139 = librbd_replay_types.la \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__append_143 = librbd_replay_types.la \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd_replay.la \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd_replay_ios.la
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__append_140 = librbd_replay_types.la
- at ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__append_141 = rbd-replay
- at ENABLE_CLIENT_TRUE@@WITH_BABELTRACE_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__append_142 = rbd-replay-prep
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am__append_143 = \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__append_144 = librbd_replay_types.la
+ at ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__append_145 = rbd-replay
+ at ENABLE_CLIENT_TRUE@@WITH_BABELTRACE_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__append_146 = rbd-replay-prep
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am__append_147 = \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	test/erasure-code/test-erasure-code.sh \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	test/erasure-code/test-erasure-eio.sh
 
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am__append_144 = test/erasure-code/ceph_erasure_code_benchmark.h \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am__append_148 = test/erasure-code/ceph_erasure_code_benchmark.h \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	test/erasure-code/ceph_erasure_code_benchmark.h \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	test/erasure-code/ErasureCodeExample.h
- at ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE at am__append_145 = -ldl
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am__append_146 = ceph_erasure_code_benchmark \
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	ceph_erasure_code
- at ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE at am__append_147 = -ldl
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am__append_148 = ceph_erasure_code_non_regression
 @ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE at am__append_149 = -ldl
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am__append_150 = libec_example.la \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am__append_150 = ceph_erasure_code_benchmark \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	ceph_erasure_code
+ at ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE at am__append_151 = -ldl
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am__append_152 = ceph_erasure_code_non_regression
+ at ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE at am__append_153 = -ldl
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am__append_154 = libec_example.la \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	libec_missing_entry_point.la \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	libec_missing_version.la \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	libec_hangs.la \
@@ -740,19 +781,19 @@ check_PROGRAMS = $(am__EXEEXT_57) $(am__EXEEXT_58) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	libec_test_shec_sse4.la \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	libec_test_shec_sse3.la \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	libec_test_shec_generic.la
- at ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE at am__append_151 = -ldl
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am__append_152 = unittest_erasure_code_plugin \
+ at ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE at am__append_155 = -ldl
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am__append_156 = unittest_erasure_code_plugin \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	unittest_erasure_code \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	unittest_erasure_code_jerasure \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	unittest_erasure_code_plugin_jerasure
- at ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE at am__append_153 = -ldl
- at ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE at am__append_154 = -ldl
- at ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_BETTER_YASM_ELF64_TRUE@@WITH_OSD_TRUE at am__append_155 = -ldl
- at ENABLE_SERVER_TRUE@@WITH_BETTER_YASM_ELF64_TRUE@@WITH_OSD_TRUE at am__append_156 = unittest_erasure_code_isa \
- at ENABLE_SERVER_TRUE@@WITH_BETTER_YASM_ELF64_TRUE@@WITH_OSD_TRUE@	unittest_erasure_code_plugin_isa
- at ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_BETTER_YASM_ELF64_TRUE@@WITH_OSD_TRUE at am__append_157 = -ldl
+ at ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE at am__append_157 = -ldl
 @ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE at am__append_158 = -ldl
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am__append_159 =  \
+ at ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_BETTER_YASM_ELF64_TRUE@@WITH_OSD_TRUE at am__append_159 = -ldl
+ at ENABLE_SERVER_TRUE@@WITH_BETTER_YASM_ELF64_TRUE@@WITH_OSD_TRUE at am__append_160 = unittest_erasure_code_isa \
+ at ENABLE_SERVER_TRUE@@WITH_BETTER_YASM_ELF64_TRUE@@WITH_OSD_TRUE@	unittest_erasure_code_plugin_isa
+ at ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_BETTER_YASM_ELF64_TRUE@@WITH_OSD_TRUE at am__append_161 = -ldl
+ at ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE at am__append_162 = -ldl
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am__append_163 =  \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	unittest_erasure_code_lrc \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	unittest_erasure_code_plugin_lrc \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	unittest_erasure_code_shec \
@@ -761,44 +802,44 @@ check_PROGRAMS = $(am__EXEEXT_57) $(am__EXEEXT_58) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	unittest_erasure_code_shec_arguments \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	unittest_erasure_code_plugin_shec \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	unittest_erasure_code_example
- at ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE at am__append_160 = -ldl
- at ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE at am__append_161 = -ldl
- at ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE at am__append_162 = -ldl
- at ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE at am__append_163 = -ldl
 @ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE at am__append_164 = -ldl
 @ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE at am__append_165 = -ldl
- at ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE at am__append_166 = test/messenger/message_helper.h \
+ at ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE at am__append_166 = -ldl
+ at ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE at am__append_167 = -ldl
+ at ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE at am__append_168 = -ldl
+ at ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE at am__append_169 = -ldl
+ at ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE at am__append_170 = test/messenger/message_helper.h \
 @ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	test/messenger/simple_dispatcher.h \
 @ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	test/messenger/xio_dispatcher.h
- at ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@@LINUX_TRUE at am__append_167 = -ldl
- at ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@@LINUX_TRUE at am__append_168 = -ldl
- at ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE at am__append_169 = simple_server \
+ at ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@@LINUX_TRUE at am__append_171 = -ldl
+ at ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@@LINUX_TRUE at am__append_172 = -ldl
+ at ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE at am__append_173 = simple_server \
 @ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	simple_client xio_server \
 @ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	xio_client
- at ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@@LINUX_TRUE at am__append_170 = -ldl
- at ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@@LINUX_TRUE at am__append_171 = -ldl
- at COMPILER_HAS_VTA_TRUE@@ENABLE_CLIENT_TRUE at am__append_172 = -fno-var-tracking-assignments
- at COMPILER_HAS_VTA_TRUE@@ENABLE_CLIENT_TRUE at am__append_173 = -fno-var-tracking-assignments
- at ENABLE_CLIENT_TRUE@am__append_174 = ceph-dencoder
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at am__append_175 = libradostest.la \
+ at ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@@LINUX_TRUE at am__append_174 = -ldl
+ at ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@@LINUX_TRUE at am__append_175 = -ldl
+ at COMPILER_HAS_VTA_TRUE@@ENABLE_CLIENT_TRUE at am__append_176 = -fno-var-tracking-assignments
+ at COMPILER_HAS_VTA_TRUE@@ENABLE_CLIENT_TRUE at am__append_177 = -fno-var-tracking-assignments
+ at ENABLE_CLIENT_TRUE@am__append_178 = ceph-dencoder
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at am__append_179 = libradostest.la \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	librados_test_stub.la
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at am__append_176 = ceph_test_rados \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at am__append_180 = ceph_test_rados \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	ceph_test_mutate
- at ENABLE_CLIENT_TRUE@@WITH_BUILD_TESTS_TRUE@@WITH_RADOS_TRUE at am__append_177 = test_build_librados
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at am__append_178 =  \
+ at ENABLE_CLIENT_TRUE@@WITH_BUILD_TESTS_TRUE@@WITH_RADOS_TRUE at am__append_181 = test_build_librados
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at am__append_182 =  \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	ceph_smalliobench \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	ceph_omapbench \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	ceph_objectstore_bench
- at ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOS_TRUE at am__append_179 = ceph_kvstorebench \
+ at ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOS_TRUE at am__append_183 = ceph_kvstorebench \
 @ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOS_TRUE@	ceph_test_rados_list_parallel \
 @ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOS_TRUE@	ceph_test_rados_open_pools_parallel \
 @ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOS_TRUE@	ceph_test_rados_delete_pools_parallel \
 @ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOS_TRUE@	ceph_test_rados_watch_notify
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at am__append_180 =  \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at am__append_184 =  \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	unittest_librados \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	unittest_librados_config \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	unittest_journal
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at am__append_181 =  \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at am__append_185 =  \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	ceph_multi_stress_watch \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	ceph_test_cls_rbd \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	ceph_test_cls_refcount \
@@ -826,7 +867,7 @@ check_PROGRAMS = $(am__EXEEXT_57) $(am__EXEEXT_58) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	ceph_test_rados_api_tier \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	ceph_test_rados_api_lock \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	ceph_test_stress_watch
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at am__append_182 = \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at am__append_186 = \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	test/librados_test_stub/LibradosTestStub.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	test/librados_test_stub/MockTestMemIoCtxImpl.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	test/librados_test_stub/MockTestMemRadosClient.h \
@@ -837,57 +878,62 @@ check_PROGRAMS = $(am__EXEEXT_57) $(am__EXEEXT_58) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	test/librados_test_stub/TestMemIoCtxImpl.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	test/librados_test_stub/TestIoCtxImpl.h
 
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__append_183 = ceph_smalliobenchrbd \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__append_187 = ceph_smalliobenchrbd \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	ceph_test_librbd \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	ceph_test_librbd_api
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__append_184 = unittest_rbd_replay
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__append_185 = librbd_test.la
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__append_186 = unittest_librbd
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__append_187 = test/run-rbd-unit-tests.sh
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__append_188 = \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__append_188 = unittest_rbd_replay
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__append_189 = librbd_test.la
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__append_190 = unittest_librbd
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__append_191 = test/run-rbd-unit-tests.sh
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__append_192 = \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/test_fixture.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/test_mock_fixture.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/test_support.h \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/mock/MockAioImageRequestWQ.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/mock/MockContextWQ.h \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/mock/MockExclusiveLock.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/mock/MockImageCtx.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/mock/MockImageWatcher.h \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/mock/MockObjectMap.h
-
- at ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__append_189 = ceph_test_librbd_fsx
- at ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE at am__append_190 = libradosstripertest.la
- at ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE at am__append_191 = ceph_test_rados_striper_api_io \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/mock/MockJournal.h \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/mock/MockObjectMap.h \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/mock/MockReadahead.h \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/object_map/mock/MockInvalidateRequest.h
+
+ at ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__append_193 = ceph_test_librbd_fsx
+ at ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE at am__append_194 = libradosstripertest.la
+ at ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE at am__append_195 = ceph_test_rados_striper_api_io \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE@	ceph_test_rados_striper_api_aio \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE@	ceph_test_rados_striper_api_striping
- at ENABLE_CLIENT_TRUE@@WITH_BUILD_TESTS_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE at am__append_192 = test_build_libcephfs
- at ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE at am__append_193 = unittest_encoding \
+ at ENABLE_CLIENT_TRUE@@WITH_BUILD_TESTS_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE at am__append_196 = test_build_libcephfs
+ at ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE at am__append_197 = unittest_encoding \
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	unittest_base64 \
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	unittest_run_cmd \
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	unittest_simple_spin \
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	unittest_libcephfs_config
- at ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE at am__append_194 = test/libcephfs/flock.cc
- at ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE at am__append_195 = ceph_test_libcephfs \
+ at ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE at am__append_198 = test/libcephfs/flock.cc
+ at ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE at am__append_199 = ceph_test_libcephfs \
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	ceph_test_c_headers
- at CLANG_FALSE@@ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE at am__append_196 = -Werror -Wold-style-declaration
- at ENABLE_CLIENT_TRUE@@WITH_BUILD_TESTS_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE at am__append_197 = test_build_librgw
- at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE at am__append_198 = ceph_test_cors \
+ at CLANG_FALSE@@ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE at am__append_200 = -Werror -Wold-style-declaration
+ at ENABLE_CLIENT_TRUE@@WITH_BUILD_TESTS_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE at am__append_201 = test_build_librgw
+ at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE at am__append_202 = ceph_test_cors \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	ceph_test_rgw_manifest \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	ceph_test_rgw_obj \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	ceph_test_cls_rgw_meta \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	ceph_test_cls_rgw_log \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	ceph_test_cls_rgw_opstate \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	ceph_test_cls_rgw
- at ENABLE_SERVER_TRUE@am__append_199 = ceph_test_async_driver \
+ at ENABLE_SERVER_TRUE@am__append_203 = ceph_test_async_driver \
 @ENABLE_SERVER_TRUE@	ceph_test_msgr ceph_streamtest \
 @ENABLE_SERVER_TRUE@	ceph_test_trans ceph_test_mon_workloadgen \
 @ENABLE_SERVER_TRUE@	ceph_test_mon_msg ceph_perf_objectstore \
 @ENABLE_SERVER_TRUE@	ceph_perf_local ceph_perf_msgr_server \
 @ENABLE_SERVER_TRUE@	ceph_perf_msgr_client
- at ENABLE_SERVER_TRUE@am__append_200 = test/perf_helper.h
- at ENABLE_SERVER_TRUE@@LINUX_TRUE at am__append_201 =  \
+ at ENABLE_SERVER_TRUE@am__append_204 = test/perf_helper.h
+ at ENABLE_SERVER_TRUE@@LINUX_TRUE at am__append_205 =  \
 @ENABLE_SERVER_TRUE@@LINUX_TRUE@	ceph_test_objectstore \
 @ENABLE_SERVER_TRUE@@LINUX_TRUE@	ceph_test_keyvaluedb \
 @ENABLE_SERVER_TRUE@@LINUX_TRUE@	ceph_test_filestore
- at ENABLE_SERVER_TRUE@am__append_202 =  \
+ at ENABLE_SERVER_TRUE@am__append_206 =  \
 @ENABLE_SERVER_TRUE@	ceph_test_objectstore_workloadgen \
 @ENABLE_SERVER_TRUE@	ceph_test_filestore_idempotent \
 @ENABLE_SERVER_TRUE@	ceph_test_filestore_idempotent_sequence \
@@ -895,63 +941,65 @@ check_PROGRAMS = $(am__EXEEXT_57) $(am__EXEEXT_58) \
 @ENABLE_SERVER_TRUE@	ceph_test_object_map \
 @ENABLE_SERVER_TRUE@	ceph_test_keyvaluedb_atomicity \
 @ENABLE_SERVER_TRUE@	ceph_test_keyvaluedb_iterators
- at ENABLE_CLIENT_TRUE@@ENABLE_SERVER_TRUE@@WITH_RADOS_TRUE at am__append_203 = ceph_smalliobenchfs \
+ at ENABLE_CLIENT_TRUE@@ENABLE_SERVER_TRUE@@WITH_RADOS_TRUE at am__append_207 = ceph_smalliobenchfs \
 @ENABLE_CLIENT_TRUE@@ENABLE_SERVER_TRUE@@WITH_RADOS_TRUE@	ceph_smalliobenchdumb \
 @ENABLE_CLIENT_TRUE@@ENABLE_SERVER_TRUE@@WITH_RADOS_TRUE@	ceph_tpbench
- at ENABLE_SERVER_TRUE@@WITH_MON_TRUE at am__append_204 = ceph_test_keys
- at ENABLE_SERVER_TRUE@@WITH_MON_TRUE at am__append_205 = get_command_descriptions
- at ENABLE_SERVER_TRUE@@WITH_MON_TRUE at am__append_206 =  \
+ at ENABLE_SERVER_TRUE@@WITH_MON_TRUE at am__append_208 = ceph_test_keys
+ at ENABLE_SERVER_TRUE@@WITH_MON_TRUE at am__append_209 = get_command_descriptions
+ at ENABLE_SERVER_TRUE@@WITH_MON_TRUE at am__append_210 =  \
 @ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	unittest_mon_moncap \
 @ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	unittest_mon_pgmap
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am__append_207 =  \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am__append_211 =  \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	unittest_ecbackend \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	unittest_osdscrub \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	unittest_pglog \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	unittest_hitset \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	unittest_osd_osdcap \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	unittest_pageset
- at ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE at am__append_208 = -ldl
- at ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE at am__append_209 = -ldl
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am__append_210 = ceph_test_snap_mapper
- at ENABLE_SERVER_TRUE@@WITH_SLIBROCKSDB_TRUE at am__append_211 = unittest_rocksdb_option_static
- at ENABLE_SERVER_TRUE@@WITH_DLIBROCKSDB_TRUE at am__append_212 = unittest_rocksdb_option
- at ENABLE_SERVER_TRUE@am__append_213 = unittest_chain_xattr \
+ at ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE at am__append_212 = -ldl
+ at ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE at am__append_213 = -ldl
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am__append_214 = ceph_test_snap_mapper
+ at ENABLE_SERVER_TRUE@@WITH_SLIBROCKSDB_TRUE at am__append_215 = unittest_rocksdb_option_static
+ at ENABLE_SERVER_TRUE@@WITH_DLIBROCKSDB_TRUE at am__append_216 = unittest_rocksdb_option
+ at ENABLE_SERVER_TRUE@am__append_217 = unittest_chain_xattr \
 @ENABLE_SERVER_TRUE@	unittest_lfnindex
- at ENABLE_SERVER_TRUE@@WITH_MDS_TRUE at am__append_214 = unittest_mds_authcap
- at WITH_BUILD_TESTS_TRUE@am__append_215 = test_build_libcommon
- at LINUX_TRUE@am__append_216 = libsystest.la
- at SOLARIS_TRUE@am__append_217 = \
+ at ENABLE_SERVER_TRUE@@WITH_MDS_TRUE at am__append_218 = unittest_mds_authcap
+ at WITH_BUILD_TESTS_TRUE@am__append_219 = test_build_libcommon
+ at LINUX_TRUE@am__append_220 = libsystest.la
+ at SOLARIS_TRUE@am__append_221 = \
 @SOLARIS_TRUE@	-lsocket -lnsl
 
- at LINUX_TRUE@am__append_218 = unittest_blkdev
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at am__append_219 =  \
+ at LINUX_TRUE@am__append_222 = unittest_blkdev
+ at LINUX_TRUE@am__append_223 = ceph_test_get_blkdev_size
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at am__append_224 =  \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	ceph_scratchtool \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	ceph_scratchtoolpp \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	ceph_radosacl
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at am__append_220 = rados
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__append_221 = \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at am__append_225 = rados
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__append_226 = \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/ArgumentTypes.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/IndentStream.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/OptionPrinter.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/Shell.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/Utils.h
 
- at ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__append_222 = rbd
- at ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE at am__append_223 = ceph-client-debug
- at ENABLE_SERVER_TRUE@am__append_224 = ceph-osdomap-tool \
+ at ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__append_227 = rbd \
+ at ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	rbd-nbd
+ at ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE at am__append_228 = ceph-client-debug
+ at ENABLE_SERVER_TRUE@am__append_229 = ceph-osdomap-tool \
 @ENABLE_SERVER_TRUE@	ceph-monstore-tool ceph-kvstore-tool
- at ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE at am__append_225 = -ldl
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am__append_226 = ceph-objectstore-tool
- at ENABLE_CLIENT_TRUE@@ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@@WITH_RADOS_TRUE at am__append_227 = cephfs-journal-tool \
+ at ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE at am__append_230 = -ldl
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am__append_231 = ceph-objectstore-tool
+ at ENABLE_CLIENT_TRUE@@ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@@WITH_RADOS_TRUE at am__append_232 = cephfs-journal-tool \
 @ENABLE_CLIENT_TRUE@@ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@@WITH_RADOS_TRUE@	cephfs-table-tool \
 @ENABLE_CLIENT_TRUE@@ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@@WITH_RADOS_TRUE@	cephfs-data-scan
- at WITH_LTTNG_TRUE@am__append_228 = \
+ at WITH_LTTNG_TRUE@am__append_233 = \
 @WITH_LTTNG_TRUE@	libosd_tp.la \
 @WITH_LTTNG_TRUE@	libos_tp.la \
 @WITH_LTTNG_TRUE@	librados_tp.la \
 @WITH_LTTNG_TRUE@	librbd_tp.la
 
- at WITH_LTTNG_TRUE@am__append_229 = \
+ at WITH_LTTNG_TRUE@am__append_234 = \
 @WITH_LTTNG_TRUE@	tracing/librados.h \
 @WITH_LTTNG_TRUE@	tracing/librbd.h \
 @WITH_LTTNG_TRUE@	tracing/objectstore.h \
@@ -959,54 +1007,56 @@ check_PROGRAMS = $(am__EXEEXT_57) $(am__EXEEXT_58) \
 @WITH_LTTNG_TRUE@	tracing/osd.h \
 @WITH_LTTNG_TRUE@	tracing/pg.h
 
-TESTS = $(am__EXEEXT_57) $(check_SCRIPTS)
- at ENABLE_CLIENT_TRUE@am__append_230 = \
+ at ENABLE_CLIENT_TRUE@@WITH_CYTHON_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__append_235 = pybind-all
+ at ENABLE_CLIENT_TRUE@@WITH_CYTHON_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__append_236 = pybind-clean
+ at ENABLE_CLIENT_TRUE@@WITH_CYTHON_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__append_237 = pybind-install-exec
+TESTS = $(am__EXEEXT_58) $(check_SCRIPTS)
+ at ENABLE_CLIENT_TRUE@am__append_238 = \
 @ENABLE_CLIENT_TRUE@	pybind/ceph_argparse.py \
 @ENABLE_CLIENT_TRUE@	pybind/ceph_daemon.py
 
- at ENABLE_CLIENT_TRUE@am__append_231 = ceph-syn
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at am__append_232 = \
+ at ENABLE_CLIENT_TRUE@am__append_239 = ceph-syn
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at am__append_240 = \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(srcdir)/bash_completion/rados \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(srcdir)/bash_completion/radosgw-admin
 
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at am__append_233 = pybind/rados.py
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at am__append_234 = librados-config
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__append_235 = \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at am__append_241 = pybind/rados.py
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at am__append_242 = librados-config
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__append_243 = \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(srcdir)/bash_completion/rbd
 
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__append_236 = \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__append_244 = \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	ceph-rbdnamer \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	rbd-replay-many \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@        rbdmap
 
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__append_237 = pybind/rbd.py
- at ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__append_238 = libkrbd.la
- at ENABLE_CLIENT_TRUE@@WITH_FUSE_TRUE@@WITH_RADOS_TRUE at am__append_239 = ceph-fuse
- at ENABLE_CLIENT_TRUE@@WITH_FUSE_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__append_240 = rbd-fuse
- at ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE at am__append_241 = cephfs
- at ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE at am__append_242 = pybind/cephfs.py
- at ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE at am__append_243 = -Xcompiler -Xlinker -Xcompiler '--exclude-libs=libcommon.a'
- at ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE at am__append_244 = libcephfs.la
- at ENABLE_CEPHFS_JAVA_TRUE@@ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE at am__append_245 = libcephfs_jni.la
- at ENABLE_SERVER_TRUE@am__append_246 = ceph-run ceph-rest-api \
+ at ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__append_245 = libkrbd.la
+ at ENABLE_CLIENT_TRUE@@WITH_FUSE_TRUE@@WITH_RADOS_TRUE at am__append_246 = ceph-fuse
+ at ENABLE_CLIENT_TRUE@@WITH_FUSE_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__append_247 = rbd-fuse
+ at ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE at am__append_248 = cephfs
+ at ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE at am__append_249 = pybind/cephfs.py
+ at ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE at am__append_250 = -Xcompiler -Xlinker -Xcompiler '--exclude-libs=libcommon.a'
+ at ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE at am__append_251 = libcephfs.la
+ at ENABLE_CEPHFS_JAVA_TRUE@@ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE at am__append_252 = libcephfs_jni.la
+ at ENABLE_SERVER_TRUE@am__append_253 = ceph-run ceph-rest-api \
 @ENABLE_SERVER_TRUE@	ceph-debugpack ceph-crush-location \
 @ENABLE_SERVER_TRUE@	ceph-coverage
- at ENABLE_SERVER_TRUE@am__append_247 = pybind/ceph_rest_api.py
- at ENABLE_SERVER_TRUE@am__append_248 = ceph-coverage init-ceph
- at ENABLE_SERVER_TRUE@am__append_249 = init-ceph
- at ENABLE_SERVER_TRUE@@LINUX_TRUE at am__append_250 = mount.ceph
- at ENABLE_SERVER_TRUE@am__append_251 = mount.fuse.ceph
- at ENABLE_SERVER_TRUE@@WITH_MON_TRUE at am__append_252 = ceph-mon
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am__append_253 = \
+ at ENABLE_SERVER_TRUE@am__append_254 = pybind/ceph_rest_api.py
+ at ENABLE_SERVER_TRUE@am__append_255 = ceph-coverage init-ceph
+ at ENABLE_SERVER_TRUE@am__append_256 = init-ceph
+ at ENABLE_SERVER_TRUE@@LINUX_TRUE at am__append_257 = mount.ceph
+ at ENABLE_SERVER_TRUE@am__append_258 = mount.fuse.ceph
+ at ENABLE_SERVER_TRUE@@WITH_MON_TRUE at am__append_259 = ceph-mon
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am__append_260 = \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	ceph-disk \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	ceph-disk-udev
 
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am__append_254 = \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am__append_261 = \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	ceph-clsinfo
 
- at ENABLE_SERVER_TRUE@@WITH_LTTNG_TRUE@@WITH_OSD_TRUE at am__append_255 = $(LIBOSD_TP)
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am__append_256 = ceph-osd
- at ENABLE_SERVER_TRUE@@WITH_MDS_TRUE at am__append_257 = ceph-mds
+ at ENABLE_SERVER_TRUE@@WITH_LTTNG_TRUE@@WITH_OSD_TRUE at am__append_262 = $(LIBOSD_TP)
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am__append_263 = ceph-osd
+ at ENABLE_SERVER_TRUE@@WITH_MDS_TRUE at am__append_264 = ceph-mds
 subdir = src
 ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
 am__aclocal_m4_deps = $(top_srcdir)/m4/ac_check_classpath.m4 \
@@ -1091,7 +1141,7 @@ libkv_a_AR = $(AR) $(ARFLAGS)
 am__DEPENDENCIES_1 =
 @ENABLE_SERVER_TRUE@@WITH_KINETIC_TRUE at am__DEPENDENCIES_2 =  \
 @ENABLE_SERVER_TRUE@@WITH_KINETIC_TRUE@	libcrypto.a
- at ENABLE_SERVER_TRUE@libkv_a_DEPENDENCIES = $(am__append_30) \
+ at ENABLE_SERVER_TRUE@libkv_a_DEPENDENCIES = $(am__append_33) \
 @ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_1) \
 @ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_2)
 am__libkv_a_SOURCES_DIST = kv/KeyValueDB.cc kv/LevelDBStore.cc \
@@ -1456,9 +1506,11 @@ libcls_numops_client_la_OBJECTS =  \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at libcls_rbd_la_DEPENDENCIES =  \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_1) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_3)
-am__libcls_rbd_la_SOURCES_DIST = cls/rbd/cls_rbd.cc
+am__libcls_rbd_la_SOURCES_DIST = cls/rbd/cls_rbd.cc \
+	cls/rbd/cls_rbd_types.cc
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am_libcls_rbd_la_OBJECTS =  \
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	cls/rbd/cls_rbd.lo
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	cls/rbd/cls_rbd.lo \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	cls/rbd/cls_rbd_types.lo
 libcls_rbd_la_OBJECTS = $(am_libcls_rbd_la_OBJECTS)
 libcls_rbd_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
@@ -1467,9 +1519,11 @@ libcls_rbd_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am_libcls_rbd_la_rpath = -rpath \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(radoslibdir)
 libcls_rbd_client_la_LIBADD =
-am__libcls_rbd_client_la_SOURCES_DIST = cls/rbd/cls_rbd_client.cc
+am__libcls_rbd_client_la_SOURCES_DIST = cls/rbd/cls_rbd_client.cc \
+	cls/rbd/cls_rbd_types.cc
 @ENABLE_CLIENT_TRUE at am_libcls_rbd_client_la_OBJECTS =  \
- at ENABLE_CLIENT_TRUE@	cls/rbd/cls_rbd_client.lo
+ at ENABLE_CLIENT_TRUE@	cls/rbd/cls_rbd_client.lo \
+ at ENABLE_CLIENT_TRUE@	cls/rbd/cls_rbd_types.lo
 libcls_rbd_client_la_OBJECTS = $(am_libcls_rbd_client_la_OBJECTS)
 @ENABLE_CLIENT_TRUE at am_libcls_rbd_client_la_rpath =
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at libcls_refcount_la_DEPENDENCIES =  \
@@ -1590,7 +1644,7 @@ libcls_version_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am_libcls_version_la_rpath =  \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	-rpath $(radoslibdir)
 am__DEPENDENCIES_4 = libcommon_internal.la libcommon_crc.la \
-	$(am__append_97) $(LIBERASURE_CODE) $(LIBMSG) $(LIBAUTH) \
+	$(am__append_101) $(LIBERASURE_CODE) $(LIBMSG) $(LIBAUTH) \
 	$(LIBCRUSH) $(LIBJSON_SPIRIT) $(LIBLOG) $(LIBARCH) \
 	$(am__DEPENDENCIES_1) $(am__DEPENDENCIES_1)
 libcommon_la_DEPENDENCIES = $(am__DEPENDENCIES_4)
@@ -1648,20 +1702,22 @@ am__libcommon_internal_la_SOURCES_DIST = ceph_ver.c \
 	common/ceph_frag.cc common/addr_parsing.c common/hobject.cc \
 	common/bloom_filter.cc common/module.c common/Readahead.cc \
 	common/Cycles.cc common/ContextCompletion.cc \
-	common/TracepointProvider.cc common/xattr.c common/ipaddr.cc \
-	common/ceph_json.cc common/util.cc common/pick_address.cc \
-	common/linux_version.c common/solaris_errno.cc \
-	common/blkdev.cc common/address_helper.cc mon/MonCap.cc \
-	mon/MonClient.cc mon/MonMap.cc osd/OSDMap.cc osd/osd_types.cc \
-	osd/ECMsgTypes.cc osd/HitSet.cc mds/MDSMap.cc \
-	mds/inode_backtrace.cc mds/mdstypes.cc mds/flock.cc
+	common/TracepointProvider.cc common/PluginRegistry.cc \
+	common/xattr.c common/ipaddr.cc common/ceph_json.cc \
+	common/util.cc common/pick_address.cc common/linux_version.c \
+	common/solaris_errno.cc common/aix_errno.cc common/blkdev.cc \
+	common/address_helper.cc mon/MonCap.cc mon/MonClient.cc \
+	mon/MonMap.cc osd/OSDMap.cc osd/osd_types.cc osd/ECMsgTypes.cc \
+	osd/HitSet.cc mds/MDSMap.cc mds/inode_backtrace.cc \
+	mds/mdstypes.cc mds/flock.cc
 @ENABLE_SERVER_TRUE at am__objects_10 = common/xattr.lo common/ipaddr.lo \
 @ENABLE_SERVER_TRUE@	common/ceph_json.lo common/util.lo \
 @ENABLE_SERVER_TRUE@	common/pick_address.lo
 @LINUX_TRUE at am__objects_11 = common/linux_version.lo
 @SOLARIS_TRUE at am__objects_12 = common/solaris_errno.lo
- at LINUX_TRUE@@WITH_RBD_TRUE at am__objects_13 = common/blkdev.lo
- at ENABLE_XIO_TRUE@am__objects_14 = common/address_helper.lo
+ at AIX_TRUE@am__objects_13 = common/aix_errno.lo
+ at LINUX_TRUE@am__objects_14 = common/blkdev.lo
+ at ENABLE_XIO_TRUE@am__objects_15 = common/address_helper.lo
 am_libcommon_internal_la_OBJECTS = ceph_ver.lo common/DecayCounter.lo \
 	common/LogClient.lo common/LogEntry.lo \
 	common/PrebufferedStreambuf.lo common/SloppyCRCMap.lo \
@@ -1688,12 +1744,12 @@ am_libcommon_internal_la_OBJECTS = ceph_ver.lo common/DecayCounter.lo \
 	common/addr_parsing.lo common/hobject.lo \
 	common/bloom_filter.lo common/module.lo common/Readahead.lo \
 	common/Cycles.lo common/ContextCompletion.lo \
-	common/TracepointProvider.lo $(am__objects_10) \
-	$(am__objects_11) $(am__objects_12) $(am__objects_13) \
-	$(am__objects_14) mon/MonCap.lo mon/MonClient.lo mon/MonMap.lo \
-	osd/OSDMap.lo osd/osd_types.lo osd/ECMsgTypes.lo osd/HitSet.lo \
-	mds/MDSMap.lo mds/inode_backtrace.lo mds/mdstypes.lo \
-	mds/flock.lo
+	common/TracepointProvider.lo common/PluginRegistry.lo \
+	$(am__objects_10) $(am__objects_11) $(am__objects_12) \
+	$(am__objects_13) $(am__objects_14) $(am__objects_15) \
+	mon/MonCap.lo mon/MonClient.lo mon/MonMap.lo osd/OSDMap.lo \
+	osd/osd_types.lo osd/ECMsgTypes.lo osd/HitSet.lo mds/MDSMap.lo \
+	mds/inode_backtrace.lo mds/mdstypes.lo mds/flock.lo
 libcommon_internal_la_OBJECTS = $(am_libcommon_internal_la_OBJECTS)
 libcompressor_la_DEPENDENCIES = $(LIBCOMMON)
 am_libcompressor_la_OBJECTS = compressor/Compressor.lo \
@@ -1811,7 +1867,7 @@ am__libec_isa_la_SOURCES_DIST = erasure-code/ErasureCode.cc \
 	erasure-code/isa/ErasureCodeIsaTableCache.cc \
 	erasure-code/isa/ErasureCodePluginIsa.cc \
 	erasure-code/isa/xor_op.cc
- at WITH_BETTER_YASM_ELF64_TRUE@am__objects_15 = erasure-code/libec_isa_la-ErasureCode.lo \
+ at WITH_BETTER_YASM_ELF64_TRUE@am__objects_16 = erasure-code/libec_isa_la-ErasureCode.lo \
 @WITH_BETTER_YASM_ELF64_TRUE@	erasure-code/isa/isa-l/erasure_code/libec_isa_la-ec_base.lo \
 @WITH_BETTER_YASM_ELF64_TRUE@	erasure-code/isa/isa-l/erasure_code/libec_isa_la-ec_highlevel_func.lo \
 @WITH_BETTER_YASM_ELF64_TRUE@	erasure-code/isa/isa-l/erasure_code/libec_isa_la-ec_multibinary.asm.lo \
@@ -1858,7 +1914,7 @@ am__libec_isa_la_SOURCES_DIST = erasure-code/ErasureCode.cc \
 @WITH_BETTER_YASM_ELF64_TRUE@	erasure-code/isa/libec_isa_la-ErasureCodePluginIsa.lo \
 @WITH_BETTER_YASM_ELF64_TRUE@	erasure-code/isa/libec_isa_la-xor_op.lo
 @WITH_BETTER_YASM_ELF64_TRUE at am_libec_isa_la_OBJECTS =  \
- at WITH_BETTER_YASM_ELF64_TRUE@	$(am__objects_15)
+ at WITH_BETTER_YASM_ELF64_TRUE@	$(am__objects_16)
 libec_isa_la_OBJECTS = $(am_libec_isa_la_OBJECTS)
 libec_isa_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(libec_isa_la_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link \
@@ -1876,7 +1932,7 @@ libec_jerasure_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(libec_jerasure_la_LDFLAGS) $(LDFLAGS) -o $@
 libec_jerasure_generic_la_DEPENDENCIES = $(LIBCRUSH) \
 	$(am__DEPENDENCIES_1) $(am__DEPENDENCIES_3)
-am__objects_16 =  \
+am__objects_17 =  \
 	erasure-code/libec_jerasure_generic_la-ErasureCode.lo \
 	erasure-code/jerasure/jerasure/src/libec_jerasure_generic_la-cauchy.lo \
 	erasure-code/jerasure/jerasure/src/libec_jerasure_generic_la-galois.lo \
@@ -1896,7 +1952,7 @@ am__objects_16 =  \
 	erasure-code/jerasure/gf-complete/src/libec_jerasure_generic_la-gf_w8.lo \
 	erasure-code/jerasure/libec_jerasure_generic_la-ErasureCodePluginJerasure.lo \
 	erasure-code/jerasure/libec_jerasure_generic_la-ErasureCodeJerasure.lo
-am_libec_jerasure_generic_la_OBJECTS = $(am__objects_16)
+am_libec_jerasure_generic_la_OBJECTS = $(am__objects_17)
 libec_jerasure_generic_la_OBJECTS =  \
 	$(am_libec_jerasure_generic_la_OBJECTS)
 libec_jerasure_generic_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
@@ -1905,7 +1961,7 @@ libec_jerasure_generic_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(libec_jerasure_generic_la_LDFLAGS) $(LDFLAGS) -o $@
 libec_jerasure_neon_la_DEPENDENCIES = $(LIBCRUSH) \
 	$(am__DEPENDENCIES_1) $(am__DEPENDENCIES_3)
-am__objects_17 = erasure-code/libec_jerasure_neon_la-ErasureCode.lo \
+am__objects_18 = erasure-code/libec_jerasure_neon_la-ErasureCode.lo \
 	erasure-code/jerasure/jerasure/src/libec_jerasure_neon_la-cauchy.lo \
 	erasure-code/jerasure/jerasure/src/libec_jerasure_neon_la-galois.lo \
 	erasure-code/jerasure/jerasure/src/libec_jerasure_neon_la-jerasure.lo \
@@ -1924,7 +1980,7 @@ am__objects_17 = erasure-code/libec_jerasure_neon_la-ErasureCode.lo \
 	erasure-code/jerasure/gf-complete/src/libec_jerasure_neon_la-gf_w8.lo \
 	erasure-code/jerasure/libec_jerasure_neon_la-ErasureCodePluginJerasure.lo \
 	erasure-code/jerasure/libec_jerasure_neon_la-ErasureCodeJerasure.lo
-am_libec_jerasure_neon_la_OBJECTS = $(am__objects_17) \
+am_libec_jerasure_neon_la_OBJECTS = $(am__objects_18) \
 	erasure-code/jerasure/gf-complete/src/neon/libec_jerasure_neon_la-gf_w4_neon.lo \
 	erasure-code/jerasure/gf-complete/src/neon/libec_jerasure_neon_la-gf_w8_neon.lo \
 	erasure-code/jerasure/gf-complete/src/neon/libec_jerasure_neon_la-gf_w16_neon.lo \
@@ -1939,7 +1995,7 @@ libec_jerasure_neon_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 @HAVE_NEON_TRUE@	$(erasure_codelibdir)
 libec_jerasure_sse3_la_DEPENDENCIES = $(LIBCRUSH) \
 	$(am__DEPENDENCIES_1) $(am__DEPENDENCIES_3)
-am__objects_18 = erasure-code/libec_jerasure_sse3_la-ErasureCode.lo \
+am__objects_19 = erasure-code/libec_jerasure_sse3_la-ErasureCode.lo \
 	erasure-code/jerasure/jerasure/src/libec_jerasure_sse3_la-cauchy.lo \
 	erasure-code/jerasure/jerasure/src/libec_jerasure_sse3_la-galois.lo \
 	erasure-code/jerasure/jerasure/src/libec_jerasure_sse3_la-jerasure.lo \
@@ -1958,7 +2014,7 @@ am__objects_18 = erasure-code/libec_jerasure_sse3_la-ErasureCode.lo \
 	erasure-code/jerasure/gf-complete/src/libec_jerasure_sse3_la-gf_w8.lo \
 	erasure-code/jerasure/libec_jerasure_sse3_la-ErasureCodePluginJerasure.lo \
 	erasure-code/jerasure/libec_jerasure_sse3_la-ErasureCodeJerasure.lo
-am_libec_jerasure_sse3_la_OBJECTS = $(am__objects_18)
+am_libec_jerasure_sse3_la_OBJECTS = $(am__objects_19)
 libec_jerasure_sse3_la_OBJECTS = $(am_libec_jerasure_sse3_la_OBJECTS)
 libec_jerasure_sse3_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
@@ -1968,7 +2024,7 @@ libec_jerasure_sse3_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 @HAVE_SSSE3_TRUE@	$(erasure_codelibdir)
 libec_jerasure_sse4_la_DEPENDENCIES = $(LIBCRUSH) \
 	$(am__DEPENDENCIES_1) $(am__DEPENDENCIES_3)
-am__objects_19 = erasure-code/libec_jerasure_sse4_la-ErasureCode.lo \
+am__objects_20 = erasure-code/libec_jerasure_sse4_la-ErasureCode.lo \
 	erasure-code/jerasure/jerasure/src/libec_jerasure_sse4_la-cauchy.lo \
 	erasure-code/jerasure/jerasure/src/libec_jerasure_sse4_la-galois.lo \
 	erasure-code/jerasure/jerasure/src/libec_jerasure_sse4_la-jerasure.lo \
@@ -1987,7 +2043,7 @@ am__objects_19 = erasure-code/libec_jerasure_sse4_la-ErasureCode.lo \
 	erasure-code/jerasure/gf-complete/src/libec_jerasure_sse4_la-gf_w8.lo \
 	erasure-code/jerasure/libec_jerasure_sse4_la-ErasureCodePluginJerasure.lo \
 	erasure-code/jerasure/libec_jerasure_sse4_la-ErasureCodeJerasure.lo
-am_libec_jerasure_sse4_la_OBJECTS = $(am__objects_19)
+am_libec_jerasure_sse4_la_OBJECTS = $(am__objects_20)
 libec_jerasure_sse4_la_OBJECTS = $(am_libec_jerasure_sse4_la_OBJECTS)
 libec_jerasure_sse4_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
@@ -1997,10 +2053,10 @@ libec_jerasure_sse4_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 @HAVE_SSE4_PCLMUL_TRUE@	$(erasure_codelibdir)
 libec_lrc_la_DEPENDENCIES = $(LIBCRUSH) $(am__DEPENDENCIES_1) \
 	$(LIBJSON_SPIRIT)
-am__objects_20 = erasure-code/libec_lrc_la-ErasureCode.lo \
+am__objects_21 = erasure-code/libec_lrc_la-ErasureCode.lo \
 	erasure-code/lrc/libec_lrc_la-ErasureCodePluginLrc.lo \
 	erasure-code/lrc/libec_lrc_la-ErasureCodeLrc.lo
-am_libec_lrc_la_OBJECTS = $(am__objects_20) \
+am_libec_lrc_la_OBJECTS = $(am__objects_21) \
 	common/libec_lrc_la-str_map.lo
 libec_lrc_la_OBJECTS = $(am_libec_lrc_la_OBJECTS)
 libec_lrc_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) \
@@ -2046,7 +2102,7 @@ libec_shec_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(LDFLAGS) -o $@
 libec_shec_generic_la_DEPENDENCIES = $(LIBCRUSH) $(am__DEPENDENCIES_1) \
 	$(am__DEPENDENCIES_3)
-am__objects_21 = erasure-code/libec_shec_generic_la-ErasureCode.lo \
+am__objects_22 = erasure-code/libec_shec_generic_la-ErasureCode.lo \
 	erasure-code/shec/libec_shec_generic_la-ErasureCodePluginShec.lo \
 	erasure-code/shec/libec_shec_generic_la-ErasureCodeShec.lo \
 	erasure-code/shec/libec_shec_generic_la-ErasureCodeShecTableCache.lo \
@@ -2067,7 +2123,7 @@ am__objects_21 = erasure-code/libec_shec_generic_la-ErasureCode.lo \
 	erasure-code/jerasure/gf-complete/src/libec_shec_generic_la-gf_w4.lo \
 	erasure-code/jerasure/gf-complete/src/libec_shec_generic_la-gf_rand.lo \
 	erasure-code/jerasure/gf-complete/src/libec_shec_generic_la-gf_w8.lo
-am_libec_shec_generic_la_OBJECTS = $(am__objects_21)
+am_libec_shec_generic_la_OBJECTS = $(am__objects_22)
 libec_shec_generic_la_OBJECTS = $(am_libec_shec_generic_la_OBJECTS)
 libec_shec_generic_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
@@ -2075,7 +2131,7 @@ libec_shec_generic_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(libec_shec_generic_la_LDFLAGS) $(LDFLAGS) -o $@
 libec_shec_neon_la_DEPENDENCIES = $(LIBCRUSH) $(am__DEPENDENCIES_1) \
 	$(am__DEPENDENCIES_3)
-am__objects_22 = erasure-code/libec_shec_neon_la-ErasureCode.lo \
+am__objects_23 = erasure-code/libec_shec_neon_la-ErasureCode.lo \
 	erasure-code/shec/libec_shec_neon_la-ErasureCodePluginShec.lo \
 	erasure-code/shec/libec_shec_neon_la-ErasureCodeShec.lo \
 	erasure-code/shec/libec_shec_neon_la-ErasureCodeShecTableCache.lo \
@@ -2096,7 +2152,7 @@ am__objects_22 = erasure-code/libec_shec_neon_la-ErasureCode.lo \
 	erasure-code/jerasure/gf-complete/src/libec_shec_neon_la-gf_w4.lo \
 	erasure-code/jerasure/gf-complete/src/libec_shec_neon_la-gf_rand.lo \
 	erasure-code/jerasure/gf-complete/src/libec_shec_neon_la-gf_w8.lo
-am_libec_shec_neon_la_OBJECTS = $(am__objects_22) \
+am_libec_shec_neon_la_OBJECTS = $(am__objects_23) \
 	erasure-code/jerasure/gf-complete/src/neon/libec_shec_neon_la-gf_w4_neon.lo \
 	erasure-code/jerasure/gf-complete/src/neon/libec_shec_neon_la-gf_w8_neon.lo \
 	erasure-code/jerasure/gf-complete/src/neon/libec_shec_neon_la-gf_w16_neon.lo \
@@ -2111,7 +2167,7 @@ libec_shec_neon_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 @HAVE_NEON_TRUE@	$(erasure_codelibdir)
 libec_shec_sse3_la_DEPENDENCIES = $(LIBCRUSH) $(am__DEPENDENCIES_1) \
 	$(am__DEPENDENCIES_3)
-am__objects_23 = erasure-code/libec_shec_sse3_la-ErasureCode.lo \
+am__objects_24 = erasure-code/libec_shec_sse3_la-ErasureCode.lo \
 	erasure-code/shec/libec_shec_sse3_la-ErasureCodePluginShec.lo \
 	erasure-code/shec/libec_shec_sse3_la-ErasureCodeShec.lo \
 	erasure-code/shec/libec_shec_sse3_la-ErasureCodeShecTableCache.lo \
@@ -2132,7 +2188,7 @@ am__objects_23 = erasure-code/libec_shec_sse3_la-ErasureCode.lo \
 	erasure-code/jerasure/gf-complete/src/libec_shec_sse3_la-gf_w4.lo \
 	erasure-code/jerasure/gf-complete/src/libec_shec_sse3_la-gf_rand.lo \
 	erasure-code/jerasure/gf-complete/src/libec_shec_sse3_la-gf_w8.lo
-am_libec_shec_sse3_la_OBJECTS = $(am__objects_23)
+am_libec_shec_sse3_la_OBJECTS = $(am__objects_24)
 libec_shec_sse3_la_OBJECTS = $(am_libec_shec_sse3_la_OBJECTS)
 libec_shec_sse3_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
@@ -2142,7 +2198,7 @@ libec_shec_sse3_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 @HAVE_SSSE3_TRUE@	$(erasure_codelibdir)
 libec_shec_sse4_la_DEPENDENCIES = $(LIBCRUSH) $(am__DEPENDENCIES_1) \
 	$(am__DEPENDENCIES_3)
-am__objects_24 = erasure-code/libec_shec_sse4_la-ErasureCode.lo \
+am__objects_25 = erasure-code/libec_shec_sse4_la-ErasureCode.lo \
 	erasure-code/shec/libec_shec_sse4_la-ErasureCodePluginShec.lo \
 	erasure-code/shec/libec_shec_sse4_la-ErasureCodeShec.lo \
 	erasure-code/shec/libec_shec_sse4_la-ErasureCodeShecTableCache.lo \
@@ -2163,7 +2219,7 @@ am__objects_24 = erasure-code/libec_shec_sse4_la-ErasureCode.lo \
 	erasure-code/jerasure/gf-complete/src/libec_shec_sse4_la-gf_w4.lo \
 	erasure-code/jerasure/gf-complete/src/libec_shec_sse4_la-gf_rand.lo \
 	erasure-code/jerasure/gf-complete/src/libec_shec_sse4_la-gf_w8.lo
-am_libec_shec_sse4_la_OBJECTS = $(am__objects_24)
+am_libec_shec_sse4_la_OBJECTS = $(am__objects_25)
 libec_shec_sse4_la_OBJECTS = $(am_libec_shec_sse4_la_OBJECTS)
 libec_shec_sse4_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
@@ -2346,7 +2402,7 @@ am__libmds_la_SOURCES_DIST = mds/Capability.cc mds/MDSDaemon.cc \
 	mds/SnapRealm.cc mds/SnapServer.cc mds/snap.cc \
 	mds/SessionMap.cc mds/MDSContext.cc mds/MDSAuthCaps.cc \
 	mds/MDLog.cc
-am__objects_25 = mds/Capability.lo mds/MDSDaemon.lo mds/MDSRank.lo \
+am__objects_26 = mds/Capability.lo mds/MDSDaemon.lo mds/MDSRank.lo \
 	mds/Beacon.lo mds/locks.lo mds/journal.lo mds/Server.lo \
 	mds/Mutation.lo mds/MDCache.lo mds/RecoveryQueue.lo \
 	mds/StrayManager.lo mds/Locker.lo mds/Migrator.lo \
@@ -2358,7 +2414,7 @@ am__objects_25 = mds/Capability.lo mds/MDSDaemon.lo mds/MDSRank.lo \
 	mds/SessionMap.lo mds/MDSContext.lo mds/MDSAuthCaps.lo \
 	mds/MDLog.lo
 @ENABLE_SERVER_TRUE@@WITH_MDS_TRUE at am_libmds_la_OBJECTS =  \
- at ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@	$(am__objects_25)
+ at ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@	$(am__objects_26)
 libmds_la_OBJECTS = $(am_libmds_la_OBJECTS)
 @ENABLE_SERVER_TRUE@@WITH_MDS_TRUE at am_libmds_la_rpath =
 libmon_types_la_LIBADD =
@@ -2376,11 +2432,11 @@ am__libmsg_la_SOURCES_DIST = msg/Message.cc msg/Messenger.cc \
 	msg/async/EventKqueue.h msg/xio/QueueStrategy.cc \
 	msg/xio/XioConnection.cc msg/xio/XioMessenger.cc \
 	msg/xio/XioMsg.cc msg/xio/XioPortal.cc msg/xio/XioPool.cc
- at LINUX_TRUE@am__objects_26 = msg/async/EventEpoll.lo
- at DARWIN_TRUE@am__objects_27 = msg/async/EventKqueue.lo
- at FREEBSD_TRUE@am__objects_28 = msg/async/EventKqueue.lo
-am__objects_29 =
- at ENABLE_XIO_TRUE@am__objects_30 = msg/xio/QueueStrategy.lo \
+ at LINUX_TRUE@am__objects_27 = msg/async/EventEpoll.lo
+ at DARWIN_TRUE@am__objects_28 = msg/async/EventKqueue.lo
+ at FREEBSD_TRUE@am__objects_29 = msg/async/EventKqueue.lo
+am__objects_30 =
+ at ENABLE_XIO_TRUE@am__objects_31 = msg/xio/QueueStrategy.lo \
 @ENABLE_XIO_TRUE@	msg/xio/XioConnection.lo \
 @ENABLE_XIO_TRUE@	msg/xio/XioMessenger.lo msg/xio/XioMsg.lo \
 @ENABLE_XIO_TRUE@	msg/xio/XioPortal.lo msg/xio/XioPool.lo
@@ -2390,9 +2446,9 @@ am_libmsg_la_OBJECTS = msg/Message.lo msg/Messenger.lo \
 	msg/simple/PipeConnection.lo msg/simple/SimpleMessenger.lo \
 	msg/async/AsyncConnection.lo msg/async/AsyncMessenger.lo \
 	msg/async/Event.lo msg/async/net_handler.lo \
-	msg/async/EventSelect.lo $(am__objects_26) $(am__objects_27) \
-	$(am__objects_28) $(am__objects_29) $(am__objects_29) \
-	$(am__objects_29) $(am__objects_30)
+	msg/async/EventSelect.lo $(am__objects_27) $(am__objects_28) \
+	$(am__objects_29) $(am__objects_30) $(am__objects_30) \
+	$(am__objects_30) $(am__objects_31)
 libmsg_la_OBJECTS = $(am_libmsg_la_OBJECTS)
 libos_tp_la_DEPENDENCIES =
 am__libos_tp_la_SOURCES_DIST = tracing/objectstore.c
@@ -2435,13 +2491,13 @@ libperfglue_la_DEPENDENCIES =
 am__libperfglue_la_SOURCES_DIST = perfglue/heap_profiler.cc \
 	perfglue/disabled_heap_profiler.cc perfglue/cpu_profiler.cc \
 	perfglue/disabled_stubs.cc
- at WITH_TCMALLOC_TRUE@am__objects_31 = perfglue/heap_profiler.lo
- at WITH_TCMALLOC_FALSE@@WITH_TCMALLOC_MINIMAL_TRUE at am__objects_32 = perfglue/heap_profiler.lo
- at WITH_TCMALLOC_FALSE@@WITH_TCMALLOC_MINIMAL_FALSE at am__objects_33 = perfglue/disabled_heap_profiler.lo
- at WITH_PROFILER_TRUE@am__objects_34 = perfglue/cpu_profiler.lo
- at WITH_PROFILER_FALSE@am__objects_35 = perfglue/disabled_stubs.lo
-am_libperfglue_la_OBJECTS = $(am__objects_31) $(am__objects_32) \
-	$(am__objects_33) $(am__objects_34) $(am__objects_35)
+ at WITH_TCMALLOC_TRUE@am__objects_32 = perfglue/heap_profiler.lo
+ at WITH_TCMALLOC_FALSE@@WITH_TCMALLOC_MINIMAL_TRUE at am__objects_33 = perfglue/heap_profiler.lo
+ at WITH_TCMALLOC_FALSE@@WITH_TCMALLOC_MINIMAL_FALSE at am__objects_34 = perfglue/disabled_heap_profiler.lo
+ at WITH_PROFILER_TRUE@am__objects_35 = perfglue/cpu_profiler.lo
+ at WITH_PROFILER_FALSE@am__objects_36 = perfglue/disabled_stubs.lo
+am_libperfglue_la_OBJECTS = $(am__objects_32) $(am__objects_33) \
+	$(am__objects_34) $(am__objects_35) $(am__objects_36)
 libperfglue_la_OBJECTS = $(am_libperfglue_la_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at am__DEPENDENCIES_5 =  \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	librados_internal.la \
@@ -2585,28 +2641,51 @@ librbd_api_la_OBJECTS = $(am_librbd_api_la_OBJECTS)
 librbd_internal_la_LIBADD =
 am__librbd_internal_la_SOURCES_DIST = librbd/AioCompletion.cc \
 	librbd/AioImageRequest.cc librbd/AioImageRequestWQ.cc \
-	librbd/AioObjectRequest.cc librbd/AsyncFlattenRequest.cc \
-	librbd/AsyncObjectThrottle.cc librbd/AsyncOperation.cc \
-	librbd/AsyncRequest.cc librbd/AsyncResizeRequest.cc \
-	librbd/AsyncTrimRequest.cc librbd/CopyupRequest.cc \
-	librbd/DiffIterate.cc librbd/ImageCtx.cc \
-	librbd/ImageWatcher.cc librbd/internal.cc librbd/Journal.cc \
-	librbd/JournalReplay.cc librbd/LibrbdAdminSocketHook.cc \
-	librbd/LibrbdWriteback.cc librbd/ObjectMap.cc \
-	librbd/RebuildObjectMapRequest.cc
+	librbd/AioObjectRequest.cc librbd/AsyncObjectThrottle.cc \
+	librbd/AsyncOperation.cc librbd/AsyncRequest.cc \
+	librbd/CopyupRequest.cc librbd/DiffIterate.cc \
+	librbd/ExclusiveLock.cc librbd/ImageCtx.cc \
+	librbd/ImageState.cc librbd/ImageWatcher.cc librbd/internal.cc \
+	librbd/Journal.cc librbd/JournalReplay.cc \
+	librbd/LibrbdAdminSocketHook.cc librbd/LibrbdWriteback.cc \
+	librbd/ObjectMap.cc librbd/Utils.cc \
+	librbd/exclusive_lock/AcquireRequest.cc \
+	librbd/exclusive_lock/ReleaseRequest.cc \
+	librbd/image/CloseRequest.cc librbd/image/OpenRequest.cc \
+	librbd/image/RefreshParentRequest.cc \
+	librbd/image/RefreshRequest.cc librbd/image/SetSnapRequest.cc \
+	librbd/object_map/InvalidateRequest.cc \
+	librbd/object_map/LockRequest.cc librbd/object_map/Request.cc \
+	librbd/object_map/RefreshRequest.cc \
+	librbd/object_map/ResizeRequest.cc \
+	librbd/object_map/SnapshotCreateRequest.cc \
+	librbd/object_map/SnapshotRemoveRequest.cc \
+	librbd/object_map/SnapshotRollbackRequest.cc \
+	librbd/object_map/UnlockRequest.cc \
+	librbd/object_map/UpdateRequest.cc \
+	librbd/operation/FlattenRequest.cc \
+	librbd/operation/RebuildObjectMapRequest.cc \
+	librbd/operation/RenameRequest.cc librbd/operation/Request.cc \
+	librbd/operation/ResizeRequest.cc \
+	librbd/operation/SnapshotCreateRequest.cc \
+	librbd/operation/SnapshotProtectRequest.cc \
+	librbd/operation/SnapshotRemoveRequest.cc \
+	librbd/operation/SnapshotRenameRequest.cc \
+	librbd/operation/SnapshotRollbackRequest.cc \
+	librbd/operation/SnapshotUnprotectRequest.cc \
+	librbd/operation/TrimRequest.cc
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am_librbd_internal_la_OBJECTS = librbd/AioCompletion.lo \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/AioImageRequest.lo \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/AioImageRequestWQ.lo \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/AioObjectRequest.lo \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/AsyncFlattenRequest.lo \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/AsyncObjectThrottle.lo \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/AsyncOperation.lo \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/AsyncRequest.lo \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/AsyncResizeRequest.lo \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/AsyncTrimRequest.lo \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/CopyupRequest.lo \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/DiffIterate.lo \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/ExclusiveLock.lo \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/ImageCtx.lo \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/ImageState.lo \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/ImageWatcher.lo \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/internal.lo \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/Journal.lo \
@@ -2614,7 +2693,36 @@ am__librbd_internal_la_SOURCES_DIST = librbd/AioCompletion.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/LibrbdAdminSocketHook.lo \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/LibrbdWriteback.lo \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/ObjectMap.lo \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/RebuildObjectMapRequest.lo
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/Utils.lo \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/exclusive_lock/AcquireRequest.lo \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/exclusive_lock/ReleaseRequest.lo \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/image/CloseRequest.lo \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/image/OpenRequest.lo \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/image/RefreshParentRequest.lo \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/image/RefreshRequest.lo \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/image/SetSnapRequest.lo \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/object_map/InvalidateRequest.lo \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/object_map/LockRequest.lo \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/object_map/Request.lo \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/object_map/RefreshRequest.lo \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/object_map/ResizeRequest.lo \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/object_map/SnapshotCreateRequest.lo \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/object_map/SnapshotRemoveRequest.lo \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/object_map/SnapshotRollbackRequest.lo \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/object_map/UnlockRequest.lo \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/object_map/UpdateRequest.lo \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/operation/FlattenRequest.lo \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/operation/RebuildObjectMapRequest.lo \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/operation/RenameRequest.lo \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/operation/Request.lo \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/operation/ResizeRequest.lo \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/operation/SnapshotCreateRequest.lo \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/operation/SnapshotProtectRequest.lo \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/operation/SnapshotRemoveRequest.lo \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/operation/SnapshotRenameRequest.lo \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/operation/SnapshotRollbackRequest.lo \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/operation/SnapshotUnprotectRequest.lo \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/operation/TrimRequest.lo
 librbd_internal_la_OBJECTS = $(am_librbd_internal_la_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am_librbd_internal_la_rpath =
 am__DEPENDENCIES_8 = $(LIBGLOBAL) $(LIBCOMMON) $(am__DEPENDENCIES_1) \
@@ -2692,12 +2800,12 @@ am__librgw_la_SOURCES_DIST = rgw/librgw.cc rgw/rgw_acl.cc \
 	rgw/rgw_json_enc.cc rgw/rgw_user.cc rgw/rgw_bucket.cc \
 	rgw/rgw_tools.cc rgw/rgw_rados.cc rgw/rgw_http_client.cc \
 	rgw/rgw_rest_client.cc rgw/rgw_rest_conn.cc rgw/rgw_op.cc \
-	rgw/rgw_common.cc rgw/rgw_cache.cc rgw/rgw_formats.cc \
-	rgw/rgw_log.cc rgw/rgw_multi.cc rgw/rgw_policy_s3.cc \
-	rgw/rgw_gc.cc rgw/rgw_multi_del.cc rgw/rgw_env.cc \
-	rgw/rgw_cors.cc rgw/rgw_cors_s3.cc rgw/rgw_auth_s3.cc \
-	rgw/rgw_metadata.cc rgw/rgw_replica_log.cc rgw/rgw_keystone.cc \
-	rgw/rgw_quota.cc rgw/rgw_dencoder.cc \
+	rgw/rgw_basic_types.cc rgw/rgw_common.cc rgw/rgw_cache.cc \
+	rgw/rgw_formats.cc rgw/rgw_log.cc rgw/rgw_multi.cc \
+	rgw/rgw_policy_s3.cc rgw/rgw_gc.cc rgw/rgw_multi_del.cc \
+	rgw/rgw_env.cc rgw/rgw_cors.cc rgw/rgw_cors_s3.cc \
+	rgw/rgw_auth_s3.cc rgw/rgw_metadata.cc rgw/rgw_replica_log.cc \
+	rgw/rgw_keystone.cc rgw/rgw_quota.cc rgw/rgw_dencoder.cc \
 	rgw/rgw_object_expirer_core.cc
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE at am_librgw_la_OBJECTS = rgw/librgw_la-librgw.lo \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	rgw/librgw_la-rgw_acl.lo \
@@ -2716,6 +2824,7 @@ am__librgw_la_SOURCES_DIST = rgw/librgw.cc rgw/rgw_acl.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	rgw/librgw_la-rgw_rest_client.lo \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	rgw/librgw_la-rgw_rest_conn.lo \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	rgw/librgw_la-rgw_op.lo \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	rgw/librgw_la-rgw_basic_types.lo \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	rgw/librgw_la-rgw_common.lo \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	rgw/librgw_la-rgw_cache.lo \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	rgw/librgw_la-rgw_formats.lo \
@@ -2843,14 +2952,15 @@ libsystest_la_OBJECTS = $(am_libsystest_la_OBJECTS)
 @ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	ceph_test_keys$(EXEEXT)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am__EXEEXT_21 = ceph_test_snap_mapper$(EXEEXT)
 @WITH_BUILD_TESTS_TRUE at am__EXEEXT_22 = test_build_libcommon$(EXEEXT)
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at am__EXEEXT_23 = ceph_scratchtool$(EXEEXT) \
+ at LINUX_TRUE@am__EXEEXT_23 = ceph_test_get_blkdev_size$(EXEEXT)
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at am__EXEEXT_24 = ceph_scratchtool$(EXEEXT) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	ceph_scratchtoolpp$(EXEEXT) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	ceph_radosacl$(EXEEXT)
- at ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE at am__EXEEXT_24 = ceph-client-debug$(EXEEXT)
- at ENABLE_SERVER_TRUE@am__EXEEXT_25 = ceph-osdomap-tool$(EXEEXT) \
+ at ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE at am__EXEEXT_25 = ceph-client-debug$(EXEEXT)
+ at ENABLE_SERVER_TRUE@am__EXEEXT_26 = ceph-osdomap-tool$(EXEEXT) \
 @ENABLE_SERVER_TRUE@	ceph-monstore-tool$(EXEEXT) \
 @ENABLE_SERVER_TRUE@	ceph-kvstore-tool$(EXEEXT)
-am__EXEEXT_26 = $(am__EXEEXT_1) $(am__EXEEXT_2) $(am__EXEEXT_3) \
+am__EXEEXT_27 = $(am__EXEEXT_1) $(am__EXEEXT_2) $(am__EXEEXT_3) \
 	$(am__EXEEXT_4) $(am__EXEEXT_5) $(am__EXEEXT_6) \
 	$(am__EXEEXT_7) $(am__EXEEXT_8) $(am__EXEEXT_9) \
 	$(am__EXEEXT_10) $(am__EXEEXT_11) $(am__EXEEXT_12) \
@@ -2861,38 +2971,39 @@ am__EXEEXT_26 = $(am__EXEEXT_1) $(am__EXEEXT_2) $(am__EXEEXT_3) \
 	ceph_test_rewrite_latency$(EXEEXT) ceph_test_crypto$(EXEEXT) \
 	$(am__EXEEXT_22) ceph_bench_log$(EXEEXT) \
 	ceph_test_objectcacher_stress$(EXEEXT) \
-	ceph_test_cfuse_cache_invalidate$(EXEEXT) \
-	ceph_test_get_blkdev_size$(EXEEXT) $(am__EXEEXT_23) \
-	$(am__EXEEXT_24) $(am__EXEEXT_25) ceph_psim$(EXEEXT)
- at WITH_DEBUG_TRUE@am__EXEEXT_27 = $(am__EXEEXT_26)
- at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE at am__EXEEXT_28 = radosgw$(EXEEXT) \
+	ceph_test_cfuse_cache_invalidate$(EXEEXT) $(am__EXEEXT_23) \
+	$(am__EXEEXT_24) $(am__EXEEXT_25) $(am__EXEEXT_26) \
+	ceph_psim$(EXEEXT)
+ at WITH_DEBUG_TRUE@am__EXEEXT_28 = $(am__EXEEXT_27)
+ at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE at am__EXEEXT_29 = radosgw$(EXEEXT) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	radosgw-admin$(EXEEXT) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	radosgw-object-expirer$(EXEEXT)
- at ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__EXEEXT_29 = rbd-replay$(EXEEXT)
- at ENABLE_CLIENT_TRUE@@WITH_BABELTRACE_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__EXEEXT_30 = rbd-replay-prep$(EXEEXT)
- at ENABLE_CLIENT_TRUE@am__EXEEXT_31 = ceph-dencoder$(EXEEXT)
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at am__EXEEXT_32 = rados$(EXEEXT)
- at ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__EXEEXT_33 = rbd$(EXEEXT)
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am__EXEEXT_34 = ceph-objectstore-tool$(EXEEXT)
- at ENABLE_CLIENT_TRUE@@ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@@WITH_RADOS_TRUE at am__EXEEXT_35 = cephfs-journal-tool$(EXEEXT) \
+ at ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__EXEEXT_30 = rbd-replay$(EXEEXT)
+ at ENABLE_CLIENT_TRUE@@WITH_BABELTRACE_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__EXEEXT_31 = rbd-replay-prep$(EXEEXT)
+ at ENABLE_CLIENT_TRUE@am__EXEEXT_32 = ceph-dencoder$(EXEEXT)
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at am__EXEEXT_33 = rados$(EXEEXT)
+ at ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__EXEEXT_34 = rbd$(EXEEXT) \
+ at ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	rbd-nbd$(EXEEXT)
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am__EXEEXT_35 = ceph-objectstore-tool$(EXEEXT)
+ at ENABLE_CLIENT_TRUE@@ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@@WITH_RADOS_TRUE at am__EXEEXT_36 = cephfs-journal-tool$(EXEEXT) \
 @ENABLE_CLIENT_TRUE@@ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@@WITH_RADOS_TRUE@	cephfs-table-tool$(EXEEXT) \
 @ENABLE_CLIENT_TRUE@@ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@@WITH_RADOS_TRUE@	cephfs-data-scan$(EXEEXT)
- at ENABLE_CLIENT_TRUE@am__EXEEXT_36 = ceph-syn$(EXEEXT)
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at am__EXEEXT_37 =  \
+ at ENABLE_CLIENT_TRUE@am__EXEEXT_37 = ceph-syn$(EXEEXT)
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at am__EXEEXT_38 =  \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	librados-config$(EXEEXT)
- at ENABLE_CLIENT_TRUE@@WITH_FUSE_TRUE@@WITH_RADOS_TRUE at am__EXEEXT_38 = ceph-fuse$(EXEEXT)
- at ENABLE_CLIENT_TRUE@@WITH_FUSE_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__EXEEXT_39 = rbd-fuse$(EXEEXT)
- at ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE at am__EXEEXT_40 = cephfs$(EXEEXT)
- at ENABLE_SERVER_TRUE@@WITH_MON_TRUE at am__EXEEXT_41 = ceph-mon$(EXEEXT)
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am__EXEEXT_42 = ceph-osd$(EXEEXT)
- at ENABLE_SERVER_TRUE@@WITH_MDS_TRUE at am__EXEEXT_43 = ceph-mds$(EXEEXT)
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am__EXEEXT_44 = unittest_erasure_code_plugin$(EXEEXT) \
+ at ENABLE_CLIENT_TRUE@@WITH_FUSE_TRUE@@WITH_RADOS_TRUE at am__EXEEXT_39 = ceph-fuse$(EXEEXT)
+ at ENABLE_CLIENT_TRUE@@WITH_FUSE_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__EXEEXT_40 = rbd-fuse$(EXEEXT)
+ at ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE at am__EXEEXT_41 = cephfs$(EXEEXT)
+ at ENABLE_SERVER_TRUE@@WITH_MON_TRUE at am__EXEEXT_42 = ceph-mon$(EXEEXT)
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am__EXEEXT_43 = ceph-osd$(EXEEXT)
+ at ENABLE_SERVER_TRUE@@WITH_MDS_TRUE at am__EXEEXT_44 = ceph-mds$(EXEEXT)
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am__EXEEXT_45 = unittest_erasure_code_plugin$(EXEEXT) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	unittest_erasure_code$(EXEEXT) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	unittest_erasure_code_jerasure$(EXEEXT) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	unittest_erasure_code_plugin_jerasure$(EXEEXT)
- at ENABLE_SERVER_TRUE@@WITH_BETTER_YASM_ELF64_TRUE@@WITH_OSD_TRUE at am__EXEEXT_45 = unittest_erasure_code_isa$(EXEEXT) \
+ at ENABLE_SERVER_TRUE@@WITH_BETTER_YASM_ELF64_TRUE@@WITH_OSD_TRUE at am__EXEEXT_46 = unittest_erasure_code_isa$(EXEEXT) \
 @ENABLE_SERVER_TRUE@@WITH_BETTER_YASM_ELF64_TRUE@@WITH_OSD_TRUE@	unittest_erasure_code_plugin_isa$(EXEEXT)
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am__EXEEXT_46 = unittest_erasure_code_lrc$(EXEEXT) \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am__EXEEXT_47 = unittest_erasure_code_lrc$(EXEEXT) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	unittest_erasure_code_plugin_lrc$(EXEEXT) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	unittest_erasure_code_shec$(EXEEXT) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	unittest_erasure_code_shec_all$(EXEEXT) \
@@ -2900,34 +3011,34 @@ am__EXEEXT_26 = $(am__EXEEXT_1) $(am__EXEEXT_2) $(am__EXEEXT_3) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	unittest_erasure_code_shec_arguments$(EXEEXT) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	unittest_erasure_code_plugin_shec$(EXEEXT) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	unittest_erasure_code_example$(EXEEXT)
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at am__EXEEXT_47 = unittest_librados$(EXEEXT) \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at am__EXEEXT_48 = unittest_librados$(EXEEXT) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	unittest_librados_config$(EXEEXT) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	unittest_journal$(EXEEXT)
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__EXEEXT_48 = unittest_rbd_replay$(EXEEXT)
- at ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE at am__EXEEXT_49 = unittest_encoding$(EXEEXT) \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__EXEEXT_49 = unittest_rbd_replay$(EXEEXT)
+ at ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE at am__EXEEXT_50 = unittest_encoding$(EXEEXT) \
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	unittest_base64$(EXEEXT) \
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	unittest_run_cmd$(EXEEXT) \
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	unittest_simple_spin$(EXEEXT) \
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	unittest_libcephfs_config$(EXEEXT)
- at ENABLE_SERVER_TRUE@@WITH_MON_TRUE at am__EXEEXT_50 = unittest_mon_moncap$(EXEEXT) \
+ at ENABLE_SERVER_TRUE@@WITH_MON_TRUE at am__EXEEXT_51 = unittest_mon_moncap$(EXEEXT) \
 @ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	unittest_mon_pgmap$(EXEEXT)
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am__EXEEXT_51 = unittest_ecbackend$(EXEEXT) \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am__EXEEXT_52 = unittest_ecbackend$(EXEEXT) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	unittest_osdscrub$(EXEEXT) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	unittest_pglog$(EXEEXT) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	unittest_hitset$(EXEEXT) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	unittest_osd_osdcap$(EXEEXT) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	unittest_pageset$(EXEEXT)
- at ENABLE_SERVER_TRUE@@WITH_SLIBROCKSDB_TRUE at am__EXEEXT_52 = unittest_rocksdb_option_static$(EXEEXT)
- at ENABLE_SERVER_TRUE@@WITH_DLIBROCKSDB_TRUE at am__EXEEXT_53 = unittest_rocksdb_option$(EXEEXT)
- at ENABLE_SERVER_TRUE@am__EXEEXT_54 = unittest_chain_xattr$(EXEEXT) \
+ at ENABLE_SERVER_TRUE@@WITH_SLIBROCKSDB_TRUE at am__EXEEXT_53 = unittest_rocksdb_option_static$(EXEEXT)
+ at ENABLE_SERVER_TRUE@@WITH_DLIBROCKSDB_TRUE at am__EXEEXT_54 = unittest_rocksdb_option$(EXEEXT)
+ at ENABLE_SERVER_TRUE@am__EXEEXT_55 = unittest_chain_xattr$(EXEEXT) \
 @ENABLE_SERVER_TRUE@	unittest_lfnindex$(EXEEXT)
- at ENABLE_SERVER_TRUE@@WITH_MDS_TRUE at am__EXEEXT_55 = unittest_mds_authcap$(EXEEXT)
- at LINUX_TRUE@am__EXEEXT_56 = unittest_blkdev$(EXEEXT)
-am__EXEEXT_57 = $(am__EXEEXT_44) $(am__EXEEXT_45) $(am__EXEEXT_46) \
-	$(am__EXEEXT_47) $(am__EXEEXT_48) $(am__EXEEXT_49) \
-	$(am__EXEEXT_50) $(am__EXEEXT_51) $(am__EXEEXT_52) \
-	$(am__EXEEXT_53) $(am__EXEEXT_54) $(am__EXEEXT_55) \
-	unittest_addrs$(EXEEXT) $(am__EXEEXT_56) \
+ at ENABLE_SERVER_TRUE@@WITH_MDS_TRUE at am__EXEEXT_56 = unittest_mds_authcap$(EXEEXT)
+ at LINUX_TRUE@am__EXEEXT_57 = unittest_blkdev$(EXEEXT)
+am__EXEEXT_58 = $(am__EXEEXT_45) $(am__EXEEXT_46) $(am__EXEEXT_47) \
+	$(am__EXEEXT_48) $(am__EXEEXT_49) $(am__EXEEXT_50) \
+	$(am__EXEEXT_51) $(am__EXEEXT_52) $(am__EXEEXT_53) \
+	$(am__EXEEXT_54) $(am__EXEEXT_55) $(am__EXEEXT_56) \
+	unittest_addrs$(EXEEXT) $(am__EXEEXT_57) \
 	unittest_bloom_filter$(EXEEXT) unittest_histogram$(EXEEXT) \
 	unittest_prioritized_queue$(EXEEXT) unittest_str_map$(EXEEXT) \
 	unittest_sharedptr_registry$(EXEEXT) \
@@ -2956,15 +3067,15 @@ am__EXEEXT_57 = $(am__EXEEXT_44) $(am__EXEEXT_45) $(am__EXEEXT_46) \
 	unittest_texttable$(EXEEXT) unittest_on_exit$(EXEEXT) \
 	unittest_readahead$(EXEEXT) unittest_tableformatter$(EXEEXT) \
 	unittest_bit_vector$(EXEEXT)
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__EXEEXT_58 = unittest_librbd$(EXEEXT)
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am__EXEEXT_59 = ceph_erasure_code_non_regression$(EXEEXT)
- at ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE at am__EXEEXT_60 =  \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__EXEEXT_59 = unittest_librbd$(EXEEXT)
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am__EXEEXT_60 = ceph_erasure_code_non_regression$(EXEEXT)
+ at ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE at am__EXEEXT_61 =  \
 @ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	simple_server$(EXEEXT) \
 @ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	simple_client$(EXEEXT) \
 @ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	xio_server$(EXEEXT) \
 @ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	xio_client$(EXEEXT)
- at ENABLE_SERVER_TRUE@@WITH_MON_TRUE at am__EXEEXT_61 = get_command_descriptions$(EXEEXT)
- at ENABLE_SERVER_TRUE@@LINUX_TRUE at am__EXEEXT_62 = mount.ceph$(EXEEXT)
+ at ENABLE_SERVER_TRUE@@WITH_MON_TRUE at am__EXEEXT_62 = get_command_descriptions$(EXEEXT)
+ at ENABLE_SERVER_TRUE@@LINUX_TRUE at am__EXEEXT_63 = mount.ceph$(EXEEXT)
 PROGRAMS = $(bin_PROGRAMS) $(noinst_PROGRAMS) $(sbin_PROGRAMS) \
 	$(su_sbin_PROGRAMS)
 am_ceph_authtool_OBJECTS = tools/ceph_authtool.$(OBJEXT)
@@ -2993,8 +3104,9 @@ am__ceph_dencoder_SOURCES_DIST = test/encoding/ceph_dencoder.cc \
 	mds/SessionMap.cc mds/MDSContext.cc mds/MDSAuthCaps.cc \
 	mds/MDLog.cc perfglue/disabled_heap_profiler.cc \
 	perfglue/disabled_stubs.cc rgw/rgw_dencoder.cc rgw/rgw_acl.cc \
-	rgw/rgw_common.cc rgw/rgw_env.cc rgw/rgw_json_enc.cc
-am__objects_36 = mds/ceph_dencoder-Capability.$(OBJEXT) \
+	rgw/rgw_basic_types.cc rgw/rgw_common.cc rgw/rgw_env.cc \
+	rgw/rgw_json_enc.cc
+am__objects_37 = mds/ceph_dencoder-Capability.$(OBJEXT) \
 	mds/ceph_dencoder-MDSDaemon.$(OBJEXT) \
 	mds/ceph_dencoder-MDSRank.$(OBJEXT) \
 	mds/ceph_dencoder-Beacon.$(OBJEXT) \
@@ -3026,19 +3138,20 @@ am__objects_36 = mds/ceph_dencoder-Capability.$(OBJEXT) \
 	mds/ceph_dencoder-MDSContext.$(OBJEXT) \
 	mds/ceph_dencoder-MDSAuthCaps.$(OBJEXT) \
 	mds/ceph_dencoder-MDLog.$(OBJEXT)
- at ENABLE_CLIENT_TRUE@am__objects_37 = $(am__objects_36)
- at ENABLE_CLIENT_TRUE@am__objects_38 =  \
+ at ENABLE_CLIENT_TRUE@am__objects_38 = $(am__objects_37)
+ at ENABLE_CLIENT_TRUE@am__objects_39 =  \
 @ENABLE_CLIENT_TRUE@	rgw/ceph_dencoder-rgw_dencoder.$(OBJEXT) \
 @ENABLE_CLIENT_TRUE@	rgw/ceph_dencoder-rgw_acl.$(OBJEXT) \
+ at ENABLE_CLIENT_TRUE@	rgw/ceph_dencoder-rgw_basic_types.$(OBJEXT) \
 @ENABLE_CLIENT_TRUE@	rgw/ceph_dencoder-rgw_common.$(OBJEXT) \
 @ENABLE_CLIENT_TRUE@	rgw/ceph_dencoder-rgw_env.$(OBJEXT) \
 @ENABLE_CLIENT_TRUE@	rgw/ceph_dencoder-rgw_json_enc.$(OBJEXT)
-am__objects_39 = $(am__objects_37) \
+am__objects_40 = $(am__objects_38) \
 	perfglue/ceph_dencoder-disabled_heap_profiler.$(OBJEXT) \
 	perfglue/ceph_dencoder-disabled_stubs.$(OBJEXT) \
-	$(am__objects_38)
+	$(am__objects_39)
 @ENABLE_CLIENT_TRUE at am_ceph_dencoder_OBJECTS = test/encoding/ceph_dencoder-ceph_dencoder.$(OBJEXT) \
- at ENABLE_CLIENT_TRUE@	$(am__objects_39)
+ at ENABLE_CLIENT_TRUE@	$(am__objects_40)
 ceph_dencoder_OBJECTS = $(am_ceph_dencoder_OBJECTS)
 @ENABLE_CLIENT_TRUE at ceph_dencoder_DEPENDENCIES = $(LIBRBD_TYPES) \
 @ENABLE_CLIENT_TRUE@	$(LIBOSD_TYPES) $(LIBOS_TYPES) \
@@ -3057,7 +3170,7 @@ am__ceph_kvstore_tool_SOURCES_DIST = tools/ceph_kvstore_tool.cc
 @ENABLE_SERVER_TRUE at am_ceph_kvstore_tool_OBJECTS = tools/ceph_kvstore_tool-ceph_kvstore_tool.$(OBJEXT)
 ceph_kvstore_tool_OBJECTS = $(am_ceph_kvstore_tool_OBJECTS)
 @WITH_LIBZFS_TRUE at am__DEPENDENCIES_9 = libos_zfs.a
-am__DEPENDENCIES_10 = libkv.a $(am__append_23)
+am__DEPENDENCIES_10 = libkv.a $(am__append_26)
 am__DEPENDENCIES_11 = libos.a $(am__DEPENDENCIES_1) \
 	$(am__DEPENDENCIES_9) $(LIBOS_TYPES) $(am__DEPENDENCIES_10)
 @ENABLE_SERVER_TRUE at ceph_kvstore_tool_DEPENDENCIES =  \
@@ -3669,11 +3782,13 @@ ceph_test_filestore_idempotent_sequence_OBJECTS =  \
 @ENABLE_SERVER_TRUE at ceph_test_filestore_idempotent_sequence_DEPENDENCIES =  \
 @ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_11) \
 @ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_8)
-am_ceph_test_get_blkdev_size_OBJECTS =  \
-	test/test_get_blkdev_size.$(OBJEXT)
+am__ceph_test_get_blkdev_size_SOURCES_DIST =  \
+	test/test_get_blkdev_size.cc
+ at LINUX_TRUE@am_ceph_test_get_blkdev_size_OBJECTS =  \
+ at LINUX_TRUE@	test/test_get_blkdev_size.$(OBJEXT)
 ceph_test_get_blkdev_size_OBJECTS =  \
 	$(am_ceph_test_get_blkdev_size_OBJECTS)
-ceph_test_get_blkdev_size_DEPENDENCIES = $(LIBCOMMON)
+ at LINUX_TRUE@ceph_test_get_blkdev_size_DEPENDENCIES = $(LIBCOMMON)
 am__ceph_test_ioctls_SOURCES_DIST = client/test_ioctls.c
 @ENABLE_CLIENT_TRUE at am_ceph_test_ioctls_OBJECTS =  \
 @ENABLE_CLIENT_TRUE@	client/test_ioctls.$(OBJEXT)
@@ -3729,13 +3844,13 @@ am__ceph_test_libcephfs_SOURCES_DIST = test/libcephfs/test.cc \
 	test/libcephfs/readdir_r_cb.cc test/libcephfs/caps.cc \
 	test/libcephfs/multiclient.cc test/libcephfs/access.cc \
 	test/libcephfs/flock.cc
- at ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE at am__objects_40 = test/libcephfs/ceph_test_libcephfs-flock.$(OBJEXT)
+ at ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE at am__objects_41 = test/libcephfs/ceph_test_libcephfs-flock.$(OBJEXT)
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE at am_ceph_test_libcephfs_OBJECTS = test/libcephfs/ceph_test_libcephfs-test.$(OBJEXT) \
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	test/libcephfs/ceph_test_libcephfs-readdir_r_cb.$(OBJEXT) \
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	test/libcephfs/ceph_test_libcephfs-caps.$(OBJEXT) \
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	test/libcephfs/ceph_test_libcephfs-multiclient.$(OBJEXT) \
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	test/libcephfs/ceph_test_libcephfs-access.$(OBJEXT) \
- at ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	$(am__objects_40)
+ at ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	$(am__objects_41)
 ceph_test_libcephfs_OBJECTS = $(am_ceph_test_libcephfs_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE at ceph_test_libcephfs_DEPENDENCIES = $(LIBRADOS) \
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	$(LIBCEPHFS) \
@@ -4403,12 +4518,14 @@ am__rbd_SOURCES_DIST = tools/rbd/rbd.cc tools/rbd/ArgumentTypes.cc \
 	tools/rbd/action/ExportDiff.cc tools/rbd/action/Feature.cc \
 	tools/rbd/action/Flatten.cc tools/rbd/action/ImageMeta.cc \
 	tools/rbd/action/Import.cc tools/rbd/action/ImportDiff.cc \
-	tools/rbd/action/Info.cc tools/rbd/action/Kernel.cc \
+	tools/rbd/action/Info.cc tools/rbd/action/Journal.cc \
+	tools/rbd/action/Kernel.cc tools/rbd/action/Nbd.cc \
 	tools/rbd/action/List.cc tools/rbd/action/Lock.cc \
-	tools/rbd/action/MergeDiff.cc tools/rbd/action/ObjectMap.cc \
-	tools/rbd/action/Remove.cc tools/rbd/action/Rename.cc \
-	tools/rbd/action/Resize.cc tools/rbd/action/Snap.cc \
-	tools/rbd/action/Status.cc tools/rbd/action/Watch.cc
+	tools/rbd/action/MergeDiff.cc tools/rbd/action/MirrorPool.cc \
+	tools/rbd/action/ObjectMap.cc tools/rbd/action/Remove.cc \
+	tools/rbd/action/Rename.cc tools/rbd/action/Resize.cc \
+	tools/rbd/action/Snap.cc tools/rbd/action/Status.cc \
+	tools/rbd/action/Watch.cc
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am_rbd_OBJECTS = tools/rbd/rbd.$(OBJEXT) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/ArgumentTypes.$(OBJEXT) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/IndentStream.$(OBJEXT) \
@@ -4430,10 +4547,13 @@ am__rbd_SOURCES_DIST = tools/rbd/rbd.cc tools/rbd/ArgumentTypes.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/action/Import.$(OBJEXT) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/action/ImportDiff.$(OBJEXT) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/action/Info.$(OBJEXT) \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/action/Journal.$(OBJEXT) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/action/Kernel.$(OBJEXT) \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/action/Nbd.$(OBJEXT) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/action/List.$(OBJEXT) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/action/Lock.$(OBJEXT) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/action/MergeDiff.$(OBJEXT) \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/action/MirrorPool.$(OBJEXT) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/action/ObjectMap.$(OBJEXT) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/action/Remove.$(OBJEXT) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/action/Rename.$(OBJEXT) \
@@ -4442,8 +4562,11 @@ am__rbd_SOURCES_DIST = tools/rbd/rbd.cc tools/rbd/ArgumentTypes.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/action/Status.$(OBJEXT) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/action/Watch.$(OBJEXT)
 rbd_OBJECTS = $(am_rbd_OBJECTS)
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at rbd_DEPENDENCIES = $(LIBKRBD) \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at rbd_DEPENDENCIES = libjournal.la \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	libcls_journal_client.la \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(LIBKRBD) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(LIBRBD) \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(LIBRBD_TYPES) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(LIBRADOS) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(am__DEPENDENCIES_8) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(am__DEPENDENCIES_1) \
@@ -4458,6 +4581,16 @@ rbd_fuse_OBJECTS = $(am_rbd_fuse_OBJECTS)
 rbd_fuse_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) \
 	$(LIBTOOLFLAGS) --mode=link $(CXXLD) $(rbd_fuse_CXXFLAGS) \
 	$(CXXFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@
+am__rbd_nbd_SOURCES_DIST = tools/rbd_nbd/rbd-nbd.cc
+ at ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am_rbd_nbd_OBJECTS = tools/rbd_nbd/rbd_nbd-rbd-nbd.$(OBJEXT)
+rbd_nbd_OBJECTS = $(am_rbd_nbd_OBJECTS)
+ at ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at rbd_nbd_DEPENDENCIES = $(LIBRBD) \
+ at ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(LIBRADOS) \
+ at ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(am__DEPENDENCIES_8) \
+ at ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(am__DEPENDENCIES_1)
+rbd_nbd_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) \
+	$(LIBTOOLFLAGS) --mode=link $(CXXLD) $(rbd_nbd_CXXFLAGS) \
+	$(CXXFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@
 am__rbd_replay_SOURCES_DIST = rbd_replay/rbd-replay.cc
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am_rbd_replay_OBJECTS = rbd_replay/rbd-replay.$(OBJEXT)
 rbd_replay_OBJECTS = $(am_rbd_replay_OBJECTS)
@@ -4512,13 +4645,13 @@ simple_server_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 am__test_build_libcephfs_SOURCES_DIST = test/buildtest_skeleton.cc \
 	osdc/Objecter.cc osdc/ObjectCacher.cc osdc/Filer.cc \
 	osdc/Striper.cc osdc/Journaler.cc
-am__objects_41 = osdc/test_build_libcephfs-Objecter.$(OBJEXT) \
+am__objects_42 = osdc/test_build_libcephfs-Objecter.$(OBJEXT) \
 	osdc/test_build_libcephfs-ObjectCacher.$(OBJEXT) \
 	osdc/test_build_libcephfs-Filer.$(OBJEXT) \
 	osdc/test_build_libcephfs-Striper.$(OBJEXT) \
 	osdc/test_build_libcephfs-Journaler.$(OBJEXT)
 @ENABLE_CLIENT_TRUE@@WITH_BUILD_TESTS_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE at am_test_build_libcephfs_OBJECTS = test/test_build_libcephfs-buildtest_skeleton.$(OBJEXT) \
- at ENABLE_CLIENT_TRUE@@WITH_BUILD_TESTS_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	$(am__objects_41)
+ at ENABLE_CLIENT_TRUE@@WITH_BUILD_TESTS_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	$(am__objects_42)
 test_build_libcephfs_OBJECTS = $(am_test_build_libcephfs_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_BUILD_TESTS_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE at test_build_libcephfs_DEPENDENCIES = $(LIBCEPHFS) \
 @ENABLE_CLIENT_TRUE@@WITH_BUILD_TESTS_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_1) \
@@ -4530,9 +4663,9 @@ test_build_libcephfs_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(test_build_libcephfs_LDFLAGS) $(LDFLAGS) -o $@
 am__test_build_libcommon_SOURCES_DIST = test/buildtest_skeleton.cc \
 	common/buffer.cc
-am__objects_42 = common/test_build_libcommon-buffer.$(OBJEXT)
+am__objects_43 = common/test_build_libcommon-buffer.$(OBJEXT)
 @WITH_BUILD_TESTS_TRUE at am_test_build_libcommon_OBJECTS = test/test_build_libcommon-buildtest_skeleton.$(OBJEXT) \
- at WITH_BUILD_TESTS_TRUE@	$(am__objects_42)
+ at WITH_BUILD_TESTS_TRUE@	$(am__objects_43)
 test_build_libcommon_OBJECTS = $(am_test_build_libcommon_OBJECTS)
 @WITH_BUILD_TESTS_TRUE at test_build_libcommon_DEPENDENCIES =  \
 @WITH_BUILD_TESTS_TRUE@	$(am__DEPENDENCIES_4) \
@@ -4545,10 +4678,10 @@ test_build_libcommon_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(test_build_libcommon_LDFLAGS) $(LDFLAGS) -o $@
 am__test_build_librados_SOURCES_DIST = test/buildtest_skeleton.cc \
 	common/buffer.cc librados/librados.cc
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at am__objects_43 = common/test_build_librados-buffer.$(OBJEXT) \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at am__objects_44 = common/test_build_librados-buffer.$(OBJEXT) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	librados/test_build_librados-librados.$(OBJEXT)
 @ENABLE_CLIENT_TRUE@@WITH_BUILD_TESTS_TRUE@@WITH_RADOS_TRUE at am_test_build_librados_OBJECTS = test/test_build_librados-buildtest_skeleton.$(OBJEXT) \
- at ENABLE_CLIENT_TRUE@@WITH_BUILD_TESTS_TRUE@@WITH_RADOS_TRUE@	$(am__objects_43)
+ at ENABLE_CLIENT_TRUE@@WITH_BUILD_TESTS_TRUE@@WITH_RADOS_TRUE@	$(am__objects_44)
 test_build_librados_OBJECTS = $(am_test_build_librados_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_BUILD_TESTS_TRUE@@WITH_RADOS_TRUE at test_build_librados_DEPENDENCIES = $(am__DEPENDENCIES_6) \
 @ENABLE_CLIENT_TRUE@@WITH_BUILD_TESTS_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_1) \
@@ -4564,14 +4697,15 @@ am__test_build_librgw_SOURCES_DIST = test/buildtest_skeleton.cc \
 	rgw/rgw_xml.cc rgw/rgw_usage.cc rgw/rgw_json_enc.cc \
 	rgw/rgw_user.cc rgw/rgw_bucket.cc rgw/rgw_tools.cc \
 	rgw/rgw_rados.cc rgw/rgw_http_client.cc rgw/rgw_rest_client.cc \
-	rgw/rgw_rest_conn.cc rgw/rgw_op.cc rgw/rgw_common.cc \
-	rgw/rgw_cache.cc rgw/rgw_formats.cc rgw/rgw_log.cc \
-	rgw/rgw_multi.cc rgw/rgw_policy_s3.cc rgw/rgw_gc.cc \
-	rgw/rgw_multi_del.cc rgw/rgw_env.cc rgw/rgw_cors.cc \
-	rgw/rgw_cors_s3.cc rgw/rgw_auth_s3.cc rgw/rgw_metadata.cc \
-	rgw/rgw_replica_log.cc rgw/rgw_keystone.cc rgw/rgw_quota.cc \
-	rgw/rgw_dencoder.cc rgw/rgw_object_expirer_core.cc
- at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE at am__objects_44 = rgw/test_build_librgw-librgw.$(OBJEXT) \
+	rgw/rgw_rest_conn.cc rgw/rgw_op.cc rgw/rgw_basic_types.cc \
+	rgw/rgw_common.cc rgw/rgw_cache.cc rgw/rgw_formats.cc \
+	rgw/rgw_log.cc rgw/rgw_multi.cc rgw/rgw_policy_s3.cc \
+	rgw/rgw_gc.cc rgw/rgw_multi_del.cc rgw/rgw_env.cc \
+	rgw/rgw_cors.cc rgw/rgw_cors_s3.cc rgw/rgw_auth_s3.cc \
+	rgw/rgw_metadata.cc rgw/rgw_replica_log.cc rgw/rgw_keystone.cc \
+	rgw/rgw_quota.cc rgw/rgw_dencoder.cc \
+	rgw/rgw_object_expirer_core.cc
+ at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE at am__objects_45 = rgw/test_build_librgw-librgw.$(OBJEXT) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	rgw/test_build_librgw-rgw_acl.$(OBJEXT) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	rgw/test_build_librgw-rgw_acl_s3.$(OBJEXT) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	rgw/test_build_librgw-rgw_acl_swift.$(OBJEXT) \
@@ -4588,6 +4722,7 @@ am__test_build_librgw_SOURCES_DIST = test/buildtest_skeleton.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	rgw/test_build_librgw-rgw_rest_client.$(OBJEXT) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	rgw/test_build_librgw-rgw_rest_conn.$(OBJEXT) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	rgw/test_build_librgw-rgw_op.$(OBJEXT) \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	rgw/test_build_librgw-rgw_basic_types.$(OBJEXT) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	rgw/test_build_librgw-rgw_common.$(OBJEXT) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	rgw/test_build_librgw-rgw_cache.$(OBJEXT) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	rgw/test_build_librgw-rgw_formats.$(OBJEXT) \
@@ -4607,7 +4742,7 @@ am__test_build_librgw_SOURCES_DIST = test/buildtest_skeleton.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	rgw/test_build_librgw-rgw_dencoder.$(OBJEXT) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	rgw/test_build_librgw-rgw_object_expirer_core.$(OBJEXT)
 @ENABLE_CLIENT_TRUE@@WITH_BUILD_TESTS_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE at am_test_build_librgw_OBJECTS = test/test_build_librgw-buildtest_skeleton.$(OBJEXT) \
- at ENABLE_CLIENT_TRUE@@WITH_BUILD_TESTS_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(am__objects_44)
+ at ENABLE_CLIENT_TRUE@@WITH_BUILD_TESTS_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(am__objects_45)
 test_build_librgw_OBJECTS = $(am_test_build_librgw_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_BUILD_TESTS_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE at test_build_librgw_DEPENDENCIES = $(am__DEPENDENCIES_19) \
 @ENABLE_CLIENT_TRUE@@WITH_BUILD_TESTS_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_1) \
@@ -4909,7 +5044,7 @@ am__unittest_erasure_code_jerasure_SOURCES_DIST =  \
 	erasure-code/jerasure/gf-complete/src/gf_w8.c \
 	erasure-code/jerasure/ErasureCodePluginJerasure.cc \
 	erasure-code/jerasure/ErasureCodeJerasure.cc
-am__objects_45 = erasure-code/unittest_erasure_code_jerasure-ErasureCode.$(OBJEXT) \
+am__objects_46 = erasure-code/unittest_erasure_code_jerasure-ErasureCode.$(OBJEXT) \
 	erasure-code/jerasure/jerasure/src/unittest_erasure_code_jerasure-cauchy.$(OBJEXT) \
 	erasure-code/jerasure/jerasure/src/unittest_erasure_code_jerasure-galois.$(OBJEXT) \
 	erasure-code/jerasure/jerasure/src/unittest_erasure_code_jerasure-jerasure.$(OBJEXT) \
@@ -4929,7 +5064,7 @@ am__objects_45 = erasure-code/unittest_erasure_code_jerasure-ErasureCode.$(OBJEX
 	erasure-code/jerasure/unittest_erasure_code_jerasure-ErasureCodePluginJerasure.$(OBJEXT) \
 	erasure-code/jerasure/unittest_erasure_code_jerasure-ErasureCodeJerasure.$(OBJEXT)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am_unittest_erasure_code_jerasure_OBJECTS = test/erasure-code/unittest_erasure_code_jerasure-TestErasureCodeJerasure.$(OBJEXT) \
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__objects_45)
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__objects_46)
 unittest_erasure_code_jerasure_OBJECTS =  \
 	$(am_unittest_erasure_code_jerasure_OBJECTS)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at unittest_erasure_code_jerasure_DEPENDENCIES =  \
@@ -4947,12 +5082,12 @@ am__unittest_erasure_code_lrc_SOURCES_DIST =  \
 	erasure-code/ErasureCode.cc \
 	erasure-code/lrc/ErasureCodePluginLrc.cc \
 	erasure-code/lrc/ErasureCodeLrc.cc
-am__objects_46 =  \
+am__objects_47 =  \
 	erasure-code/unittest_erasure_code_lrc-ErasureCode.$(OBJEXT) \
 	erasure-code/lrc/unittest_erasure_code_lrc-ErasureCodePluginLrc.$(OBJEXT) \
 	erasure-code/lrc/unittest_erasure_code_lrc-ErasureCodeLrc.$(OBJEXT)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am_unittest_erasure_code_lrc_OBJECTS = test/erasure-code/unittest_erasure_code_lrc-TestErasureCodeLrc.$(OBJEXT) \
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__objects_46)
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__objects_47)
 unittest_erasure_code_lrc_OBJECTS =  \
 	$(am_unittest_erasure_code_lrc_OBJECTS)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at unittest_erasure_code_lrc_DEPENDENCIES =  \
@@ -5068,7 +5203,7 @@ am__unittest_erasure_code_shec_SOURCES_DIST =  \
 	erasure-code/jerasure/gf-complete/src/gf_w4.c \
 	erasure-code/jerasure/gf-complete/src/gf_rand.c \
 	erasure-code/jerasure/gf-complete/src/gf_w8.c
-am__objects_47 =  \
+am__objects_48 =  \
 	erasure-code/unittest_erasure_code_shec-ErasureCode.$(OBJEXT) \
 	erasure-code/shec/unittest_erasure_code_shec-ErasureCodePluginShec.$(OBJEXT) \
 	erasure-code/shec/unittest_erasure_code_shec-ErasureCodeShec.$(OBJEXT) \
@@ -5091,7 +5226,7 @@ am__objects_47 =  \
 	erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec-gf_rand.$(OBJEXT) \
 	erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec-gf_w8.$(OBJEXT)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am_unittest_erasure_code_shec_OBJECTS = test/erasure-code/unittest_erasure_code_shec-TestErasureCodeShec.$(OBJEXT) \
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__objects_47)
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__objects_48)
 unittest_erasure_code_shec_OBJECTS =  \
 	$(am_unittest_erasure_code_shec_OBJECTS)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at unittest_erasure_code_shec_DEPENDENCIES =  \
@@ -5127,7 +5262,7 @@ am__unittest_erasure_code_shec_all_SOURCES_DIST =  \
 	erasure-code/jerasure/gf-complete/src/gf_w4.c \
 	erasure-code/jerasure/gf-complete/src/gf_rand.c \
 	erasure-code/jerasure/gf-complete/src/gf_w8.c
-am__objects_48 = erasure-code/unittest_erasure_code_shec_all-ErasureCode.$(OBJEXT) \
+am__objects_49 = erasure-code/unittest_erasure_code_shec_all-ErasureCode.$(OBJEXT) \
 	erasure-code/shec/unittest_erasure_code_shec_all-ErasureCodePluginShec.$(OBJEXT) \
 	erasure-code/shec/unittest_erasure_code_shec_all-ErasureCodeShec.$(OBJEXT) \
 	erasure-code/shec/unittest_erasure_code_shec_all-ErasureCodeShecTableCache.$(OBJEXT) \
@@ -5149,7 +5284,7 @@ am__objects_48 = erasure-code/unittest_erasure_code_shec_all-ErasureCode.$(OBJEX
 	erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_all-gf_rand.$(OBJEXT) \
 	erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_all-gf_w8.$(OBJEXT)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am_unittest_erasure_code_shec_all_OBJECTS = test/erasure-code/unittest_erasure_code_shec_all-TestErasureCodeShec_all.$(OBJEXT) \
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__objects_48)
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__objects_49)
 unittest_erasure_code_shec_all_OBJECTS =  \
 	$(am_unittest_erasure_code_shec_all_OBJECTS)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at unittest_erasure_code_shec_all_DEPENDENCIES =  \
@@ -5185,7 +5320,7 @@ am__unittest_erasure_code_shec_arguments_SOURCES_DIST =  \
 	erasure-code/jerasure/gf-complete/src/gf_w4.c \
 	erasure-code/jerasure/gf-complete/src/gf_rand.c \
 	erasure-code/jerasure/gf-complete/src/gf_w8.c
-am__objects_49 = erasure-code/unittest_erasure_code_shec_arguments-ErasureCode.$(OBJEXT) \
+am__objects_50 = erasure-code/unittest_erasure_code_shec_arguments-ErasureCode.$(OBJEXT) \
 	erasure-code/shec/unittest_erasure_code_shec_arguments-ErasureCodePluginShec.$(OBJEXT) \
 	erasure-code/shec/unittest_erasure_code_shec_arguments-ErasureCodeShec.$(OBJEXT) \
 	erasure-code/shec/unittest_erasure_code_shec_arguments-ErasureCodeShecTableCache.$(OBJEXT) \
@@ -5207,7 +5342,7 @@ am__objects_49 = erasure-code/unittest_erasure_code_shec_arguments-ErasureCode.$
 	erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_rand.$(OBJEXT) \
 	erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_w8.$(OBJEXT)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am_unittest_erasure_code_shec_arguments_OBJECTS = test/erasure-code/unittest_erasure_code_shec_arguments-TestErasureCodeShec_arguments.$(OBJEXT) \
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__objects_49)
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__objects_50)
 unittest_erasure_code_shec_arguments_OBJECTS =  \
 	$(am_unittest_erasure_code_shec_arguments_OBJECTS)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at unittest_erasure_code_shec_arguments_DEPENDENCIES =  \
@@ -5243,7 +5378,7 @@ am__unittest_erasure_code_shec_thread_SOURCES_DIST =  \
 	erasure-code/jerasure/gf-complete/src/gf_w4.c \
 	erasure-code/jerasure/gf-complete/src/gf_rand.c \
 	erasure-code/jerasure/gf-complete/src/gf_w8.c
-am__objects_50 = erasure-code/unittest_erasure_code_shec_thread-ErasureCode.$(OBJEXT) \
+am__objects_51 = erasure-code/unittest_erasure_code_shec_thread-ErasureCode.$(OBJEXT) \
 	erasure-code/shec/unittest_erasure_code_shec_thread-ErasureCodePluginShec.$(OBJEXT) \
 	erasure-code/shec/unittest_erasure_code_shec_thread-ErasureCodeShec.$(OBJEXT) \
 	erasure-code/shec/unittest_erasure_code_shec_thread-ErasureCodeShecTableCache.$(OBJEXT) \
@@ -5265,7 +5400,7 @@ am__objects_50 = erasure-code/unittest_erasure_code_shec_thread-ErasureCode.$(OB
 	erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_thread-gf_rand.$(OBJEXT) \
 	erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_thread-gf_w8.$(OBJEXT)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am_unittest_erasure_code_shec_thread_OBJECTS = test/erasure-code/unittest_erasure_code_shec_thread-TestErasureCodeShec_thread.$(OBJEXT) \
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__objects_50)
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__objects_51)
 unittest_erasure_code_shec_thread_OBJECTS =  \
 	$(am_unittest_erasure_code_shec_thread_OBJECTS)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at unittest_erasure_code_shec_thread_DEPENDENCIES =  \
@@ -5432,9 +5567,41 @@ unittest_librados_config_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(unittest_librados_config_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
 	$(LDFLAGS) -o $@
 am__unittest_librbd_SOURCES_DIST = test/librbd/test_main.cc \
-	test/librbd/test_mock_fixture.cc
+	test/librbd/test_mock_fixture.cc \
+	test/librbd/test_mock_ExclusiveLock.cc \
+	test/librbd/exclusive_lock/test_mock_AcquireRequest.cc \
+	test/librbd/exclusive_lock/test_mock_ReleaseRequest.cc \
+	test/librbd/object_map/test_mock_InvalidateRequest.cc \
+	test/librbd/object_map/test_mock_LockRequest.cc \
+	test/librbd/object_map/test_mock_RefreshRequest.cc \
+	test/librbd/object_map/test_mock_ResizeRequest.cc \
+	test/librbd/object_map/test_mock_SnapshotCreateRequest.cc \
+	test/librbd/object_map/test_mock_SnapshotRemoveRequest.cc \
+	test/librbd/object_map/test_mock_SnapshotRollbackRequest.cc \
+	test/librbd/object_map/test_mock_UnlockRequest.cc \
+	test/librbd/object_map/test_mock_UpdateRequest.cc \
+	test/librbd/operation/test_mock_SnapshotCreateRequest.cc \
+	test/librbd/operation/test_mock_SnapshotProtectRequest.cc \
+	test/librbd/operation/test_mock_SnapshotRemoveRequest.cc \
+	test/librbd/operation/test_mock_SnapshotUnprotectRequest.cc
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am_unittest_librbd_OBJECTS = test/librbd/unittest_librbd-test_main.$(OBJEXT) \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/unittest_librbd-test_mock_fixture.$(OBJEXT)
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/unittest_librbd-test_mock_fixture.$(OBJEXT) \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/unittest_librbd-test_mock_ExclusiveLock.$(OBJEXT) \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/exclusive_lock/unittest_librbd-test_mock_AcquireRequest.$(OBJEXT) \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/exclusive_lock/unittest_librbd-test_mock_ReleaseRequest.$(OBJEXT) \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/object_map/unittest_librbd-test_mock_InvalidateRequest.$(OBJEXT) \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/object_map/unittest_librbd-test_mock_LockRequest.$(OBJEXT) \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/object_map/unittest_librbd-test_mock_RefreshRequest.$(OBJEXT) \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/object_map/unittest_librbd-test_mock_ResizeRequest.$(OBJEXT) \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/object_map/unittest_librbd-test_mock_SnapshotCreateRequest.$(OBJEXT) \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/object_map/unittest_librbd-test_mock_SnapshotRemoveRequest.$(OBJEXT) \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/object_map/unittest_librbd-test_mock_SnapshotRollbackRequest.$(OBJEXT) \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/object_map/unittest_librbd-test_mock_UnlockRequest.$(OBJEXT) \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/object_map/unittest_librbd-test_mock_UpdateRequest.$(OBJEXT) \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/operation/unittest_librbd-test_mock_SnapshotCreateRequest.$(OBJEXT) \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/operation/unittest_librbd-test_mock_SnapshotProtectRequest.$(OBJEXT) \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/operation/unittest_librbd-test_mock_SnapshotRemoveRequest.$(OBJEXT) \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/operation/unittest_librbd-test_mock_SnapshotUnprotectRequest.$(OBJEXT)
 unittest_librbd_OBJECTS = $(am_unittest_librbd_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at unittest_librbd_DEPENDENCIES = librbd_test.la \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd_api.la \
@@ -6079,9 +6246,10 @@ SOURCES = $(libcls_log_client_a_SOURCES) \
 	$(monmaptool_SOURCES) $(mount_ceph_SOURCES) \
 	$(osdmaptool_SOURCES) $(rados_SOURCES) $(radosgw_SOURCES) \
 	$(radosgw_admin_SOURCES) $(radosgw_object_expirer_SOURCES) \
-	$(rbd_SOURCES) $(rbd_fuse_SOURCES) $(rbd_replay_SOURCES) \
-	$(rbd_replay_prep_SOURCES) $(simple_client_SOURCES) \
-	$(simple_server_SOURCES) $(test_build_libcephfs_SOURCES) \
+	$(rbd_SOURCES) $(rbd_fuse_SOURCES) $(rbd_nbd_SOURCES) \
+	$(rbd_replay_SOURCES) $(rbd_replay_prep_SOURCES) \
+	$(simple_client_SOURCES) $(simple_server_SOURCES) \
+	$(test_build_libcephfs_SOURCES) \
 	$(test_build_libcommon_SOURCES) $(test_build_librados_SOURCES) \
 	$(test_build_librgw_SOURCES) $(unittest_addrs_SOURCES) \
 	$(unittest_admin_socket_SOURCES) $(unittest_arch_SOURCES) \
@@ -6288,7 +6456,7 @@ DIST_SOURCES = $(am__libcls_log_client_a_SOURCES_DIST) \
 	$(am__ceph_test_filestore_SOURCES_DIST) \
 	$(am__ceph_test_filestore_idempotent_SOURCES_DIST) \
 	$(am__ceph_test_filestore_idempotent_sequence_SOURCES_DIST) \
-	$(ceph_test_get_blkdev_size_SOURCES) \
+	$(am__ceph_test_get_blkdev_size_SOURCES_DIST) \
 	$(am__ceph_test_ioctls_SOURCES_DIST) \
 	$(am__ceph_test_keys_SOURCES_DIST) \
 	$(am__ceph_test_keyvaluedb_SOURCES_DIST) \
@@ -6350,7 +6518,7 @@ DIST_SOURCES = $(am__libcls_log_client_a_SOURCES_DIST) \
 	$(am__radosgw_admin_SOURCES_DIST) \
 	$(am__radosgw_object_expirer_SOURCES_DIST) \
 	$(am__rbd_SOURCES_DIST) $(am__rbd_fuse_SOURCES_DIST) \
-	$(am__rbd_replay_SOURCES_DIST) \
+	$(am__rbd_nbd_SOURCES_DIST) $(am__rbd_replay_SOURCES_DIST) \
 	$(am__rbd_replay_prep_SOURCES_DIST) \
 	$(am__simple_client_SOURCES_DIST) \
 	$(am__simple_server_SOURCES_DIST) \
@@ -6446,8 +6614,7 @@ am__can_run_installinfo = \
     *) (install-info --version) >/dev/null 2>&1;; \
   esac
 am__python_PYTHON_DIST = pybind/ceph_argparse.py pybind/ceph_daemon.py \
-	pybind/rados.py pybind/rbd.py pybind/cephfs.py \
-	pybind/ceph_rest_api.py
+	pybind/rados.py pybind/cephfs.py pybind/ceph_rest_api.py
 am__py_compile = PYTHON=$(PYTHON) $(SHELL) $(py_compile)
 am__pep3147_tweak = \
   sed -e 's|\.py$$||' -e 's|[^/]*$$|__pycache__/&.*.py|'
@@ -6609,7 +6776,8 @@ am__noinst_HEADERS_DIST = arch/intel.h arch/arm.h arch/probe.h \
 	common/Readahead.h common/Cycles.h common/Initialize.h \
 	common/ContextCompletion.h common/bit_vector.hpp \
 	common/SubProcess.h common/valgrind.h \
-	common/TracepointProvider.h common/address_helper.h \
+	common/TracepointProvider.h common/event_socket.h \
+	common/PluginRegistry.h common/address_helper.h \
 	common/secret.h msg/Connection.h msg/Dispatcher.h \
 	msg/Message.h msg/Messenger.h msg/SimplePolicyMessenger.h \
 	msg/msg_types.h msg/simple/Accepter.h \
@@ -6682,7 +6850,7 @@ am__noinst_HEADERS_DIST = arch/intel.h arch/arm.h arch/probe.h \
 	include/Context.h include/CompatSet.h include/Distribution.h \
 	include/Spinlock.h include/addr_parsing.h include/assert.h \
 	include/atomic.h include/bitmapper.h include/blobhash.h \
-	include/buffer.h include/byteorder.h \
+	include/buffer.h include/buffer_fwd.h include/byteorder.h \
 	include/cephfs/libcephfs.h include/ceph_features.h \
 	include/ceph_frag.h include/ceph_fs.h include/ceph_hash.h \
 	include/cmp.h include/color.h include/compat.h \
@@ -6701,6 +6869,7 @@ am__noinst_HEADERS_DIST = arch/intel.h arch/arm.h arch/probe.h \
 	include/rados/rados_types.hpp include/rados/librados.hpp \
 	include/rados/librgw.h include/rados/page.h \
 	include/rados/crc32c.h include/rados/buffer.h \
+	include/rados/buffer_fwd.h \
 	include/radosstriper/libradosstriper.h \
 	include/radosstriper/libradosstriper.hpp \
 	include/rbd/features.h include/rbd/librbd.h \
@@ -6708,7 +6877,7 @@ am__noinst_HEADERS_DIST = arch/intel.h arch/arm.h arch/probe.h \
 	include/util.h include/stat.h include/on_exit.h \
 	include/memory.h include/rados/memory.h \
 	include/unordered_set.h include/unordered_map.h \
-	include/timegm.h librados/snap_set_diff.h \
+	include/timegm.h include/event_type.h librados/snap_set_diff.h \
 	librados/AioCompletionImpl.h librados/IoCtxImpl.h \
 	librados/PoolAsyncCompletionImpl.h librados/RadosClient.h \
 	librados/RadosXattrIter.h librados/ListObjectImpl.h \
@@ -6722,39 +6891,63 @@ am__noinst_HEADERS_DIST = arch/intel.h arch/arm.h arch/probe.h \
 	journal/ReplayEntry.h journal/ReplayHandler.h journal/Utils.h \
 	librbd/AioCompletion.h librbd/AioImageRequest.h \
 	librbd/AioImageRequestWQ.h librbd/AioObjectRequest.h \
-	librbd/AsyncFlattenRequest.h librbd/AsyncObjectThrottle.h \
-	librbd/AsyncOperation.h librbd/AsyncRequest.h \
-	librbd/AsyncResizeRequest.h librbd/AsyncTrimRequest.h \
-	librbd/CopyupRequest.h librbd/DiffIterate.h librbd/ImageCtx.h \
-	librbd/ImageWatcher.h librbd/internal.h librbd/Journal.h \
-	librbd/JournalReplay.h librbd/JournalTypes.h \
+	librbd/AsyncObjectThrottle.h librbd/AsyncOperation.h \
+	librbd/AsyncRequest.h librbd/CopyupRequest.h \
+	librbd/DiffIterate.h librbd/ExclusiveLock.h librbd/ImageCtx.h \
+	librbd/ImageState.h librbd/ImageWatcher.h librbd/internal.h \
+	librbd/Journal.h librbd/JournalReplay.h librbd/JournalTypes.h \
 	librbd/LibrbdAdminSocketHook.h librbd/LibrbdWriteback.h \
-	librbd/ObjectMap.h librbd/parent_types.h \
-	librbd/RebuildObjectMapRequest.h librbd/SnapInfo.h \
-	librbd/TaskFinisher.h librbd/WatchNotifyTypes.h rgw/rgw_acl.h \
-	rgw/rgw_acl_s3.h rgw/rgw_acl_swift.h rgw/rgw_client_io.h \
-	rgw/rgw_fcgi.h rgw/rgw_xml.h rgw/rgw_cache.h rgw/rgw_common.h \
-	rgw/rgw_cors.h rgw/rgw_cors_s3.h rgw/rgw_cors_swift.h \
-	rgw/rgw_string.h rgw/rgw_formats.h rgw/rgw_http_errors.h \
-	rgw/rgw_log.h rgw/rgw_loadgen.h rgw/rgw_multi.h \
-	rgw/rgw_policy_s3.h rgw/rgw_gc.h rgw/rgw_metadata.h \
-	rgw/rgw_multi_del.h rgw/rgw_object_expirer_core.h rgw/rgw_op.h \
-	rgw/rgw_orphan.h rgw/rgw_http_client.h rgw/rgw_swift.h \
-	rgw/rgw_swift_auth.h rgw/rgw_quota.h rgw/rgw_rados.h \
-	rgw/rgw_replica_log.h rgw/rgw_resolve.h rgw/rgw_rest.h \
-	rgw/rgw_rest_swift.h rgw/rgw_rest_s3.h rgw/rgw_auth_s3.h \
-	rgw/rgw_rest_admin.h rgw/rgw_rest_usage.h rgw/rgw_rest_user.h \
-	rgw/rgw_rest_bucket.h rgw/rgw_rest_client.h \
-	rgw/rgw_rest_conn.h rgw/rgw_tools.h rgw/rgw_rest_metadata.h \
-	rgw/rgw_rest_log.h rgw/rgw_rest_opstate.h \
-	rgw/rgw_rest_replica_log.h rgw/rgw_rest_config.h \
-	rgw/rgw_usage.h rgw/rgw_user.h rgw/rgw_bucket.h \
-	rgw/rgw_keystone.h rgw/rgw_civetweb.h rgw/rgw_civetweb_log.h \
-	civetweb/civetweb.h civetweb/include/civetweb.h \
-	civetweb/include/civetweb_conf.h civetweb/src/md5.h \
-	cls/lock/cls_lock_types.h cls/lock/cls_lock_ops.h \
-	cls/lock/cls_lock_client.h cls/numops/cls_numops_client.h \
-	cls/rbd/cls_rbd.h cls/rbd/cls_rbd_client.h \
+	librbd/ObjectMap.h librbd/parent_types.h librbd/SnapInfo.h \
+	librbd/TaskFinisher.h librbd/Utils.h librbd/WatchNotifyTypes.h \
+	librbd/exclusive_lock/AcquireRequest.h \
+	librbd/exclusive_lock/ReleaseRequest.h \
+	librbd/image/CloseRequest.h librbd/image/OpenRequest.h \
+	librbd/image/RefreshParentRequest.h \
+	librbd/image/RefreshRequest.h librbd/image/SetSnapRequest.h \
+	librbd/object_map/InvalidateRequest.h \
+	librbd/object_map/LockRequest.h librbd/object_map/Request.h \
+	librbd/object_map/RefreshRequest.h \
+	librbd/object_map/ResizeRequest.h \
+	librbd/object_map/SnapshotCreateRequest.h \
+	librbd/object_map/SnapshotRemoveRequest.h \
+	librbd/object_map/SnapshotRollbackRequest.h \
+	librbd/object_map/UnlockRequest.h \
+	librbd/object_map/UpdateRequest.h \
+	librbd/operation/FlattenRequest.h \
+	librbd/operation/RebuildObjectMapRequest.h \
+	librbd/operation/RenameRequest.h librbd/operation/Request.h \
+	librbd/operation/ResizeRequest.h \
+	librbd/operation/SnapshotCreateRequest.h \
+	librbd/operation/SnapshotProtectRequest.h \
+	librbd/operation/SnapshotRemoveRequest.h \
+	librbd/operation/SnapshotRenameRequest.h \
+	librbd/operation/SnapshotRollbackRequest.h \
+	librbd/operation/SnapshotUnprotectRequest.h \
+	librbd/operation/TrimRequest.h rgw/rgw_acl.h rgw/rgw_acl_s3.h \
+	rgw/rgw_acl_swift.h rgw/rgw_client_io.h rgw/rgw_fcgi.h \
+	rgw/rgw_xml.h rgw/rgw_basic_types.h rgw/rgw_cache.h \
+	rgw/rgw_common.h rgw/rgw_cors.h rgw/rgw_cors_s3.h \
+	rgw/rgw_cors_swift.h rgw/rgw_string.h rgw/rgw_formats.h \
+	rgw/rgw_http_errors.h rgw/rgw_log.h rgw/rgw_loadgen.h \
+	rgw/rgw_multi.h rgw/rgw_policy_s3.h rgw/rgw_gc.h \
+	rgw/rgw_metadata.h rgw/rgw_multi_del.h \
+	rgw/rgw_object_expirer_core.h rgw/rgw_op.h rgw/rgw_orphan.h \
+	rgw/rgw_http_client.h rgw/rgw_swift.h rgw/rgw_swift_auth.h \
+	rgw/rgw_quota.h rgw/rgw_rados.h rgw/rgw_replica_log.h \
+	rgw/rgw_resolve.h rgw/rgw_rest.h rgw/rgw_rest_swift.h \
+	rgw/rgw_rest_s3.h rgw/rgw_auth_s3.h rgw/rgw_rest_admin.h \
+	rgw/rgw_rest_usage.h rgw/rgw_rest_user.h rgw/rgw_rest_bucket.h \
+	rgw/rgw_rest_client.h rgw/rgw_rest_conn.h rgw/rgw_tools.h \
+	rgw/rgw_rest_metadata.h rgw/rgw_rest_log.h \
+	rgw/rgw_rest_opstate.h rgw/rgw_rest_replica_log.h \
+	rgw/rgw_rest_config.h rgw/rgw_usage.h rgw/rgw_user.h \
+	rgw/rgw_bucket.h rgw/rgw_keystone.h rgw/rgw_civetweb.h \
+	rgw/rgw_civetweb_log.h civetweb/civetweb.h \
+	civetweb/include/civetweb.h civetweb/include/civetweb_conf.h \
+	civetweb/src/md5.h cls/lock/cls_lock_types.h \
+	cls/lock/cls_lock_ops.h cls/lock/cls_lock_client.h \
+	cls/numops/cls_numops_client.h cls/rbd/cls_rbd.h \
+	cls/rbd/cls_rbd_client.h cls/rbd/cls_rbd_types.h \
 	cls/refcount/cls_refcount_ops.h \
 	cls/refcount/cls_refcount_client.h \
 	cls/version/cls_version_types.h cls/version/cls_version_ops.h \
@@ -6798,11 +6991,17 @@ am__noinst_HEADERS_DIST = arch/intel.h arch/arm.h arch/probe.h \
 	test/librados_test_stub/TestMemIoCtxImpl.h \
 	test/librados_test_stub/TestIoCtxImpl.h \
 	test/librbd/test_fixture.h test/librbd/test_mock_fixture.h \
-	test/librbd/test_support.h test/librbd/mock/MockContextWQ.h \
+	test/librbd/test_support.h \
+	test/librbd/mock/MockAioImageRequestWQ.h \
+	test/librbd/mock/MockContextWQ.h \
+	test/librbd/mock/MockExclusiveLock.h \
 	test/librbd/mock/MockImageCtx.h \
 	test/librbd/mock/MockImageWatcher.h \
-	test/librbd/mock/MockObjectMap.h test/perf_helper.h \
-	test/bench/backend.h test/bench/bencher.h \
+	test/librbd/mock/MockJournal.h \
+	test/librbd/mock/MockObjectMap.h \
+	test/librbd/mock/MockReadahead.h \
+	test/librbd/object_map/mock/MockInvalidateRequest.h \
+	test/perf_helper.h test/bench/backend.h test/bench/bencher.h \
 	test/bench/detailed_stat_collector.h test/bench/distribution.h \
 	test/bench/dumb_backend.h test/bench/rados_backend.h \
 	test/bench/rbd_backend.h test/bench/stat_collector.h \
@@ -7074,9 +7273,9 @@ am__relativize = \
 ACLOCAL = @ACLOCAL@
 AMTAR = @AMTAR@
 AM_CXXFLAGS = @AM_CXXFLAGS@ $(AM_COMMON_CFLAGS) -ftemplate-depth-1024 \
-	-Wnon-virtual-dtor -Wno-invalid-offsetof $(HARDENING_CFLAGS) \
-	$(am__append_7) $(am__append_10) $(am__append_84) \
-	$(am__append_87)
+	-Wnon-virtual-dtor -Wno-invalid-offsetof $(am__append_7) \
+	$(am__append_10) $(am__append_13) $(am__append_87) \
+	$(am__append_90)
 AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@
 AR = @AR@
 ARM_CRC_FLAGS = @ARM_CRC_FLAGS@
@@ -7107,6 +7306,7 @@ CXXCPP = @CXXCPP@
 CXXDEPMODE = @CXXDEPMODE@
 CXXFLAGS = @CXXFLAGS@
 CYGPATH_W = @CYGPATH_W@
+CYTHON_CHECK = @CYTHON_CHECK@
 DEFS = @DEFS@
 DEPDIR = @DEPDIR@
 DLLTOOL = @DLLTOOL@
@@ -7193,6 +7393,7 @@ PYTHON_PLATFORM = @PYTHON_PLATFORM@
 PYTHON_PREFIX = @PYTHON_PREFIX@
 PYTHON_VERSION = @PYTHON_VERSION@
 RANLIB = @RANLIB@
+RDYNAMIC_FLAG = @RDYNAMIC_FLAG@
 RESOLV_LIBS = @RESOLV_LIBS@
 RPM_RELEASE = @RPM_RELEASE@
 SED = @SED@
@@ -7276,10 +7477,10 @@ user_rgw = @user_rgw@
 AUTOMAKE_OPTIONS = gnu subdir-objects
 SUBDIRS = ocf java
 DIST_SUBDIRS = gmock ocf java
-BUILT_SOURCES = $(am__append_229) $(am__append_249)
+BUILT_SOURCES = $(am__append_234) $(am__append_256)
 
 # extra bits
-EXTRA_DIST = $(am__append_25) ceph-detect-init/AUTHORS.rst \
+EXTRA_DIST = $(am__append_28) ceph-detect-init/AUTHORS.rst \
 	ceph-detect-init/ceph_detect_init/centos/__init__.py \
 	ceph-detect-init/ceph_detect_init/exc.py \
 	ceph-detect-init/ceph_detect_init/main.py \
@@ -7922,7 +8123,8 @@ EXTRA_DIST = $(am__append_25) ceph-detect-init/AUTHORS.rst \
 	rocksdb/DUMP_FORMAT.md rocksdb/INSTALL.md \
 	rocksdb/ROCKSDB_LITE.md rocksdb/WINDOWS_PORT.md \
 	rocksdb/appveyordailytests.yml rocksdb/AUTHORS \
-	tracing/tracing-common.h $(srcdir)/$(shell_scripts:%=%.in) \
+	tracing/tracing-common.h $(srcdir)/pybind/setup.py \
+	$(srcdir)/pybind/rbd.pyx $(srcdir)/$(shell_scripts:%=%.in) \
 	$(srcdir)/vstart.sh $(srcdir)/stop.sh ceph-run \
 	$(srcdir)/ceph-osd-prestart.sh $(srcdir)/ceph_common.sh \
 	$(srcdir)/init-radosgw $(srcdir)/init-rbdmap \
@@ -7982,10 +8184,10 @@ noinst_HEADERS = arch/intel.h arch/arm.h arch/probe.h \
 	crush/CrushWrapper.h crush/CrushWrapper.i crush/builder.h \
 	crush/crush.h crush/crush_compat.h crush/crush_ln_table.h \
 	crush/grammar.h crush/hash.h crush/mapper.h crush/sample.txt \
-	crush/types.h $(am__append_27) $(am__append_31) \
-	$(am__append_34) $(am__append_38) $(am__append_40) \
-	$(am__append_44) $(am__append_52) $(am__append_54) \
-	$(am__append_56) \
+	crush/types.h $(am__append_30) $(am__append_34) \
+	$(am__append_37) $(am__append_41) $(am__append_43) \
+	$(am__append_47) $(am__append_55) $(am__append_57) \
+	$(am__append_59) \
 	erasure-code/jerasure/gf-complete/include/gf_complete.h \
 	erasure-code/jerasure/gf-complete/include/gf_general.h \
 	erasure-code/jerasure/gf-complete/include/gf_int.h \
@@ -8015,11 +8217,11 @@ noinst_HEADERS = arch/intel.h arch/arm.h arch/probe.h \
 	erasure-code/jerasure/gf-complete/include/gf_rand.h \
 	erasure-code/jerasure/gf-complete/include/gf_method.h \
 	erasure-code/jerasure/gf-complete/include/gf_general.h \
-	$(am__append_74) erasure-code/ErasureCode.h \
+	$(am__append_77) erasure-code/ErasureCode.h \
 	erasure-code/ErasureCodeInterface.h \
 	erasure-code/ErasureCodePlugin.h osdc/Filer.h osdc/Journaler.h \
 	osdc/ObjectCacher.h osdc/Objecter.h osdc/Striper.h \
-	osdc/WritebackHandler.h $(am__append_78) $(am__append_80) \
+	osdc/WritebackHandler.h $(am__append_81) $(am__append_83) \
 	global/pidfile.h global/global_init.h global/global_context.h \
 	global/signal_handler.h json_spirit/json_spirit.h \
 	json_spirit/json_spirit_error_position.h \
@@ -8069,7 +8271,8 @@ noinst_HEADERS = arch/intel.h arch/arm.h arch/probe.h \
 	common/Readahead.h common/Cycles.h common/Initialize.h \
 	common/ContextCompletion.h common/bit_vector.hpp \
 	common/SubProcess.h common/valgrind.h \
-	common/TracepointProvider.h $(am__append_100) common/secret.h \
+	common/TracepointProvider.h common/event_socket.h \
+	common/PluginRegistry.h $(am__append_104) common/secret.h \
 	msg/Connection.h msg/Dispatcher.h msg/Message.h \
 	msg/Messenger.h msg/SimplePolicyMessenger.h msg/msg_types.h \
 	msg/simple/Accepter.h msg/simple/DispatchQueue.h \
@@ -8077,7 +8280,7 @@ noinst_HEADERS = arch/intel.h arch/arm.h arch/probe.h \
 	msg/simple/SimpleMessenger.h msg/async/AsyncConnection.h \
 	msg/async/AsyncMessenger.h msg/async/Event.h \
 	msg/async/EventEpoll.h msg/async/EventSelect.h \
-	msg/async/net_handler.h $(am__append_109) messages/MAuth.h \
+	msg/async/net_handler.h $(am__append_113) messages/MAuth.h \
 	messages/MAuthReply.h messages/MCacheExpire.h \
 	messages/MClientCaps.h messages/MClientCapRelease.h \
 	messages/MClientLease.h messages/MClientReconnect.h \
@@ -8138,7 +8341,7 @@ noinst_HEADERS = arch/intel.h arch/arm.h arch/probe.h \
 	include/Context.h include/CompatSet.h include/Distribution.h \
 	include/Spinlock.h include/addr_parsing.h include/assert.h \
 	include/atomic.h include/bitmapper.h include/blobhash.h \
-	include/buffer.h include/byteorder.h \
+	include/buffer.h include/buffer_fwd.h include/byteorder.h \
 	include/cephfs/libcephfs.h include/ceph_features.h \
 	include/ceph_frag.h include/ceph_fs.h include/ceph_hash.h \
 	include/cmp.h include/color.h include/compat.h \
@@ -8157,6 +8360,7 @@ noinst_HEADERS = arch/intel.h arch/arm.h arch/probe.h \
 	include/rados/rados_types.hpp include/rados/librados.hpp \
 	include/rados/librgw.h include/rados/page.h \
 	include/rados/crc32c.h include/rados/buffer.h \
+	include/rados/buffer_fwd.h \
 	include/radosstriper/libradosstriper.h \
 	include/radosstriper/libradosstriper.hpp \
 	include/rbd/features.h include/rbd/librbd.h \
@@ -8164,15 +8368,16 @@ noinst_HEADERS = arch/intel.h arch/arm.h arch/probe.h \
 	include/util.h include/stat.h include/on_exit.h \
 	include/memory.h include/rados/memory.h \
 	include/unordered_set.h include/unordered_map.h \
-	include/timegm.h $(am__append_115) $(am__append_118) \
-	$(am__append_119) $(am__append_124) $(am__append_130) \
-	$(am__append_134) $(am__append_137) $(am__append_138) \
-	$(am__append_144) $(am__append_166) $(am__append_182) \
-	$(am__append_188) $(am__append_200) test/bench/backend.h \
-	test/bench/bencher.h test/bench/detailed_stat_collector.h \
-	test/bench/distribution.h test/bench/dumb_backend.h \
-	test/bench/rados_backend.h test/bench/rbd_backend.h \
-	test/bench/stat_collector.h test/bench/testfilestore_backend.h \
+	include/timegm.h include/event_type.h $(am__append_119) \
+	$(am__append_122) $(am__append_123) $(am__append_128) \
+	$(am__append_134) $(am__append_138) $(am__append_141) \
+	$(am__append_142) $(am__append_148) $(am__append_170) \
+	$(am__append_186) $(am__append_192) $(am__append_204) \
+	test/bench/backend.h test/bench/bencher.h \
+	test/bench/detailed_stat_collector.h test/bench/distribution.h \
+	test/bench/dumb_backend.h test/bench/rados_backend.h \
+	test/bench/rbd_backend.h test/bench/stat_collector.h \
+	test/bench/testfilestore_backend.h \
 	test/common/ObjectContents.h test/encoding/types.h \
 	test/objectstore/DeterministicOpSequence.h \
 	test/objectstore/FileStoreDiff.h \
@@ -8191,7 +8396,7 @@ noinst_HEADERS = arch/intel.h arch/arm.h arch/probe.h \
 	test/system/st_rados_list_objects.h \
 	test/system/st_rados_notify.h test/system/st_rados_watch.h \
 	test/system/systest_runnable.h test/system/systest_settings.h \
-	test/unit.h test/journal/RadosTestFixture.h $(am__append_221) \
+	test/unit.h test/journal/RadosTestFixture.h $(am__append_226) \
 	tools/cephfs/JournalTool.h tools/cephfs/JournalScanner.h \
 	tools/cephfs/JournalFilter.h tools/cephfs/EventOutput.h \
 	tools/cephfs/Resetter.h tools/cephfs/Dumper.h \
@@ -8204,42 +8409,42 @@ noinst_HEADERS = arch/intel.h arch/arm.h arch/probe.h \
 	sample.ceph.conf bash_completion/ceph bash_completion/rados \
 	bash_completion/rbd bash_completion/radosgw-admin \
 	mount/canonicalize.c mount/mtab.c objclass/objclass.h
-bin_SCRIPTS = $(am__append_24) $(am__append_236) $(am__append_246) \
-	$(am__append_254)
+bin_SCRIPTS = $(am__append_27) $(am__append_244) $(am__append_253) \
+	$(am__append_261)
 sbin_SCRIPTS = 
-su_sbin_SCRIPTS = $(am__append_251)
+su_sbin_SCRIPTS = $(am__append_258)
 dist_bin_SCRIPTS = 
-lib_LTLIBRARIES = $(am__append_114) $(am__append_117) \
-	$(am__append_123) $(am__append_228) $(am__append_244) \
-	$(am__append_245)
+lib_LTLIBRARIES = $(am__append_118) $(am__append_121) \
+	$(am__append_127) $(am__append_233) $(am__append_251) \
+	$(am__append_252)
 noinst_LTLIBRARIES = libarch.la libauth.la libcrush.la libmon_types.la \
-	$(am__append_43) libosd_types.la liberasure_code.la libosdc.la \
-	$(am__append_77) $(am__append_79) libglobal.la \
+	$(am__append_46) libosd_types.la liberasure_code.la libosdc.la \
+	$(am__append_80) $(am__append_82) libglobal.la \
 	libjson_spirit.la liblog.la libperfglue.la \
-	libcommon_internal.la libcommon_crc.la $(am__append_98) \
-	libcommon.la $(am__append_101) libmsg.la $(am__append_110) \
-	librbd_types.la $(am__append_121) $(am__append_126) \
-	$(am__append_131) $(am__append_139) $(am__append_175) \
-	$(am__append_185) $(am__append_190) $(am__append_216) \
-	libcompressor.la $(am__append_238)
-noinst_LIBRARIES = $(am__append_26) $(am__append_39) libos_types.a \
-	$(am__append_51) $(am__append_53) $(am__append_55) \
-	$(am__append_133)
-radoslib_LTLIBRARIES = $(am__append_135) $(am__append_136)
+	libcommon_internal.la libcommon_crc.la $(am__append_102) \
+	libcommon.la $(am__append_105) libmsg.la $(am__append_114) \
+	librbd_types.la $(am__append_125) $(am__append_130) \
+	$(am__append_135) $(am__append_143) $(am__append_179) \
+	$(am__append_189) $(am__append_194) $(am__append_220) \
+	libcompressor.la $(am__append_245)
+noinst_LIBRARIES = $(am__append_29) $(am__append_42) libos_types.a \
+	$(am__append_54) $(am__append_56) $(am__append_58) \
+	$(am__append_137)
+radoslib_LTLIBRARIES = $(am__append_139) $(am__append_140)
 
 # like bin_PROGRAMS, but these targets are only built for debug builds
-bin_DEBUGPROGRAMS = $(am__append_81) $(am__append_129) \
-	$(am__append_146) $(am__append_176) $(am__append_177) \
-	$(am__append_178) $(am__append_179) $(am__append_181) \
-	$(am__append_183) $(am__append_189) $(am__append_191) \
-	$(am__append_192) $(am__append_195) $(am__append_197) \
-	$(am__append_198) $(am__append_199) $(am__append_201) \
-	$(am__append_202) $(am__append_203) $(am__append_204) \
-	$(am__append_210) ceph_test_timers ceph_test_signal_handlers \
-	ceph_test_rewrite_latency ceph_test_crypto $(am__append_215) \
+bin_DEBUGPROGRAMS = $(am__append_84) $(am__append_133) \
+	$(am__append_150) $(am__append_180) $(am__append_181) \
+	$(am__append_182) $(am__append_183) $(am__append_185) \
+	$(am__append_187) $(am__append_193) $(am__append_195) \
+	$(am__append_196) $(am__append_199) $(am__append_201) \
+	$(am__append_202) $(am__append_203) $(am__append_205) \
+	$(am__append_206) $(am__append_207) $(am__append_208) \
+	$(am__append_214) ceph_test_timers ceph_test_signal_handlers \
+	ceph_test_rewrite_latency ceph_test_crypto $(am__append_219) \
 	ceph_bench_log ceph_test_objectcacher_stress \
-	ceph_test_cfuse_cache_invalidate ceph_test_get_blkdev_size \
-	$(am__append_219) $(am__append_223) $(am__append_224) \
+	ceph_test_cfuse_cache_invalidate $(am__append_223) \
+	$(am__append_224) $(am__append_228) $(am__append_229) \
 	ceph_psim
 
 # like sbin_SCRIPTS but can be used to install to e.g. /usr/sbin
@@ -8249,11 +8454,11 @@ ceph_sbindir = $(sbindir)
 su_sbindir = /sbin
 
 # C/C++ tests to build and executed will be appended to this
-check_TESTPROGRAMS = $(am__append_152) $(am__append_156) \
-	$(am__append_159) $(am__append_180) $(am__append_184) \
-	$(am__append_193) $(am__append_206) $(am__append_207) \
-	$(am__append_211) $(am__append_212) $(am__append_213) \
-	$(am__append_214) unittest_addrs $(am__append_218) \
+check_TESTPROGRAMS = $(am__append_156) $(am__append_160) \
+	$(am__append_163) $(am__append_184) $(am__append_188) \
+	$(am__append_197) $(am__append_210) $(am__append_211) \
+	$(am__append_215) $(am__append_216) $(am__append_217) \
+	$(am__append_218) unittest_addrs $(am__append_222) \
 	unittest_bloom_filter unittest_histogram \
 	unittest_prioritized_queue unittest_str_map \
 	unittest_sharedptr_registry unittest_shared_cache \
@@ -8297,8 +8502,8 @@ check_TESTPROGRAMS = $(am__append_152) $(am__append_156) \
 # You should have received a copy of the GNU Affero General Public License
 # along with this program.  If not, see `<http://www.gnu.org/licenses/>`.
 #
-check_SCRIPTS = ceph-detect-init/run-tox.sh $(am__append_143) \
-	$(am__append_187) test/ceph_objectstore_tool.py \
+check_SCRIPTS = ceph-detect-init/run-tox.sh $(am__append_147) \
+	$(am__append_191) test/ceph_objectstore_tool.py \
 	test/test-ceph-helpers.sh test/cephtool-test-osd.sh \
 	test/cephtool-test-mon.sh test/cephtool-test-mds.sh \
 	test/cephtool-test-rados.sh unittest_bufferlist.sh \
@@ -8322,6 +8527,7 @@ HARDENING_CFLAGS = \
                    -g \
                    -pipe \
                    -Wall \
+                   -Wp,-U_FORTIFY_SOURCE \
                    -Wp,-D_FORTIFY_SOURCE=2 \
                    -fexceptions \
                    --param=ssp-buffer-size=4 \
@@ -8348,8 +8554,8 @@ AM_COMMON_CFLAGS = \
 	-fno-strict-aliasing \
 	-fsigned-char
 
-AM_CFLAGS = $(AM_COMMON_CFLAGS) $(HARDENING_CFLAGS) $(am__append_9) \
-	$(am__append_83) $(am__append_86)
+AM_CFLAGS = $(AM_COMMON_CFLAGS) $(am__append_6) $(am__append_12) \
+	$(am__append_86) $(am__append_89)
 AM_CPPFLAGS = $(AM_COMMON_CPPFLAGS)
 
 # note: this is position dependant, it affects the -l options that
@@ -8360,45 +8566,45 @@ AM_CPPFLAGS = $(AM_COMMON_CPPFLAGS)
 # http://www.gentoo.org/proj/en/qa/asneeded.xml
 # http://gcc.gnu.org/ml/gcc-help/2010-12/msg00338.html
 # http://sigquit.wordpress.com/2011/02/16/why-asneeded-doesnt-work-as-expected-for-your-libraries-on-your-autotools-project/
-AM_LDFLAGS = $(am__append_6) $(am__append_8)
+AM_LDFLAGS = $(am__append_8) $(am__append_9) $(am__append_11)
 AM_CCASFLAGS = -f elf64
 
 #####################
-EXTRALIBS = -lm $(am__append_11) $(am__append_12) $(am__append_13) \
-	$(am__append_22)
+EXTRALIBS = -lm $(am__append_14) $(am__append_15) $(am__append_16) \
+	$(am__append_25)
 LIBGLOBAL = libglobal.la
 LIBCOMMON = libcommon.la
 LIBSECRET = libsecret.la
 LIBARCH = libarch.la
-LIBPERFGLUE = libperfglue.la $(am__append_16) $(am__append_17)
+LIBPERFGLUE = libperfglue.la $(am__append_19) $(am__append_20)
 LIBAUTH = libauth.la
 LIBMSG = libmsg.la
 LIBCRUSH = libcrush.la
 LIBCOMPRESSOR = libcompressor.la -lsnappy
 LIBJSON_SPIRIT = libjson_spirit.la
-LIBKV = libkv.a $(am__append_23) -lbz2 -lz -lleveldb -lsnappy
+LIBKV = libkv.a $(am__append_26) -lbz2 -lz -lleveldb -lsnappy
 LIBLOG = liblog.la
-LIBOS = libos.a $(am__append_14) $(am__append_15) $(LIBOS_TYPES) \
+LIBOS = libos.a $(am__append_17) $(am__append_18) $(LIBOS_TYPES) \
 	$(LIBKV)
 LIBOS_TYPES = libos_types.a
 
 # Libosd always needs osdc and os
 
 # OSD needs types
-LIBOSD = libosd.a $(am__append_19) $(LIBOSDC) $(LIBOS) $(LIBPERFGLUE) \
+LIBOSD = libosd.a $(am__append_22) $(LIBOSDC) $(LIBOS) $(LIBPERFGLUE) \
 	$(LIBOSD_TYPES) $(LIBOS_TYPES)
 LIBOSD_TYPES = libosd_types.la
 LIBOSDC = libosdc.la
 
 # These have references to syms like ceph_using_tcmalloc(), glue libperfglue to them
-LIBMON = libmon.a $(am__append_18) $(LIBPERFGLUE) $(LIBMON_TYPES)
+LIBMON = libmon.a $(am__append_21) $(LIBPERFGLUE) $(LIBMON_TYPES)
 LIBMON_TYPES = libmon_types.la
-LIBMDS = libmds.la $(am__append_20) $(LIBPERFGLUE)
+LIBMDS = libmds.la $(am__append_23) $(LIBPERFGLUE)
 LIBCLIENT = libclient.la
 LIBCLIENT_FUSE = libclient_fuse.la
 LIBRADOS = librados.la
 LIBRADOSSTRIPER = libradosstriper.la
-LIBRGW = librgw.la $(am__append_21)
+LIBRGW = librgw.la $(am__append_24)
 LIBCIVETWEB = libcivetweb.la
 LIBRBD = librbd.la
 LIBRBD_TYPES = librbd_types.la
@@ -8413,20 +8619,26 @@ CEPH_GLOBAL = $(LIBGLOBAL) $(LIBCOMMON) $(PTHREAD_LIBS) -lm $(CRYPTO_LIBS) $(EXT
 
 # important; libmsg before libauth!
 LIBCOMMON_DEPS = libcommon_internal.la libcommon_crc.la \
-	$(am__append_97) $(LIBERASURE_CODE) $(LIBMSG) $(LIBAUTH) \
+	$(am__append_101) $(LIBERASURE_CODE) $(LIBMSG) $(LIBAUTH) \
 	$(LIBCRUSH) $(LIBJSON_SPIRIT) $(LIBLOG) $(LIBARCH) \
-	$(BOOST_RANDOM_LIBS) $(am__append_99)
-LIBRADOS_DEPS = $(am__append_111)
-LIBRGW_DEPS = $(am__append_127)
+	$(BOOST_RANDOM_LIBS) $(am__append_103)
+LIBRADOS_DEPS = $(am__append_115)
+LIBRGW_DEPS = $(am__append_131)
 
 # This is used by the dencoder test
 
 # Do not use TCMALLOC with dencoder
-DENCODER_SOURCES = $(am__append_41) perfglue/disabled_heap_profiler.cc \
-	perfglue/disabled_stubs.cc $(am__append_125)
-DENCODER_DEPS = $(am__append_42) $(am__append_120) $(am__append_132) \
-	$(am__append_140)
+DENCODER_SOURCES = $(am__append_44) perfglue/disabled_heap_profiler.cc \
+	perfglue/disabled_stubs.cc $(am__append_129)
+DENCODER_DEPS = $(am__append_45) $(am__append_124) $(am__append_136) \
+	$(am__append_144)
 radoslibdir = $(libdir)/rados-classes
+LOCAL_ALL = ceph-detect-init-all $(am__append_235)
+LOCAL_CLEAN = ceph-detect-init-clean $(am__append_236) \
+	base-clean-local
+LOCAL_INSTALLDATA = ceph-detect-init-install-data \
+	base-install-data-local
+LOCAL_INSTALLEXEC = $(am__append_237)
 libarch_la_SOURCES = \
 	arch/intel.c \
 	arch/arm.c \
@@ -8460,13 +8672,13 @@ libcrush_la_SOURCES = \
 	crush/CrushTester.cc
 
 @ENABLE_SERVER_TRUE@libkv_a_SOURCES = kv/KeyValueDB.cc \
-@ENABLE_SERVER_TRUE@	kv/LevelDBStore.cc $(am__append_29) \
-@ENABLE_SERVER_TRUE@	$(am__append_32) $(am__append_35)
+@ENABLE_SERVER_TRUE@	kv/LevelDBStore.cc $(am__append_32) \
+@ENABLE_SERVER_TRUE@	$(am__append_35) $(am__append_38)
 @ENABLE_SERVER_TRUE@libkv_a_CXXFLAGS = ${AM_CXXFLAGS} -I \
-@ENABLE_SERVER_TRUE@	rocksdb/include $(am__append_28) \
-@ENABLE_SERVER_TRUE@	$(am__append_36)
-@ENABLE_SERVER_TRUE@libkv_a_LIBADD = $(am__append_30) $(am__append_33) \
-@ENABLE_SERVER_TRUE@	$(am__append_37)
+@ENABLE_SERVER_TRUE@	rocksdb/include $(am__append_31) \
+@ENABLE_SERVER_TRUE@	$(am__append_39)
+@ENABLE_SERVER_TRUE@libkv_a_LIBADD = $(am__append_33) $(am__append_36) \
+@ENABLE_SERVER_TRUE@	$(am__append_40)
 libmon_types_la_SOURCES = \
 	mon/PGMap.cc
 
@@ -8523,7 +8735,7 @@ LIBMDS_SOURCES = \
 LIBMDS_DEPS = $(LIBOSDC)
 @ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@libmds_la_SOURCES = $(LIBMDS_SOURCES)
 @ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@libmds_la_LIBADD = $(LIBMDS_DEPS)
-libos_types_a_SOURCES = os/Transaction.cc $(am__append_46)
+libos_types_a_SOURCES = os/Transaction.cc $(am__append_49)
 libos_types_a_CXXFLAGS = ${AM_CXXFLAGS}
 @ENABLE_SERVER_TRUE@libos_a_SOURCES = os/chain_xattr.cc os/fs/FS.cc \
 @ENABLE_SERVER_TRUE@	os/DBObjectMap.cc os/GenericObjectMap.cc \
@@ -8533,10 +8745,10 @@ libos_types_a_CXXFLAGS = ${AM_CXXFLAGS}
 @ENABLE_SERVER_TRUE@	os/JournalingObjectStore.cc os/LFNIndex.cc \
 @ENABLE_SERVER_TRUE@	os/MemStore.cc os/KeyValueStore.cc \
 @ENABLE_SERVER_TRUE@	os/ObjectStore.cc os/WBThrottle.cc \
-@ENABLE_SERVER_TRUE@	$(am__append_45) $(am__append_47) \
-@ENABLE_SERVER_TRUE@	$(am__append_48) $(am__append_49)
+@ENABLE_SERVER_TRUE@	$(am__append_48) $(am__append_50) \
+@ENABLE_SERVER_TRUE@	$(am__append_51) $(am__append_52)
 @ENABLE_SERVER_TRUE@libos_a_LIBADD = libos_types.a libkv.a \
-@ENABLE_SERVER_TRUE@	$(am__append_50)
+@ENABLE_SERVER_TRUE@	$(am__append_53)
 @ENABLE_SERVER_TRUE@@WITH_LIBZFS_TRUE@libos_zfs_a_SOURCES = os/ZFS.cc
 @ENABLE_SERVER_TRUE@@WITH_LIBZFS_TRUE@libos_zfs_a_CXXFLAGS = ${AM_CXXFLAGS} ${LIBZFS_CFLAGS}
 libosd_types_la_SOURCES = \
@@ -8566,10 +8778,10 @@ libosd_types_la_CXXFLAGS = ${AM_CXXFLAGS}
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@libosd_a_LIBADD = 
 erasure_codelibdir = $(pkglibdir)/erasure-code
 erasure_codelib_LTLIBRARIES = libec_jerasure_generic.la \
-	$(am__append_59) $(am__append_61) $(am__append_63) \
+	$(am__append_62) $(am__append_64) $(am__append_66) \
 	libec_jerasure.la libec_lrc.la libec_shec_generic.la \
-	$(am__append_68) $(am__append_70) $(am__append_72) \
-	libec_shec.la $(am__append_76) $(am__append_150)
+	$(am__append_71) $(am__append_73) $(am__append_75) \
+	libec_shec.la $(am__append_79) $(am__append_154)
 jerasure_sources = \
   erasure-code/ErasureCode.cc \
   erasure-code/jerasure/jerasure/src/cauchy.c \
@@ -8602,7 +8814,7 @@ libec_jerasure_generic_la_CXXFLAGS = ${AM_CXXFLAGS} \
 
 libec_jerasure_generic_la_LIBADD = $(LIBCRUSH) $(PTHREAD_LIBS) $(EXTRALIBS)
 libec_jerasure_generic_la_LDFLAGS = ${AM_LDFLAGS} -module \
-	-avoid-version -shared $(am__append_57)
+	-avoid-version -shared $(am__append_60)
 libec_jerasure_neon_la_SOURCES = ${jerasure_sources}                                       \
                                   erasure-code/jerasure/gf-complete/src/neon/gf_w4_neon.c  \
                                   erasure-code/jerasure/gf-complete/src/neon/gf_w8_neon.c  \
@@ -8622,7 +8834,7 @@ libec_jerasure_neon_la_CXXFLAGS = ${AM_CXXFLAGS} \
 
 libec_jerasure_neon_la_LIBADD = $(LIBCRUSH) $(PTHREAD_LIBS) $(EXTRALIBS)
 libec_jerasure_neon_la_LDFLAGS = ${AM_LDFLAGS} -module -avoid-version \
-	-shared $(am__append_58)
+	-shared $(am__append_61)
 libec_jerasure_sse3_la_SOURCES = ${jerasure_sources}
 libec_jerasure_sse3_la_CFLAGS = ${AM_CFLAGS}  \
 	${INTEL_SSE_FLAGS} \
@@ -8642,7 +8854,7 @@ libec_jerasure_sse3_la_CXXFLAGS = ${AM_CXXFLAGS} \
 
 libec_jerasure_sse3_la_LIBADD = $(LIBCRUSH) $(PTHREAD_LIBS) $(EXTRALIBS)
 libec_jerasure_sse3_la_LDFLAGS = ${AM_LDFLAGS} -module -avoid-version \
-	-shared $(am__append_60)
+	-shared $(am__append_63)
 libec_jerasure_sse4_la_SOURCES = ${jerasure_sources}
 libec_jerasure_sse4_la_CFLAGS = ${AM_CFLAGS}  \
 	${INTEL_SSE_FLAGS} \
@@ -8666,7 +8878,7 @@ libec_jerasure_sse4_la_CXXFLAGS = ${AM_CXXFLAGS} \
 
 libec_jerasure_sse4_la_LIBADD = $(LIBCRUSH) $(PTHREAD_LIBS) $(EXTRALIBS)
 libec_jerasure_sse4_la_LDFLAGS = ${AM_LDFLAGS} -module -avoid-version \
-	-shared $(am__append_62)
+	-shared $(am__append_65)
 libec_jerasure_la_SOURCES = \
 	erasure-code/jerasure/ErasureCodePluginSelectJerasure.cc
 
@@ -8674,7 +8886,7 @@ libec_jerasure_la_CFLAGS = ${AM_CFLAGS}
 libec_jerasure_la_CXXFLAGS = ${AM_CXXFLAGS}
 libec_jerasure_la_LIBADD = $(LIBCRUSH) $(PTHREAD_LIBS) $(EXTRALIBS)
 libec_jerasure_la_LDFLAGS = ${AM_LDFLAGS} -module -avoid-version \
-	-shared $(am__append_64)
+	-shared $(am__append_67)
 lrc_sources = \
   erasure-code/ErasureCode.cc \
   erasure-code/lrc/ErasureCodePluginLrc.cc \
@@ -8685,7 +8897,7 @@ libec_lrc_la_CFLAGS = ${AM_CFLAGS}
 libec_lrc_la_CXXFLAGS = ${AM_CXXFLAGS}
 libec_lrc_la_LIBADD = $(LIBCRUSH) $(PTHREAD_LIBS) $(LIBJSON_SPIRIT)
 libec_lrc_la_LDFLAGS = ${AM_LDFLAGS} -module -avoid-version -shared \
-	$(am__append_65)
+	$(am__append_68)
 
 # SHEC plugin
 shec_sources = \
@@ -8726,7 +8938,7 @@ libec_shec_generic_la_CXXFLAGS = ${AM_CXXFLAGS} \
 
 libec_shec_generic_la_LIBADD = $(LIBCRUSH) $(PTHREAD_LIBS) $(EXTRALIBS)
 libec_shec_generic_la_LDFLAGS = ${AM_LDFLAGS} -module -avoid-version \
-	-shared $(am__append_66)
+	-shared $(am__append_69)
 libec_shec_neon_la_SOURCES = ${shec_sources} \
 	erasure-code/jerasure/gf-complete/src/neon/gf_w4_neon.c \
 	erasure-code/jerasure/gf-complete/src/neon/gf_w8_neon.c \
@@ -8750,7 +8962,7 @@ libec_shec_neon_la_CXXFLAGS = ${AM_CXXFLAGS} \
 
 libec_shec_neon_la_LIBADD = $(LIBCRUSH) $(PTHREAD_LIBS) $(EXTRALIBS)
 libec_shec_neon_la_LDFLAGS = ${AM_LDFLAGS} -module -avoid-version \
-	-shared $(am__append_67)
+	-shared $(am__append_70)
 libec_shec_sse3_la_SOURCES = ${shec_sources}
 libec_shec_sse3_la_CFLAGS = ${AM_CFLAGS}  \
 	${INTEL_SSE_FLAGS} \
@@ -8774,7 +8986,7 @@ libec_shec_sse3_la_CXXFLAGS = ${AM_CXXFLAGS} \
 
 libec_shec_sse3_la_LIBADD = $(LIBCRUSH) $(PTHREAD_LIBS) $(EXTRALIBS)
 libec_shec_sse3_la_LDFLAGS = ${AM_LDFLAGS} -module -avoid-version \
-	-shared $(am__append_69)
+	-shared $(am__append_72)
 libec_shec_sse4_la_SOURCES = ${shec_sources}
 libec_shec_sse4_la_CFLAGS = ${AM_CFLAGS}  \
 	${INTEL_SSE_FLAGS} \
@@ -8802,7 +9014,7 @@ libec_shec_sse4_la_CXXFLAGS = ${AM_CXXFLAGS} \
 
 libec_shec_sse4_la_LIBADD = $(LIBCRUSH) $(PTHREAD_LIBS) $(EXTRALIBS)
 libec_shec_sse4_la_LDFLAGS = ${AM_LDFLAGS} -module -avoid-version \
-	-shared $(am__append_71)
+	-shared $(am__append_74)
 libec_shec_la_SOURCES = \
 	erasure-code/shec/ErasureCodePluginSelectShec.cc
 
@@ -8810,7 +9022,7 @@ libec_shec_la_CFLAGS = ${AM_CFLAGS}
 libec_shec_la_CXXFLAGS = ${AM_CXXFLAGS}
 libec_shec_la_LIBADD = $(LIBCRUSH) $(PTHREAD_LIBS) $(EXTRALIBS)
 libec_shec_la_LDFLAGS = ${AM_LDFLAGS} -module -avoid-version -shared \
-	$(am__append_73)
+	$(am__append_76)
 @WITH_BETTER_YASM_ELF64_TRUE@isa_sources = \
 @WITH_BETTER_YASM_ELF64_TRUE@	erasure-code/ErasureCode.cc \
 @WITH_BETTER_YASM_ELF64_TRUE@	erasure-code/isa/isa-l/erasure_code/ec_base.c \
@@ -8866,7 +9078,7 @@ libec_shec_la_LDFLAGS = ${AM_LDFLAGS} -module -avoid-version -shared \
 @WITH_BETTER_YASM_ELF64_TRUE@libec_isa_la_LIBADD = $(LIBCRUSH) $(PTHREAD_LIBS) $(EXTRALIBS)
 @WITH_BETTER_YASM_ELF64_TRUE@libec_isa_la_LDFLAGS = ${AM_LDFLAGS} \
 @WITH_BETTER_YASM_ELF64_TRUE@	-module -avoid-version -shared \
-@WITH_BETTER_YASM_ELF64_TRUE@	$(am__append_75)
+@WITH_BETTER_YASM_ELF64_TRUE@	$(am__append_78)
 @WITH_BETTER_YASM_ELF64_TRUE@libec_isa_la_LIBTOOLFLAGS = --tag=CC
 liberasure_code_la_SOURCES = \
 	erasure-code/ErasureCodePlugin.cc
@@ -8911,8 +9123,8 @@ liblog_la_SOURCES = \
 	log/Log.cc \
 	log/SubsystemMap.cc
 
-libperfglue_la_SOURCES = $(am__append_82) $(am__append_85) \
-	$(am__append_88) $(am__append_89) $(am__append_90)
+libperfglue_la_SOURCES = $(am__append_85) $(am__append_88) \
+	$(am__append_91) $(am__append_92) $(am__append_93)
 @WITH_TCMALLOC_FALSE@@WITH_TCMALLOC_MINIMAL_TRUE@libperfglue_la_LIBADD = -ltcmalloc_minimal
 @WITH_TCMALLOC_TRUE@libperfglue_la_LIBADD = -ltcmalloc
 
@@ -8942,8 +9154,9 @@ libcommon_internal_la_SOURCES = ceph_ver.c common/DecayCounter.cc \
 	common/ceph_frag.cc common/addr_parsing.c common/hobject.cc \
 	common/bloom_filter.cc common/module.c common/Readahead.cc \
 	common/Cycles.cc common/ContextCompletion.cc \
-	common/TracepointProvider.cc $(am__append_91) $(am__append_92) \
-	$(am__append_93) $(am__append_94) $(am__append_95) \
+	common/TracepointProvider.cc common/PluginRegistry.cc \
+	$(am__append_94) $(am__append_95) $(am__append_96) \
+	$(am__append_97) $(am__append_98) $(am__append_99) \
 	mon/MonCap.cc mon/MonClient.cc mon/MonMap.cc osd/OSDMap.cc \
 	osd/osd_types.cc osd/ECMsgTypes.cc osd/HitSet.cc mds/MDSMap.cc \
 	mds/inode_backtrace.cc mds/mdstypes.cc mds/flock.cc
@@ -8951,7 +9164,7 @@ libcommon_internal_la_SOURCES = ceph_ver.c common/DecayCounter.cc \
 # inject crc in common
 libcommon_crc_la_SOURCES = common/sctp_crc32.c common/crc32c.cc \
 	common/crc32c_intel_baseline.c common/crc32c_intel_fast.c \
-	$(am__append_96)
+	$(am__append_100)
 @WITH_GOOD_YASM_ELF64_TRUE at libcommon_crc_la_LIBTOOLFLAGS = --tag=CC
 @HAVE_ARMV8_CRC_TRUE at libcommon_crc_aarch64_la_SOURCES = common/crc32c_aarch64.c
 @HAVE_ARMV8_CRC_TRUE at libcommon_crc_aarch64_la_CFLAGS = $(AM_CFLAGS) $(ARM_CRC_FLAGS)
@@ -8965,9 +9178,9 @@ libmsg_la_SOURCES = msg/Message.cc msg/Messenger.cc msg/msg_types.cc \
 	msg/simple/SimpleMessenger.cc msg/async/AsyncConnection.cc \
 	msg/async/AsyncMessenger.cc msg/async/Event.cc \
 	msg/async/net_handler.cc msg/async/EventSelect.cc \
-	$(am__append_102) $(am__append_103) $(am__append_104) \
-	$(am__append_105) $(am__append_106) $(am__append_107) \
-	$(am__append_108)
+	$(am__append_106) $(am__append_107) $(am__append_108) \
+	$(am__append_109) $(am__append_110) $(am__append_111) \
+	$(am__append_112)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at rados_includedir = $(includedir)/rados
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at rados_include_DATA = \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(srcdir)/include/rados/librados.h \
@@ -8975,6 +9188,7 @@ libmsg_la_SOURCES = msg/Message.cc msg/Messenger.cc msg/msg_types.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(srcdir)/include/rados/rados_types.hpp \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(srcdir)/include/rados/librados.hpp \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(srcdir)/include/buffer.h \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(srcdir)/include/buffer_fwd.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(srcdir)/include/page.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(srcdir)/include/crc32c.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(srcdir)/include/memory.h
@@ -9010,12 +9224,12 @@ libmsg_la_SOURCES = msg/Message.cc msg/Messenger.cc msg/msg_types.cc \
 # We need this to avoid basename conflicts with the librados build tests in test/Makefile.am
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at librados_la_CXXFLAGS =  \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	${AM_CXXFLAGS} \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__append_112)
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__append_116)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at librados_la_LIBADD = $(LIBRADOS_DEPS) $(PTHREAD_LIBS) $(CRYPTO_LIBS) $(EXTRALIBS)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at librados_la_LDFLAGS =  \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	${AM_LDFLAGS} \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	-version-info 2:0:0 \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__append_113)
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__append_117)
 @ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE at libradosstriper_la_SOURCES = \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE@	libradosstriper/libradosstriper.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE@	libradosstriper/RadosStriperImpl.cc \
@@ -9029,7 +9243,7 @@ libmsg_la_SOURCES = msg/Message.cc msg/Messenger.cc msg/msg_types.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE at libradosstriper_la_LDFLAGS = ${AM_LDFLAGS} \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE@	-version-info \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE@	1:0:0 \
- at ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE@	$(am__append_116)
+ at ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE@	$(am__append_120)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at libjournal_la_SOURCES = \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	journal/AsyncOpTracker.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	journal/Entry.cc \
@@ -9053,15 +9267,14 @@ librbd_types_la_SOURCES = \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/AioImageRequest.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/AioImageRequestWQ.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/AioObjectRequest.cc \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/AsyncFlattenRequest.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/AsyncObjectThrottle.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/AsyncOperation.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/AsyncRequest.cc \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/AsyncResizeRequest.cc \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/AsyncTrimRequest.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/CopyupRequest.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/DiffIterate.cc \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/ExclusiveLock.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/ImageCtx.cc \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/ImageState.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/ImageWatcher.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/internal.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/Journal.cc \
@@ -9069,7 +9282,36 @@ librbd_types_la_SOURCES = \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/LibrbdAdminSocketHook.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/LibrbdWriteback.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/ObjectMap.cc \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/RebuildObjectMapRequest.cc
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/Utils.cc \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/exclusive_lock/AcquireRequest.cc \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/exclusive_lock/ReleaseRequest.cc \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/image/CloseRequest.cc \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/image/OpenRequest.cc \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/image/RefreshParentRequest.cc \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/image/RefreshRequest.cc \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/image/SetSnapRequest.cc \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/object_map/InvalidateRequest.cc \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/object_map/LockRequest.cc \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/object_map/Request.cc \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/object_map/RefreshRequest.cc \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/object_map/ResizeRequest.cc \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/object_map/SnapshotCreateRequest.cc \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/object_map/SnapshotRemoveRequest.cc \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/object_map/SnapshotRollbackRequest.cc \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/object_map/UnlockRequest.cc \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/object_map/UpdateRequest.cc \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/operation/FlattenRequest.cc \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/operation/RebuildObjectMapRequest.cc \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/operation/RenameRequest.cc \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/operation/Request.cc \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/operation/ResizeRequest.cc \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/operation/SnapshotCreateRequest.cc \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/operation/SnapshotProtectRequest.cc \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/operation/SnapshotRemoveRequest.cc \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/operation/SnapshotRenameRequest.cc \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/operation/SnapshotRollbackRequest.cc \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/operation/SnapshotUnprotectRequest.cc \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/operation/TrimRequest.cc
 
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at librbd_api_la_SOURCES = \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/librbd.cc
@@ -9089,7 +9331,7 @@ librbd_types_la_SOURCES = \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at librbd_la_LDFLAGS = ${AM_LDFLAGS} \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	-version-info \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	1:0:0 \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(am__append_122)
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(am__append_126)
 @ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at librbd_la_CXXFLAGS = -fvisibility=hidden -fvisibility-inlines-hidden
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE at librgw_la_SOURCES = \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	rgw/librgw.cc \
@@ -9109,6 +9351,7 @@ librbd_types_la_SOURCES = \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	rgw/rgw_rest_client.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	rgw/rgw_rest_conn.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	rgw/rgw_op.cc \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	rgw/rgw_basic_types.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	rgw/rgw_common.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	rgw/rgw_cache.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	rgw/rgw_formats.cc \
@@ -9198,7 +9441,10 @@ librbd_types_la_SOURCES = \
 @ENABLE_CLIENT_TRUE@	cls/rgw/cls_rgw_types.cc \
 @ENABLE_CLIENT_TRUE@	cls/rgw/cls_rgw_ops.cc
 
- at ENABLE_CLIENT_TRUE@libcls_rbd_client_la_SOURCES = cls/rbd/cls_rbd_client.cc
+ at ENABLE_CLIENT_TRUE@libcls_rbd_client_la_SOURCES = \
+ at ENABLE_CLIENT_TRUE@	cls/rbd/cls_rbd_client.cc \
+ at ENABLE_CLIENT_TRUE@	cls/rbd/cls_rbd_types.cc
+
 @ENABLE_CLIENT_TRUE at libcls_user_client_a_SOURCES = cls/user/cls_user_client.cc \
 @ENABLE_CLIENT_TRUE@	cls/user/cls_user_types.cc \
 @ENABLE_CLIENT_TRUE@	cls/user/cls_user_ops.cc
@@ -9214,7 +9460,10 @@ librbd_types_la_SOURCES = \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at libcls_hello_la_LDFLAGS = ${AM_LDFLAGS} -module -avoid-version -shared -export-symbols-regex '.*__cls_.*'
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at libcls_numops_la_SOURCES = cls/numops/cls_numops.cc
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at libcls_numops_la_LDFLAGS = ${AM_LDFLAGS} -module -avoid-version -shared -export-symbols-regex '.*__cls_.*'
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at libcls_rbd_la_SOURCES = cls/rbd/cls_rbd.cc
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at libcls_rbd_la_SOURCES = \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	cls/rbd/cls_rbd.cc \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	cls/rbd/cls_rbd_types.cc
+
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at libcls_rbd_la_LIBADD = $(PTHREAD_LIBS) $(EXTRALIBS)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at libcls_rbd_la_LDFLAGS = ${AM_LDFLAGS} -module -avoid-version -shared -export-symbols-regex '.*__cls_.*'
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at libcls_lock_la_SOURCES = cls/lock/cls_lock.cc
@@ -9323,7 +9572,7 @@ librbd_types_la_SOURCES = \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(LIBOSD) $(LIBCOMMON) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(BOOST_PROGRAM_OPTIONS_LIBS) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(CEPH_GLOBAL) \
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__append_145)
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__append_149)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at ceph_erasure_code_non_regression_SOURCES = \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	test/erasure-code/ceph_erasure_code_non_regression.cc
 
@@ -9331,7 +9580,7 @@ librbd_types_la_SOURCES = \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(LIBOSD) $(LIBCOMMON) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(BOOST_PROGRAM_OPTIONS_LIBS) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(CEPH_GLOBAL) \
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__append_147)
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__append_151)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at ceph_erasure_code_SOURCES = \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	test/erasure-code/ceph_erasure_code.cc
 
@@ -9339,7 +9588,7 @@ librbd_types_la_SOURCES = \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(LIBOSD) $(LIBCOMMON) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(BOOST_PROGRAM_OPTIONS_LIBS) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(CEPH_GLOBAL) \
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__append_149)
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__append_153)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at libec_example_la_SOURCES = \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	erasure-code/ErasureCode.cc \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	test/erasure-code/ErasureCodePluginExample.cc
@@ -9402,7 +9651,7 @@ librbd_types_la_SOURCES = \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(LIBOSD) $(LIBCOMMON) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(UNITTEST_LDADD) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(CEPH_GLOBAL) \
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__append_151)
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__append_155)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at unittest_erasure_code_SOURCES = \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	erasure-code/ErasureCode.cc \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	test/erasure-code/TestErasureCode.cc
@@ -9425,7 +9674,7 @@ librbd_types_la_SOURCES = \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(LIBOSD) $(LIBCOMMON) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(UNITTEST_LDADD) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(CEPH_GLOBAL) \
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__append_153)
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__append_157)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at unittest_erasure_code_plugin_jerasure_SOURCES = \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	test/erasure-code/TestErasureCodePluginJerasure.cc
 
@@ -9434,7 +9683,7 @@ librbd_types_la_SOURCES = \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(LIBOSD) $(LIBCOMMON) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(UNITTEST_LDADD) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(CEPH_GLOBAL) \
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__append_154)
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__append_158)
 @ENABLE_SERVER_TRUE@@WITH_BETTER_YASM_ELF64_TRUE@@WITH_OSD_TRUE at unittest_erasure_code_isa_SOURCES = \
 @ENABLE_SERVER_TRUE@@WITH_BETTER_YASM_ELF64_TRUE@@WITH_OSD_TRUE@	erasure-code/ErasureCode.cc \
 @ENABLE_SERVER_TRUE@@WITH_BETTER_YASM_ELF64_TRUE@@WITH_OSD_TRUE@	test/erasure-code/TestErasureCodeIsa.cc
@@ -9446,7 +9695,7 @@ librbd_types_la_SOURCES = \
 @ENABLE_SERVER_TRUE@@WITH_BETTER_YASM_ELF64_TRUE@@WITH_OSD_TRUE@	$(CEPH_GLOBAL) \
 @ENABLE_SERVER_TRUE@@WITH_BETTER_YASM_ELF64_TRUE@@WITH_OSD_TRUE@	.libs/libec_isa.la \
 @ENABLE_SERVER_TRUE@@WITH_BETTER_YASM_ELF64_TRUE@@WITH_OSD_TRUE@	$(LIBERASURE_CODE) \
- at ENABLE_SERVER_TRUE@@WITH_BETTER_YASM_ELF64_TRUE@@WITH_OSD_TRUE@	$(am__append_155)
+ at ENABLE_SERVER_TRUE@@WITH_BETTER_YASM_ELF64_TRUE@@WITH_OSD_TRUE@	$(am__append_159)
 @ENABLE_SERVER_TRUE@@WITH_BETTER_YASM_ELF64_TRUE@@WITH_OSD_TRUE at unittest_erasure_code_plugin_isa_SOURCES = \
 @ENABLE_SERVER_TRUE@@WITH_BETTER_YASM_ELF64_TRUE@@WITH_OSD_TRUE@	erasure-code/ErasureCode.cc \
 @ENABLE_SERVER_TRUE@@WITH_BETTER_YASM_ELF64_TRUE@@WITH_OSD_TRUE@	test/erasure-code/TestErasureCodePluginIsa.cc
@@ -9458,7 +9707,7 @@ librbd_types_la_SOURCES = \
 @ENABLE_SERVER_TRUE@@WITH_BETTER_YASM_ELF64_TRUE@@WITH_OSD_TRUE@	$(CEPH_GLOBAL) \
 @ENABLE_SERVER_TRUE@@WITH_BETTER_YASM_ELF64_TRUE@@WITH_OSD_TRUE@	.libs/libec_isa.la \
 @ENABLE_SERVER_TRUE@@WITH_BETTER_YASM_ELF64_TRUE@@WITH_OSD_TRUE@	$(LIBERASURE_CODE) \
- at ENABLE_SERVER_TRUE@@WITH_BETTER_YASM_ELF64_TRUE@@WITH_OSD_TRUE@	$(am__append_157)
+ at ENABLE_SERVER_TRUE@@WITH_BETTER_YASM_ELF64_TRUE@@WITH_OSD_TRUE@	$(am__append_161)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at unittest_erasure_code_lrc_SOURCES = \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	test/erasure-code/TestErasureCodeLrc.cc \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	${lrc_sources}
@@ -9468,7 +9717,7 @@ librbd_types_la_SOURCES = \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(LIBOSD) $(LIBCOMMON) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(UNITTEST_LDADD) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(CEPH_GLOBAL) \
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__append_158)
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__append_162)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at unittest_erasure_code_plugin_lrc_SOURCES = \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	test/erasure-code/TestErasureCodePluginLrc.cc
 
@@ -9477,7 +9726,7 @@ librbd_types_la_SOURCES = \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(LIBOSD) $(LIBCOMMON) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(UNITTEST_LDADD) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(CEPH_GLOBAL) \
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__append_160)
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__append_164)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at unittest_erasure_code_shec_SOURCES = \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	test/erasure-code/TestErasureCodeShec.cc \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	${shec_sources}
@@ -9498,7 +9747,7 @@ librbd_types_la_SOURCES = \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(LIBOSD) $(LIBCOMMON) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(UNITTEST_LDADD) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(CEPH_GLOBAL) \
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__append_161)
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__append_165)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at unittest_erasure_code_shec_all_SOURCES = \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	test/erasure-code/TestErasureCodeShec_all.cc \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	${shec_sources}
@@ -9519,7 +9768,7 @@ librbd_types_la_SOURCES = \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(LIBOSD) $(LIBCOMMON) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(UNITTEST_LDADD) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(CEPH_GLOBAL) \
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__append_162)
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__append_166)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at unittest_erasure_code_shec_thread_SOURCES = \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	test/erasure-code/TestErasureCodeShec_thread.cc \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	${shec_sources}
@@ -9540,7 +9789,7 @@ librbd_types_la_SOURCES = \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(LIBOSD) $(LIBCOMMON) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(UNITTEST_LDADD) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(CEPH_GLOBAL) \
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__append_163)
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__append_167)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at unittest_erasure_code_shec_arguments_SOURCES = \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	test/erasure-code/TestErasureCodeShec_arguments.cc \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	${shec_sources}
@@ -9561,7 +9810,7 @@ librbd_types_la_SOURCES = \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(LIBOSD) $(LIBCOMMON) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(UNITTEST_LDADD) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(CEPH_GLOBAL) \
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__append_164)
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__append_168)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at unittest_erasure_code_plugin_shec_SOURCES = \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@        test/erasure-code/TestErasureCodePluginShec.cc
 
@@ -9570,7 +9819,7 @@ librbd_types_la_SOURCES = \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(LIBOSD) $(LIBCOMMON) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(UNITTEST_LDADD) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(CEPH_GLOBAL) \
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__append_165)
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__append_169)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at libec_test_shec_neon_la_SOURCES = test/erasure-code/TestShecPluginNEON.cc
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at libec_test_shec_neon_la_CFLAGS = ${AM_CFLAGS}
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at libec_test_shec_neon_la_CXXFLAGS = ${AM_CXXFLAGS}
@@ -9608,7 +9857,7 @@ librbd_types_la_SOURCES = \
 @ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	$(CEPH_GLOBAL) \
 @ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	$(PTHREAD_LIBS) \
 @ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	$(EXTRALIBS) \
- at ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	$(am__append_167)
+ at ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	$(am__append_171)
 @ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE at simple_client_SOURCES = \
 @ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	test/messenger/simple_client.cc \
 @ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	test/messenger/simple_dispatcher.cc
@@ -9620,7 +9869,7 @@ librbd_types_la_SOURCES = \
 @ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	$(CEPH_GLOBAL) \
 @ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	$(PTHREAD_LIBS) \
 @ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	$(EXTRALIBS) \
- at ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	$(am__append_168)
+ at ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	$(am__append_172)
 @ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE at xio_server_SOURCES = \
 @ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	test/messenger/xio_server.cc \
 @ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	test/messenger/xio_dispatcher.cc
@@ -9632,7 +9881,7 @@ librbd_types_la_SOURCES = \
 @ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	$(LIBCOMMON) \
 @ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	$(PTHREAD_LIBS) \
 @ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	$(EXTRALIBS) \
- at ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	$(am__append_170)
+ at ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	$(am__append_174)
 @ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE at xio_client_SOURCES = \
 @ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	test/messenger/xio_client.cc \
 @ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	test/messenger/xio_dispatcher.cc
@@ -9644,7 +9893,7 @@ librbd_types_la_SOURCES = \
 @ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	$(LIBCOMMON) \
 @ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	$(PTHREAD_LIBS) \
 @ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	$(EXTRALIBS) \
- at ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	$(am__append_171)
+ at ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	$(am__append_175)
 
 # This should use LIBMDS_TYPES once it exists
 @ENABLE_CLIENT_TRUE at ceph_dencoder_SOURCES = \
@@ -9662,9 +9911,9 @@ librbd_types_la_SOURCES = \
 
 # These should always use explicit _CFLAGS/_CXXFLAGS so avoid basename conflicts
 @ENABLE_CLIENT_TRUE at ceph_dencoder_CFLAGS = ${AM_CFLAGS} \
- at ENABLE_CLIENT_TRUE@	$(am__append_172)
+ at ENABLE_CLIENT_TRUE@	$(am__append_176)
 @ENABLE_CLIENT_TRUE at ceph_dencoder_CXXFLAGS = ${AM_CXXFLAGS} \
- at ENABLE_CLIENT_TRUE@	$(am__append_173)
+ at ENABLE_CLIENT_TRUE@	$(am__append_177)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at libradostest_la_SOURCES = \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	test/librados/test.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	test/librados/TestCase.cc
@@ -9910,7 +10159,23 @@ librbd_types_la_SOURCES = \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at librbd_test_la_CXXFLAGS = $(UNITTEST_CXXFLAGS)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at unittest_librbd_SOURCES = \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@        test/librbd/test_main.cc \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/test_mock_fixture.cc
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/test_mock_fixture.cc \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/test_mock_ExclusiveLock.cc \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/exclusive_lock/test_mock_AcquireRequest.cc \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/exclusive_lock/test_mock_ReleaseRequest.cc \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/object_map/test_mock_InvalidateRequest.cc \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/object_map/test_mock_LockRequest.cc \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/object_map/test_mock_RefreshRequest.cc \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/object_map/test_mock_ResizeRequest.cc \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/object_map/test_mock_SnapshotCreateRequest.cc \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/object_map/test_mock_SnapshotRemoveRequest.cc \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/object_map/test_mock_SnapshotRollbackRequest.cc \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/object_map/test_mock_UnlockRequest.cc \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/object_map/test_mock_UpdateRequest.cc \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/operation/test_mock_SnapshotCreateRequest.cc \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/operation/test_mock_SnapshotProtectRequest.cc \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/operation/test_mock_SnapshotRemoveRequest.cc \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/operation/test_mock_SnapshotUnprotectRequest.cc
 
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at unittest_librbd_CXXFLAGS = $(UNITTEST_CXXFLAGS) -DTEST_LIBRBD_INTERNALS
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at unittest_librbd_LDADD = \
@@ -9995,7 +10260,7 @@ librbd_types_la_SOURCES = \
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	test/libcephfs/caps.cc \
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	test/libcephfs/multiclient.cc \
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	test/libcephfs/access.cc \
- at ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	$(am__append_194)
+ at ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	$(am__append_198)
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE at ceph_test_libcephfs_LDADD = $(LIBRADOS) $(LIBCEPHFS) $(LIBCOMMON) $(UNITTEST_LDADD)
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE at ceph_test_libcephfs_CXXFLAGS = $(UNITTEST_CXXFLAGS)
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE at unittest_encoding_SOURCES = test/encoding.cc
@@ -10018,7 +10283,7 @@ librbd_types_la_SOURCES = \
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	-Wignored-qualifiers \
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	-Wold-style-definition \
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	-Wtype-limits \
- at ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	$(am__append_196)
+ at ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	$(am__append_200)
 @ENABLE_CLIENT_TRUE@@WITH_BUILD_TESTS_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE at test_build_librgw_SOURCES = \
 @ENABLE_CLIENT_TRUE@@WITH_BUILD_TESTS_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	test/buildtest_skeleton.cc \
 @ENABLE_CLIENT_TRUE@@WITH_BUILD_TESTS_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(librgw_la_SOURCES)
@@ -10216,13 +10481,13 @@ librbd_types_la_SOURCES = \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at unittest_osdscrub_LDADD =  \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(LIBOSD) $(UNITTEST_LDADD) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(CEPH_GLOBAL) \
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__append_208)
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__append_212)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at unittest_pglog_SOURCES = test/osd/TestPGLog.cc
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at unittest_pglog_CXXFLAGS = $(UNITTEST_CXXFLAGS)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at unittest_pglog_LDADD = $(LIBOSD) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(UNITTEST_LDADD) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(CEPH_GLOBAL) \
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__append_209)
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__append_213)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at unittest_hitset_SOURCES = test/osd/hitset.cc
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at unittest_hitset_CXXFLAGS = $(UNITTEST_CXXFLAGS)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at unittest_hitset_LDADD = $(LIBOSD) $(UNITTEST_LDADD) $(CEPH_GLOBAL)
@@ -10289,7 +10554,7 @@ UNITTEST_CXXFLAGS = \
 UNITTEST_LDADD = $(top_builddir)/src/gmock/lib/libgmock_main.la \
 	$(top_builddir)/src/gmock/lib/libgmock.la \
 	$(top_builddir)/src/gmock/gtest/lib/libgtest.la \
-	$(PTHREAD_LIBS) $(am__append_217)
+	$(PTHREAD_LIBS) $(am__append_221)
 unittest_addrs_SOURCES = test/test_addrs.cc
 unittest_addrs_CXXFLAGS = $(UNITTEST_CXXFLAGS)
 unittest_addrs_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
@@ -10465,8 +10730,8 @@ ceph_test_objectcacher_stress_SOURCES = \
 
 ceph_test_objectcacher_stress_LDADD = $(LIBOSDC) $(CEPH_GLOBAL)
 ceph_test_cfuse_cache_invalidate_SOURCES = test/test_cfuse_cache_invalidate.cc
-ceph_test_get_blkdev_size_SOURCES = test/test_get_blkdev_size.cc
-ceph_test_get_blkdev_size_LDADD = $(LIBCOMMON)
+ at LINUX_TRUE@ceph_test_get_blkdev_size_SOURCES = test/test_get_blkdev_size.cc
+ at LINUX_TRUE@ceph_test_get_blkdev_size_LDADD = $(LIBCOMMON)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at ceph_scratchtool_SOURCES = tools/scratchtool.c
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at ceph_scratchtool_LDADD = $(LIBRADOS) $(CEPH_GLOBAL)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at ceph_scratchtoolpp_SOURCES = tools/scratchtoolpp.cc
@@ -10504,10 +10769,13 @@ ceph_test_get_blkdev_size_LDADD = $(LIBCOMMON)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/action/Import.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/action/ImportDiff.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/action/Info.cc \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/action/Journal.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/action/Kernel.cc \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/action/Nbd.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/action/List.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/action/Lock.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/action/MergeDiff.cc \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/action/MirrorPool.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/action/ObjectMap.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/action/Remove.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/action/Rename.cc \
@@ -10517,9 +10785,13 @@ ceph_test_get_blkdev_size_LDADD = $(LIBCOMMON)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/action/Watch.cc
 
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at rbd_LDADD = \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(LIBKRBD) $(LIBRBD) $(LIBRADOS) $(CEPH_GLOBAL) \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	libjournal.la libcls_journal_client.la \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(LIBKRBD) $(LIBRBD) $(LIBRBD_TYPES) $(LIBRADOS) $(CEPH_GLOBAL) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(BOOST_REGEX_LIBS) $(BOOST_PROGRAM_OPTIONS_LIBS)
 
+ at ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at rbd_nbd_SOURCES = tools/rbd_nbd/rbd-nbd.cc
+ at ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at rbd_nbd_CXXFLAGS = $(AM_CXXFLAGS)
+ at ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at rbd_nbd_LDADD = $(LIBRBD) $(LIBRADOS) $(CEPH_GLOBAL) $(BOOST_REGEX_LIBS)
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE at ceph_client_debug_SOURCES = tools/ceph-client-debug.cc
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE at ceph_client_debug_LDADD = $(LIBCEPHFS) $(LIBCLIENT) $(CEPH_GLOBAL) $(LIBCOMMON)
 @ENABLE_SERVER_TRUE at ceph_osdomap_tool_SOURCES = tools/ceph_osdomap_tool.cc
@@ -10536,7 +10808,7 @@ ceph_test_get_blkdev_size_LDADD = $(LIBCOMMON)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(LIBOSD) $(LIBOS) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(CEPH_GLOBAL) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(BOOST_PROGRAM_OPTIONS_LIBS) \
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__append_225)
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__append_230)
 @ENABLE_CLIENT_TRUE@@ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@@WITH_RADOS_TRUE at cephfs_journal_tool_SOURCES = \
 @ENABLE_CLIENT_TRUE@@ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@@WITH_RADOS_TRUE@	tools/cephfs/cephfs-journal-tool.cc \
 @ENABLE_CLIENT_TRUE@@ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@@WITH_RADOS_TRUE@	tools/cephfs/JournalTool.cc \
@@ -10617,6 +10889,13 @@ librbd_tp_la_LDFLAGS = -version-info 1:0:0
 libos_tp_la_LIBADD = -llttng-ust -ldl
 libos_tp_la_CFLAGS = -I$(top_srcdir)/src/tracing -I$(top_srcdir)/src $(AM_CFLAGS) -fpic
 libos_tp_la_LDFLAGS = -version-info 1:0:0
+ at ENABLE_CLIENT_TRUE@@WITH_CYTHON_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at PY_DISTUTILS = \
+ at ENABLE_CLIENT_TRUE@@WITH_CYTHON_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	CPPFLAGS="-iquote \${abs_srcdir}/include ${AM_CPPFLAGS} ${CPPFLAGS}" \
+ at ENABLE_CLIENT_TRUE@@WITH_CYTHON_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	CFLAGS="-iquote \${abs_srcdir}/include ${AM_CFLAGS} ${CFLAGS}" \
+ at ENABLE_CLIENT_TRUE@@WITH_CYTHON_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	LDFLAGS="-L\${abs_builddir}/.libs $(subst -pie,,${AM_LDFLAGS}) ${LDFLAGS}" \
+ at ENABLE_CLIENT_TRUE@@WITH_CYTHON_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	CYTHON_BUILD_DIR="$(shell readlink -f $(builddir))/build" \
+ at ENABLE_CLIENT_TRUE@@WITH_CYTHON_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	${PYTHON} ./setup.py
+
 
 # subdirs
 
@@ -10631,7 +10910,7 @@ editpaths = sed \
 	-e 's|@@GCOV_PREFIX_STRIP[@][@]|$(GCOV_PREFIX_STRIP)|g'
 
 shell_scripts = ceph-debugpack ceph-post-file ceph-crush-location \
-	$(am__append_248)
+	$(am__append_255)
 doc_DATA = $(srcdir)/sample.ceph.conf sample.fetch_config
 
 # various scripts
@@ -10646,12 +10925,12 @@ ceph_libexec_SCRIPTS = ceph-osd-prestart.sh
 @WITH_LTTNG_TRUE at TESTS_ENVIRONMENT = LD_PRELOAD=liblttng-ust-fork.so; export LD_PRELOAD; echo "LD_PRELOAD=$${LD_PRELOAD}";
 
 # pybind
-python_PYTHON = $(am__append_230) $(am__append_233) $(am__append_237) \
-	$(am__append_242) $(am__append_247)
+python_PYTHON = $(am__append_238) $(am__append_241) $(am__append_249) \
+	$(am__append_254)
 @ENABLE_CLIENT_TRUE at bash_completiondir = $(sysconfdir)/bash_completion.d
 @ENABLE_CLIENT_TRUE at bash_completion_DATA =  \
 @ENABLE_CLIENT_TRUE@	$(srcdir)/bash_completion/ceph \
- at ENABLE_CLIENT_TRUE@	$(am__append_232) $(am__append_235)
+ at ENABLE_CLIENT_TRUE@	$(am__append_240) $(am__append_243)
 @ENABLE_CLIENT_TRUE at ceph_syn_SOURCES = ceph_syn.cc \
 @ENABLE_CLIENT_TRUE@	client/SyntheticClient.cc # uses g_conf.. \
 @ENABLE_CLIENT_TRUE@	needs cleanup
@@ -10678,7 +10957,7 @@ python_PYTHON = $(am__append_230) $(am__append_233) $(am__append_237) \
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	1:0:0 \
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	-export-symbols-regex \
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	'^ceph_.*' \
- at ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	$(am__append_243)
+ at ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	$(am__append_250)
 
 # jni library (java source is in src/java)
 @ENABLE_CEPHFS_JAVA_TRUE@@ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE at libcephfs_jni_la_SOURCES = \
@@ -10691,7 +10970,7 @@ python_PYTHON = $(am__append_230) $(am__append_233) $(am__append_237) \
 @ENABLE_CEPHFS_JAVA_TRUE@@ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE at libcephfs_jni_la_CPPFLAGS = $(JDK_CPPFLAGS) $(AM_CPPFLAGS)
 @ENABLE_CEPHFS_JAVA_TRUE@@ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE at libcephfs_jni_la_LDFLAGS = ${AM_LDFLAGS} -version-info 1:0:0
 @ENABLE_SERVER_TRUE at ceph_sbin_SCRIPTS = ceph-create-keys \
- at ENABLE_SERVER_TRUE@	$(am__append_253)
+ at ENABLE_SERVER_TRUE@	$(am__append_260)
 @ENABLE_SERVER_TRUE at mount_ceph_SOURCES = mount/mount.ceph.c
 @ENABLE_SERVER_TRUE at mount_ceph_LDADD = $(LIBSECRET) $(LIBCOMMON)
 @ENABLE_SERVER_TRUE@@WITH_MON_TRUE at ceph_mon_SOURCES = ceph_mon.cc
@@ -10701,7 +10980,7 @@ python_PYTHON = $(am__append_230) $(am__append_233) $(am__append_237) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(LIBOSD) $(LIBOSD_TYPES) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(LIBOS_TYPES) $(LIBOS) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(CEPH_GLOBAL) $(LIBCOMMON) \
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__append_255)
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__append_262)
 @ENABLE_SERVER_TRUE@@WITH_MDS_TRUE at ceph_mds_SOURCES = ceph_mds.cc
 @ENABLE_SERVER_TRUE@@WITH_MDS_TRUE at ceph_mds_LDADD = $(LIBMDS) $(LIBOSDC) $(CEPH_GLOBAL) $(LIBCOMMON)
 @ENABLE_COVERAGE_TRUE@@ENABLE_SERVER_TRUE at COV_DIR = $(DESTDIR)$(libdir)/ceph/coverage
@@ -10712,7 +10991,7 @@ all: $(BUILT_SOURCES) acconfig.h
 
 .SUFFIXES:
 .SUFFIXES: .S .c .cc .cpp .lo .log .o .obj .s .test .test$(EXEEXT) .trs
-$(srcdir)/Makefile.in:  $(srcdir)/Makefile.am $(srcdir)/Makefile-env.am $(srcdir)/arch/Makefile.am $(srcdir)/auth/Makefile.am $(srcdir)/brag/Makefile.am $(srcdir)/ceph-detect-init/Makefile.am $(srcdir)/crush/Makefile.am $(srcdir)/kv/Makefile.am $(srcdir)/mon/Makefile.am $(srcdir)/mds/Makefile.am $(srcdir)/mds/Makefile-client.am $(srcdir)/mds/Makefile-server.am $(srcdir)/os/Makefile.am $(srcdir)/osd/Makefile.am $(srcdir)/erasure-code/Makefile.am $(srcdir)/erasure-code/jerasure/Makefile.am [...]
+$(srcdir)/Makefile.in:  $(srcdir)/Makefile.am $(srcdir)/Makefile-env.am $(srcdir)/arch/Makefile.am $(srcdir)/auth/Makefile.am $(srcdir)/brag/Makefile.am $(srcdir)/ceph-detect-init/Makefile.am $(srcdir)/crush/Makefile.am $(srcdir)/kv/Makefile.am $(srcdir)/mon/Makefile.am $(srcdir)/mds/Makefile.am $(srcdir)/mds/Makefile-client.am $(srcdir)/mds/Makefile-server.am $(srcdir)/os/Makefile.am $(srcdir)/osd/Makefile.am $(srcdir)/erasure-code/Makefile.am $(srcdir)/erasure-code/jerasure/Makefile.am [...]
 	@for dep in $?; do \
 	  case '$(am__configure_deps)' in \
 	    *$$dep*) \
@@ -10733,7 +11012,7 @@ Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
 	    echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \
 	    cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \
 	esac;
-$(srcdir)/Makefile-env.am $(srcdir)/arch/Makefile.am $(srcdir)/auth/Makefile.am $(srcdir)/brag/Makefile.am $(srcdir)/ceph-detect-init/Makefile.am $(srcdir)/crush/Makefile.am $(srcdir)/kv/Makefile.am $(srcdir)/mon/Makefile.am $(srcdir)/mds/Makefile.am $(srcdir)/mds/Makefile-client.am $(srcdir)/mds/Makefile-server.am $(srcdir)/os/Makefile.am $(srcdir)/osd/Makefile.am $(srcdir)/erasure-code/Makefile.am $(srcdir)/erasure-code/jerasure/Makefile.am $(srcdir)/erasure-code/lrc/Makefile.am $(srcd [...]
+$(srcdir)/Makefile-env.am $(srcdir)/arch/Makefile.am $(srcdir)/auth/Makefile.am $(srcdir)/brag/Makefile.am $(srcdir)/ceph-detect-init/Makefile.am $(srcdir)/crush/Makefile.am $(srcdir)/kv/Makefile.am $(srcdir)/mon/Makefile.am $(srcdir)/mds/Makefile.am $(srcdir)/mds/Makefile-client.am $(srcdir)/mds/Makefile-server.am $(srcdir)/os/Makefile.am $(srcdir)/osd/Makefile.am $(srcdir)/erasure-code/Makefile.am $(srcdir)/erasure-code/jerasure/Makefile.am $(srcdir)/erasure-code/lrc/Makefile.am $(srcd [...]
 
 $(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
 	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
@@ -11392,6 +11671,8 @@ cls/rbd/$(DEPDIR)/$(am__dirstamp):
 	@: > cls/rbd/$(DEPDIR)/$(am__dirstamp)
 cls/rbd/cls_rbd.lo: cls/rbd/$(am__dirstamp) \
 	cls/rbd/$(DEPDIR)/$(am__dirstamp)
+cls/rbd/cls_rbd_types.lo: cls/rbd/$(am__dirstamp) \
+	cls/rbd/$(DEPDIR)/$(am__dirstamp)
 
 libcls_rbd.la: $(libcls_rbd_la_OBJECTS) $(libcls_rbd_la_DEPENDENCIES) $(EXTRA_libcls_rbd_la_DEPENDENCIES) 
 	$(AM_V_CXXLD)$(libcls_rbd_la_LINK) $(am_libcls_rbd_la_rpath) $(libcls_rbd_la_OBJECTS) $(libcls_rbd_la_LIBADD) $(LIBS)
@@ -11632,6 +11913,8 @@ common/ContextCompletion.lo: common/$(am__dirstamp) \
 	common/$(DEPDIR)/$(am__dirstamp)
 common/TracepointProvider.lo: common/$(am__dirstamp) \
 	common/$(DEPDIR)/$(am__dirstamp)
+common/PluginRegistry.lo: common/$(am__dirstamp) \
+	common/$(DEPDIR)/$(am__dirstamp)
 common/xattr.lo: common/$(am__dirstamp) \
 	common/$(DEPDIR)/$(am__dirstamp)
 common/ipaddr.lo: common/$(am__dirstamp) \
@@ -11644,6 +11927,8 @@ common/linux_version.lo: common/$(am__dirstamp) \
 	common/$(DEPDIR)/$(am__dirstamp)
 common/solaris_errno.lo: common/$(am__dirstamp) \
 	common/$(DEPDIR)/$(am__dirstamp)
+common/aix_errno.lo: common/$(am__dirstamp) \
+	common/$(DEPDIR)/$(am__dirstamp)
 common/blkdev.lo: common/$(am__dirstamp) \
 	common/$(DEPDIR)/$(am__dirstamp)
 common/address_helper.lo: common/$(am__dirstamp) \
@@ -12936,24 +13221,22 @@ librbd/AioImageRequestWQ.lo: librbd/$(am__dirstamp) \
 	librbd/$(DEPDIR)/$(am__dirstamp)
 librbd/AioObjectRequest.lo: librbd/$(am__dirstamp) \
 	librbd/$(DEPDIR)/$(am__dirstamp)
-librbd/AsyncFlattenRequest.lo: librbd/$(am__dirstamp) \
-	librbd/$(DEPDIR)/$(am__dirstamp)
 librbd/AsyncObjectThrottle.lo: librbd/$(am__dirstamp) \
 	librbd/$(DEPDIR)/$(am__dirstamp)
 librbd/AsyncOperation.lo: librbd/$(am__dirstamp) \
 	librbd/$(DEPDIR)/$(am__dirstamp)
 librbd/AsyncRequest.lo: librbd/$(am__dirstamp) \
 	librbd/$(DEPDIR)/$(am__dirstamp)
-librbd/AsyncResizeRequest.lo: librbd/$(am__dirstamp) \
-	librbd/$(DEPDIR)/$(am__dirstamp)
-librbd/AsyncTrimRequest.lo: librbd/$(am__dirstamp) \
-	librbd/$(DEPDIR)/$(am__dirstamp)
 librbd/CopyupRequest.lo: librbd/$(am__dirstamp) \
 	librbd/$(DEPDIR)/$(am__dirstamp)
 librbd/DiffIterate.lo: librbd/$(am__dirstamp) \
 	librbd/$(DEPDIR)/$(am__dirstamp)
+librbd/ExclusiveLock.lo: librbd/$(am__dirstamp) \
+	librbd/$(DEPDIR)/$(am__dirstamp)
 librbd/ImageCtx.lo: librbd/$(am__dirstamp) \
 	librbd/$(DEPDIR)/$(am__dirstamp)
+librbd/ImageState.lo: librbd/$(am__dirstamp) \
+	librbd/$(DEPDIR)/$(am__dirstamp)
 librbd/ImageWatcher.lo: librbd/$(am__dirstamp) \
 	librbd/$(DEPDIR)/$(am__dirstamp)
 librbd/internal.lo: librbd/$(am__dirstamp) \
@@ -12968,8 +13251,104 @@ librbd/LibrbdWriteback.lo: librbd/$(am__dirstamp) \
 	librbd/$(DEPDIR)/$(am__dirstamp)
 librbd/ObjectMap.lo: librbd/$(am__dirstamp) \
 	librbd/$(DEPDIR)/$(am__dirstamp)
-librbd/RebuildObjectMapRequest.lo: librbd/$(am__dirstamp) \
+librbd/Utils.lo: librbd/$(am__dirstamp) \
 	librbd/$(DEPDIR)/$(am__dirstamp)
+librbd/exclusive_lock/$(am__dirstamp):
+	@$(MKDIR_P) librbd/exclusive_lock
+	@: > librbd/exclusive_lock/$(am__dirstamp)
+librbd/exclusive_lock/$(DEPDIR)/$(am__dirstamp):
+	@$(MKDIR_P) librbd/exclusive_lock/$(DEPDIR)
+	@: > librbd/exclusive_lock/$(DEPDIR)/$(am__dirstamp)
+librbd/exclusive_lock/AcquireRequest.lo:  \
+	librbd/exclusive_lock/$(am__dirstamp) \
+	librbd/exclusive_lock/$(DEPDIR)/$(am__dirstamp)
+librbd/exclusive_lock/ReleaseRequest.lo:  \
+	librbd/exclusive_lock/$(am__dirstamp) \
+	librbd/exclusive_lock/$(DEPDIR)/$(am__dirstamp)
+librbd/image/$(am__dirstamp):
+	@$(MKDIR_P) librbd/image
+	@: > librbd/image/$(am__dirstamp)
+librbd/image/$(DEPDIR)/$(am__dirstamp):
+	@$(MKDIR_P) librbd/image/$(DEPDIR)
+	@: > librbd/image/$(DEPDIR)/$(am__dirstamp)
+librbd/image/CloseRequest.lo: librbd/image/$(am__dirstamp) \
+	librbd/image/$(DEPDIR)/$(am__dirstamp)
+librbd/image/OpenRequest.lo: librbd/image/$(am__dirstamp) \
+	librbd/image/$(DEPDIR)/$(am__dirstamp)
+librbd/image/RefreshParentRequest.lo: librbd/image/$(am__dirstamp) \
+	librbd/image/$(DEPDIR)/$(am__dirstamp)
+librbd/image/RefreshRequest.lo: librbd/image/$(am__dirstamp) \
+	librbd/image/$(DEPDIR)/$(am__dirstamp)
+librbd/image/SetSnapRequest.lo: librbd/image/$(am__dirstamp) \
+	librbd/image/$(DEPDIR)/$(am__dirstamp)
+librbd/object_map/$(am__dirstamp):
+	@$(MKDIR_P) librbd/object_map
+	@: > librbd/object_map/$(am__dirstamp)
+librbd/object_map/$(DEPDIR)/$(am__dirstamp):
+	@$(MKDIR_P) librbd/object_map/$(DEPDIR)
+	@: > librbd/object_map/$(DEPDIR)/$(am__dirstamp)
+librbd/object_map/InvalidateRequest.lo:  \
+	librbd/object_map/$(am__dirstamp) \
+	librbd/object_map/$(DEPDIR)/$(am__dirstamp)
+librbd/object_map/LockRequest.lo: librbd/object_map/$(am__dirstamp) \
+	librbd/object_map/$(DEPDIR)/$(am__dirstamp)
+librbd/object_map/Request.lo: librbd/object_map/$(am__dirstamp) \
+	librbd/object_map/$(DEPDIR)/$(am__dirstamp)
+librbd/object_map/RefreshRequest.lo:  \
+	librbd/object_map/$(am__dirstamp) \
+	librbd/object_map/$(DEPDIR)/$(am__dirstamp)
+librbd/object_map/ResizeRequest.lo: librbd/object_map/$(am__dirstamp) \
+	librbd/object_map/$(DEPDIR)/$(am__dirstamp)
+librbd/object_map/SnapshotCreateRequest.lo:  \
+	librbd/object_map/$(am__dirstamp) \
+	librbd/object_map/$(DEPDIR)/$(am__dirstamp)
+librbd/object_map/SnapshotRemoveRequest.lo:  \
+	librbd/object_map/$(am__dirstamp) \
+	librbd/object_map/$(DEPDIR)/$(am__dirstamp)
+librbd/object_map/SnapshotRollbackRequest.lo:  \
+	librbd/object_map/$(am__dirstamp) \
+	librbd/object_map/$(DEPDIR)/$(am__dirstamp)
+librbd/object_map/UnlockRequest.lo: librbd/object_map/$(am__dirstamp) \
+	librbd/object_map/$(DEPDIR)/$(am__dirstamp)
+librbd/object_map/UpdateRequest.lo: librbd/object_map/$(am__dirstamp) \
+	librbd/object_map/$(DEPDIR)/$(am__dirstamp)
+librbd/operation/$(am__dirstamp):
+	@$(MKDIR_P) librbd/operation
+	@: > librbd/operation/$(am__dirstamp)
+librbd/operation/$(DEPDIR)/$(am__dirstamp):
+	@$(MKDIR_P) librbd/operation/$(DEPDIR)
+	@: > librbd/operation/$(DEPDIR)/$(am__dirstamp)
+librbd/operation/FlattenRequest.lo: librbd/operation/$(am__dirstamp) \
+	librbd/operation/$(DEPDIR)/$(am__dirstamp)
+librbd/operation/RebuildObjectMapRequest.lo:  \
+	librbd/operation/$(am__dirstamp) \
+	librbd/operation/$(DEPDIR)/$(am__dirstamp)
+librbd/operation/RenameRequest.lo: librbd/operation/$(am__dirstamp) \
+	librbd/operation/$(DEPDIR)/$(am__dirstamp)
+librbd/operation/Request.lo: librbd/operation/$(am__dirstamp) \
+	librbd/operation/$(DEPDIR)/$(am__dirstamp)
+librbd/operation/ResizeRequest.lo: librbd/operation/$(am__dirstamp) \
+	librbd/operation/$(DEPDIR)/$(am__dirstamp)
+librbd/operation/SnapshotCreateRequest.lo:  \
+	librbd/operation/$(am__dirstamp) \
+	librbd/operation/$(DEPDIR)/$(am__dirstamp)
+librbd/operation/SnapshotProtectRequest.lo:  \
+	librbd/operation/$(am__dirstamp) \
+	librbd/operation/$(DEPDIR)/$(am__dirstamp)
+librbd/operation/SnapshotRemoveRequest.lo:  \
+	librbd/operation/$(am__dirstamp) \
+	librbd/operation/$(DEPDIR)/$(am__dirstamp)
+librbd/operation/SnapshotRenameRequest.lo:  \
+	librbd/operation/$(am__dirstamp) \
+	librbd/operation/$(DEPDIR)/$(am__dirstamp)
+librbd/operation/SnapshotRollbackRequest.lo:  \
+	librbd/operation/$(am__dirstamp) \
+	librbd/operation/$(DEPDIR)/$(am__dirstamp)
+librbd/operation/SnapshotUnprotectRequest.lo:  \
+	librbd/operation/$(am__dirstamp) \
+	librbd/operation/$(DEPDIR)/$(am__dirstamp)
+librbd/operation/TrimRequest.lo: librbd/operation/$(am__dirstamp) \
+	librbd/operation/$(DEPDIR)/$(am__dirstamp)
 
 librbd_internal.la: $(librbd_internal_la_OBJECTS) $(librbd_internal_la_DEPENDENCIES) $(EXTRA_librbd_internal_la_DEPENDENCIES) 
 	$(AM_V_CXXLD)$(CXXLINK) $(am_librbd_internal_la_rpath) $(librbd_internal_la_OBJECTS) $(librbd_internal_la_LIBADD) $(LIBS)
@@ -13083,6 +13462,8 @@ rgw/librgw_la-rgw_rest_conn.lo: rgw/$(am__dirstamp) \
 	rgw/$(DEPDIR)/$(am__dirstamp)
 rgw/librgw_la-rgw_op.lo: rgw/$(am__dirstamp) \
 	rgw/$(DEPDIR)/$(am__dirstamp)
+rgw/librgw_la-rgw_basic_types.lo: rgw/$(am__dirstamp) \
+	rgw/$(DEPDIR)/$(am__dirstamp)
 rgw/librgw_la-rgw_common.lo: rgw/$(am__dirstamp) \
 	rgw/$(DEPDIR)/$(am__dirstamp)
 rgw/librgw_la-rgw_cache.lo: rgw/$(am__dirstamp) \
@@ -13412,6 +13793,8 @@ rgw/ceph_dencoder-rgw_dencoder.$(OBJEXT): rgw/$(am__dirstamp) \
 	rgw/$(DEPDIR)/$(am__dirstamp)
 rgw/ceph_dencoder-rgw_acl.$(OBJEXT): rgw/$(am__dirstamp) \
 	rgw/$(DEPDIR)/$(am__dirstamp)
+rgw/ceph_dencoder-rgw_basic_types.$(OBJEXT): rgw/$(am__dirstamp) \
+	rgw/$(DEPDIR)/$(am__dirstamp)
 rgw/ceph_dencoder-rgw_common.$(OBJEXT): rgw/$(am__dirstamp) \
 	rgw/$(DEPDIR)/$(am__dirstamp)
 rgw/ceph_dencoder-rgw_env.$(OBJEXT): rgw/$(am__dirstamp) \
@@ -14553,8 +14936,12 @@ tools/rbd/action/ImportDiff.$(OBJEXT):  \
 	tools/rbd/action/$(DEPDIR)/$(am__dirstamp)
 tools/rbd/action/Info.$(OBJEXT): tools/rbd/action/$(am__dirstamp) \
 	tools/rbd/action/$(DEPDIR)/$(am__dirstamp)
+tools/rbd/action/Journal.$(OBJEXT): tools/rbd/action/$(am__dirstamp) \
+	tools/rbd/action/$(DEPDIR)/$(am__dirstamp)
 tools/rbd/action/Kernel.$(OBJEXT): tools/rbd/action/$(am__dirstamp) \
 	tools/rbd/action/$(DEPDIR)/$(am__dirstamp)
+tools/rbd/action/Nbd.$(OBJEXT): tools/rbd/action/$(am__dirstamp) \
+	tools/rbd/action/$(DEPDIR)/$(am__dirstamp)
 tools/rbd/action/List.$(OBJEXT): tools/rbd/action/$(am__dirstamp) \
 	tools/rbd/action/$(DEPDIR)/$(am__dirstamp)
 tools/rbd/action/Lock.$(OBJEXT): tools/rbd/action/$(am__dirstamp) \
@@ -14562,6 +14949,9 @@ tools/rbd/action/Lock.$(OBJEXT): tools/rbd/action/$(am__dirstamp) \
 tools/rbd/action/MergeDiff.$(OBJEXT):  \
 	tools/rbd/action/$(am__dirstamp) \
 	tools/rbd/action/$(DEPDIR)/$(am__dirstamp)
+tools/rbd/action/MirrorPool.$(OBJEXT):  \
+	tools/rbd/action/$(am__dirstamp) \
+	tools/rbd/action/$(DEPDIR)/$(am__dirstamp)
 tools/rbd/action/ObjectMap.$(OBJEXT):  \
 	tools/rbd/action/$(am__dirstamp) \
 	tools/rbd/action/$(DEPDIR)/$(am__dirstamp)
@@ -14593,6 +14983,19 @@ rbd_fuse/rbd_fuse-rbd-fuse.$(OBJEXT): rbd_fuse/$(am__dirstamp) \
 rbd-fuse$(EXEEXT): $(rbd_fuse_OBJECTS) $(rbd_fuse_DEPENDENCIES) $(EXTRA_rbd_fuse_DEPENDENCIES) 
 	@rm -f rbd-fuse$(EXEEXT)
 	$(AM_V_CXXLD)$(rbd_fuse_LINK) $(rbd_fuse_OBJECTS) $(rbd_fuse_LDADD) $(LIBS)
+tools/rbd_nbd/$(am__dirstamp):
+	@$(MKDIR_P) tools/rbd_nbd
+	@: > tools/rbd_nbd/$(am__dirstamp)
+tools/rbd_nbd/$(DEPDIR)/$(am__dirstamp):
+	@$(MKDIR_P) tools/rbd_nbd/$(DEPDIR)
+	@: > tools/rbd_nbd/$(DEPDIR)/$(am__dirstamp)
+tools/rbd_nbd/rbd_nbd-rbd-nbd.$(OBJEXT):  \
+	tools/rbd_nbd/$(am__dirstamp) \
+	tools/rbd_nbd/$(DEPDIR)/$(am__dirstamp)
+
+rbd-nbd$(EXEEXT): $(rbd_nbd_OBJECTS) $(rbd_nbd_DEPENDENCIES) $(EXTRA_rbd_nbd_DEPENDENCIES) 
+	@rm -f rbd-nbd$(EXEEXT)
+	$(AM_V_CXXLD)$(rbd_nbd_LINK) $(rbd_nbd_OBJECTS) $(rbd_nbd_LDADD) $(LIBS)
 rbd_replay/rbd-replay.$(OBJEXT): rbd_replay/$(am__dirstamp) \
 	rbd_replay/$(DEPDIR)/$(am__dirstamp)
 
@@ -14701,6 +15104,8 @@ rgw/test_build_librgw-rgw_rest_conn.$(OBJEXT): rgw/$(am__dirstamp) \
 	rgw/$(DEPDIR)/$(am__dirstamp)
 rgw/test_build_librgw-rgw_op.$(OBJEXT): rgw/$(am__dirstamp) \
 	rgw/$(DEPDIR)/$(am__dirstamp)
+rgw/test_build_librgw-rgw_basic_types.$(OBJEXT): rgw/$(am__dirstamp) \
+	rgw/$(DEPDIR)/$(am__dirstamp)
 rgw/test_build_librgw-rgw_common.$(OBJEXT): rgw/$(am__dirstamp) \
 	rgw/$(DEPDIR)/$(am__dirstamp)
 rgw/test_build_librgw-rgw_cache.$(OBJEXT): rgw/$(am__dirstamp) \
@@ -15464,6 +15869,72 @@ test/librbd/unittest_librbd-test_main.$(OBJEXT):  \
 test/librbd/unittest_librbd-test_mock_fixture.$(OBJEXT):  \
 	test/librbd/$(am__dirstamp) \
 	test/librbd/$(DEPDIR)/$(am__dirstamp)
+test/librbd/unittest_librbd-test_mock_ExclusiveLock.$(OBJEXT):  \
+	test/librbd/$(am__dirstamp) \
+	test/librbd/$(DEPDIR)/$(am__dirstamp)
+test/librbd/exclusive_lock/$(am__dirstamp):
+	@$(MKDIR_P) test/librbd/exclusive_lock
+	@: > test/librbd/exclusive_lock/$(am__dirstamp)
+test/librbd/exclusive_lock/$(DEPDIR)/$(am__dirstamp):
+	@$(MKDIR_P) test/librbd/exclusive_lock/$(DEPDIR)
+	@: > test/librbd/exclusive_lock/$(DEPDIR)/$(am__dirstamp)
+test/librbd/exclusive_lock/unittest_librbd-test_mock_AcquireRequest.$(OBJEXT):  \
+	test/librbd/exclusive_lock/$(am__dirstamp) \
+	test/librbd/exclusive_lock/$(DEPDIR)/$(am__dirstamp)
+test/librbd/exclusive_lock/unittest_librbd-test_mock_ReleaseRequest.$(OBJEXT):  \
+	test/librbd/exclusive_lock/$(am__dirstamp) \
+	test/librbd/exclusive_lock/$(DEPDIR)/$(am__dirstamp)
+test/librbd/object_map/$(am__dirstamp):
+	@$(MKDIR_P) test/librbd/object_map
+	@: > test/librbd/object_map/$(am__dirstamp)
+test/librbd/object_map/$(DEPDIR)/$(am__dirstamp):
+	@$(MKDIR_P) test/librbd/object_map/$(DEPDIR)
+	@: > test/librbd/object_map/$(DEPDIR)/$(am__dirstamp)
+test/librbd/object_map/unittest_librbd-test_mock_InvalidateRequest.$(OBJEXT):  \
+	test/librbd/object_map/$(am__dirstamp) \
+	test/librbd/object_map/$(DEPDIR)/$(am__dirstamp)
+test/librbd/object_map/unittest_librbd-test_mock_LockRequest.$(OBJEXT):  \
+	test/librbd/object_map/$(am__dirstamp) \
+	test/librbd/object_map/$(DEPDIR)/$(am__dirstamp)
+test/librbd/object_map/unittest_librbd-test_mock_RefreshRequest.$(OBJEXT):  \
+	test/librbd/object_map/$(am__dirstamp) \
+	test/librbd/object_map/$(DEPDIR)/$(am__dirstamp)
+test/librbd/object_map/unittest_librbd-test_mock_ResizeRequest.$(OBJEXT):  \
+	test/librbd/object_map/$(am__dirstamp) \
+	test/librbd/object_map/$(DEPDIR)/$(am__dirstamp)
+test/librbd/object_map/unittest_librbd-test_mock_SnapshotCreateRequest.$(OBJEXT):  \
+	test/librbd/object_map/$(am__dirstamp) \
+	test/librbd/object_map/$(DEPDIR)/$(am__dirstamp)
+test/librbd/object_map/unittest_librbd-test_mock_SnapshotRemoveRequest.$(OBJEXT):  \
+	test/librbd/object_map/$(am__dirstamp) \
+	test/librbd/object_map/$(DEPDIR)/$(am__dirstamp)
+test/librbd/object_map/unittest_librbd-test_mock_SnapshotRollbackRequest.$(OBJEXT):  \
+	test/librbd/object_map/$(am__dirstamp) \
+	test/librbd/object_map/$(DEPDIR)/$(am__dirstamp)
+test/librbd/object_map/unittest_librbd-test_mock_UnlockRequest.$(OBJEXT):  \
+	test/librbd/object_map/$(am__dirstamp) \
+	test/librbd/object_map/$(DEPDIR)/$(am__dirstamp)
+test/librbd/object_map/unittest_librbd-test_mock_UpdateRequest.$(OBJEXT):  \
+	test/librbd/object_map/$(am__dirstamp) \
+	test/librbd/object_map/$(DEPDIR)/$(am__dirstamp)
+test/librbd/operation/$(am__dirstamp):
+	@$(MKDIR_P) test/librbd/operation
+	@: > test/librbd/operation/$(am__dirstamp)
+test/librbd/operation/$(DEPDIR)/$(am__dirstamp):
+	@$(MKDIR_P) test/librbd/operation/$(DEPDIR)
+	@: > test/librbd/operation/$(DEPDIR)/$(am__dirstamp)
+test/librbd/operation/unittest_librbd-test_mock_SnapshotCreateRequest.$(OBJEXT):  \
+	test/librbd/operation/$(am__dirstamp) \
+	test/librbd/operation/$(DEPDIR)/$(am__dirstamp)
+test/librbd/operation/unittest_librbd-test_mock_SnapshotProtectRequest.$(OBJEXT):  \
+	test/librbd/operation/$(am__dirstamp) \
+	test/librbd/operation/$(DEPDIR)/$(am__dirstamp)
+test/librbd/operation/unittest_librbd-test_mock_SnapshotRemoveRequest.$(OBJEXT):  \
+	test/librbd/operation/$(am__dirstamp) \
+	test/librbd/operation/$(DEPDIR)/$(am__dirstamp)
+test/librbd/operation/unittest_librbd-test_mock_SnapshotUnprotectRequest.$(OBJEXT):  \
+	test/librbd/operation/$(am__dirstamp) \
+	test/librbd/operation/$(DEPDIR)/$(am__dirstamp)
 
 unittest_librbd$(EXEEXT): $(unittest_librbd_OBJECTS) $(unittest_librbd_DEPENDENCIES) $(EXTRA_unittest_librbd_DEPENDENCIES) 
 	@rm -f unittest_librbd$(EXEEXT)
@@ -16119,6 +16590,14 @@ mostlyclean-compile:
 	-rm -f libradosstriper/*.lo
 	-rm -f librbd/*.$(OBJEXT)
 	-rm -f librbd/*.lo
+	-rm -f librbd/exclusive_lock/*.$(OBJEXT)
+	-rm -f librbd/exclusive_lock/*.lo
+	-rm -f librbd/image/*.$(OBJEXT)
+	-rm -f librbd/image/*.lo
+	-rm -f librbd/object_map/*.$(OBJEXT)
+	-rm -f librbd/object_map/*.lo
+	-rm -f librbd/operation/*.$(OBJEXT)
+	-rm -f librbd/operation/*.lo
 	-rm -f log/*.$(OBJEXT)
 	-rm -f log/*.lo
 	-rm -f mds/*.$(OBJEXT)
@@ -16180,6 +16659,9 @@ mostlyclean-compile:
 	-rm -f test/libradosstriper/*.lo
 	-rm -f test/librbd/*.$(OBJEXT)
 	-rm -f test/librbd/*.lo
+	-rm -f test/librbd/exclusive_lock/*.$(OBJEXT)
+	-rm -f test/librbd/object_map/*.$(OBJEXT)
+	-rm -f test/librbd/operation/*.$(OBJEXT)
 	-rm -f test/mds/*.$(OBJEXT)
 	-rm -f test/messenger/*.$(OBJEXT)
 	-rm -f test/mon/*.$(OBJEXT)
@@ -16196,6 +16678,7 @@ mostlyclean-compile:
 	-rm -f tools/rados/*.$(OBJEXT)
 	-rm -f tools/rbd/*.$(OBJEXT)
 	-rm -f tools/rbd/action/*.$(OBJEXT)
+	-rm -f tools/rbd_nbd/*.$(OBJEXT)
 	-rm -f tracing/*.$(OBJEXT)
 	-rm -f tracing/*.lo
 
@@ -16258,6 +16741,7 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote at cls/numops/$(DEPDIR)/cls_numops_client.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at cls/rbd/$(DEPDIR)/cls_rbd.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at cls/rbd/$(DEPDIR)/cls_rbd_client.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at cls/rbd/$(DEPDIR)/cls_rbd_types.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at cls/refcount/$(DEPDIR)/cls_refcount.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at cls/refcount/$(DEPDIR)/cls_refcount_client.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at cls/refcount/$(DEPDIR)/cls_refcount_ops.Plo at am__quote@
@@ -16294,6 +16778,7 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote at common/$(DEPDIR)/MemoryModel.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at common/$(DEPDIR)/Mutex.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at common/$(DEPDIR)/OutputDataSocket.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at common/$(DEPDIR)/PluginRegistry.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at common/$(DEPDIR)/PrebufferedStreambuf.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at common/$(DEPDIR)/Readahead.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at common/$(DEPDIR)/RefCountedObj.Plo at am__quote@
@@ -16309,6 +16794,7 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote at common/$(DEPDIR)/address_helper.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at common/$(DEPDIR)/admin_socket.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at common/$(DEPDIR)/admin_socket_client.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at common/$(DEPDIR)/aix_errno.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at common/$(DEPDIR)/armor.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at common/$(DEPDIR)/assert.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at common/$(DEPDIR)/blkdev.Plo at am__quote@
@@ -16718,15 +17204,14 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote at librbd/$(DEPDIR)/AioImageRequest.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at librbd/$(DEPDIR)/AioImageRequestWQ.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at librbd/$(DEPDIR)/AioObjectRequest.Plo at am__quote@
- at AMDEP_TRUE@@am__include@ @am__quote at librbd/$(DEPDIR)/AsyncFlattenRequest.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at librbd/$(DEPDIR)/AsyncObjectThrottle.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at librbd/$(DEPDIR)/AsyncOperation.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at librbd/$(DEPDIR)/AsyncRequest.Plo at am__quote@
- at AMDEP_TRUE@@am__include@ @am__quote at librbd/$(DEPDIR)/AsyncResizeRequest.Plo at am__quote@
- at AMDEP_TRUE@@am__include@ @am__quote at librbd/$(DEPDIR)/AsyncTrimRequest.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at librbd/$(DEPDIR)/CopyupRequest.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at librbd/$(DEPDIR)/DiffIterate.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at librbd/$(DEPDIR)/ExclusiveLock.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at librbd/$(DEPDIR)/ImageCtx.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at librbd/$(DEPDIR)/ImageState.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at librbd/$(DEPDIR)/ImageWatcher.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at librbd/$(DEPDIR)/Journal.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at librbd/$(DEPDIR)/JournalReplay.Plo at am__quote@
@@ -16734,11 +17219,40 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote at librbd/$(DEPDIR)/LibrbdAdminSocketHook.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at librbd/$(DEPDIR)/LibrbdWriteback.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at librbd/$(DEPDIR)/ObjectMap.Plo at am__quote@
- at AMDEP_TRUE@@am__include@ @am__quote at librbd/$(DEPDIR)/RebuildObjectMapRequest.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at librbd/$(DEPDIR)/Utils.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at librbd/$(DEPDIR)/WatchNotifyTypes.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at librbd/$(DEPDIR)/internal.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at librbd/$(DEPDIR)/librbd.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at librbd/$(DEPDIR)/librbd_la-librbd.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at librbd/exclusive_lock/$(DEPDIR)/AcquireRequest.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at librbd/exclusive_lock/$(DEPDIR)/ReleaseRequest.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at librbd/image/$(DEPDIR)/CloseRequest.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at librbd/image/$(DEPDIR)/OpenRequest.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at librbd/image/$(DEPDIR)/RefreshParentRequest.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at librbd/image/$(DEPDIR)/RefreshRequest.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at librbd/image/$(DEPDIR)/SetSnapRequest.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at librbd/object_map/$(DEPDIR)/InvalidateRequest.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at librbd/object_map/$(DEPDIR)/LockRequest.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at librbd/object_map/$(DEPDIR)/RefreshRequest.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at librbd/object_map/$(DEPDIR)/Request.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at librbd/object_map/$(DEPDIR)/ResizeRequest.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at librbd/object_map/$(DEPDIR)/SnapshotCreateRequest.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at librbd/object_map/$(DEPDIR)/SnapshotRemoveRequest.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at librbd/object_map/$(DEPDIR)/SnapshotRollbackRequest.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at librbd/object_map/$(DEPDIR)/UnlockRequest.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at librbd/object_map/$(DEPDIR)/UpdateRequest.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at librbd/operation/$(DEPDIR)/FlattenRequest.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at librbd/operation/$(DEPDIR)/RebuildObjectMapRequest.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at librbd/operation/$(DEPDIR)/RenameRequest.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at librbd/operation/$(DEPDIR)/Request.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at librbd/operation/$(DEPDIR)/ResizeRequest.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at librbd/operation/$(DEPDIR)/SnapshotCreateRequest.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at librbd/operation/$(DEPDIR)/SnapshotProtectRequest.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at librbd/operation/$(DEPDIR)/SnapshotRemoveRequest.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at librbd/operation/$(DEPDIR)/SnapshotRenameRequest.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at librbd/operation/$(DEPDIR)/SnapshotRollbackRequest.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at librbd/operation/$(DEPDIR)/SnapshotUnprotectRequest.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at librbd/operation/$(DEPDIR)/TrimRequest.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at log/$(DEPDIR)/Log.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at log/$(DEPDIR)/SubsystemMap.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at log/$(DEPDIR)/unittest_log-test.Po at am__quote@
@@ -16923,6 +17437,7 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote at rbd_replay/$(DEPDIR)/rbd-replay.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at rbd_replay/$(DEPDIR)/rbd_loc.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at rgw/$(DEPDIR)/ceph_dencoder-rgw_acl.Po at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at rgw/$(DEPDIR)/ceph_dencoder-rgw_basic_types.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at rgw/$(DEPDIR)/ceph_dencoder-rgw_common.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at rgw/$(DEPDIR)/ceph_dencoder-rgw_dencoder.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at rgw/$(DEPDIR)/ceph_dencoder-rgw_env.Po at am__quote@
@@ -16934,6 +17449,7 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote at rgw/$(DEPDIR)/librgw_la-rgw_acl_s3.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at rgw/$(DEPDIR)/librgw_la-rgw_acl_swift.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at rgw/$(DEPDIR)/librgw_la-rgw_auth_s3.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at rgw/$(DEPDIR)/librgw_la-rgw_basic_types.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at rgw/$(DEPDIR)/librgw_la-rgw_bucket.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at rgw/$(DEPDIR)/librgw_la-rgw_cache.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at rgw/$(DEPDIR)/librgw_la-rgw_client_io.Plo at am__quote@
@@ -16995,6 +17511,7 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote at rgw/$(DEPDIR)/test_build_librgw-rgw_acl_s3.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at rgw/$(DEPDIR)/test_build_librgw-rgw_acl_swift.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at rgw/$(DEPDIR)/test_build_librgw-rgw_auth_s3.Po at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at rgw/$(DEPDIR)/test_build_librgw-rgw_basic_types.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at rgw/$(DEPDIR)/test_build_librgw-rgw_bucket.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at rgw/$(DEPDIR)/test_build_librgw-rgw_cache.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at rgw/$(DEPDIR)/test_build_librgw-rgw_client_io.Po at am__quote@
@@ -17238,7 +17755,23 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote at test/librbd/$(DEPDIR)/librbd_test_la-test_librbd.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/librbd/$(DEPDIR)/librbd_test_la-test_support.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/librbd/$(DEPDIR)/unittest_librbd-test_main.Po at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at test/librbd/$(DEPDIR)/unittest_librbd-test_mock_ExclusiveLock.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/librbd/$(DEPDIR)/unittest_librbd-test_mock_fixture.Po at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at test/librbd/exclusive_lock/$(DEPDIR)/unittest_librbd-test_mock_AcquireRequest.Po at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at test/librbd/exclusive_lock/$(DEPDIR)/unittest_librbd-test_mock_ReleaseRequest.Po at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at test/librbd/object_map/$(DEPDIR)/unittest_librbd-test_mock_InvalidateRequest.Po at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at test/librbd/object_map/$(DEPDIR)/unittest_librbd-test_mock_LockRequest.Po at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at test/librbd/object_map/$(DEPDIR)/unittest_librbd-test_mock_RefreshRequest.Po at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at test/librbd/object_map/$(DEPDIR)/unittest_librbd-test_mock_ResizeRequest.Po at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at test/librbd/object_map/$(DEPDIR)/unittest_librbd-test_mock_SnapshotCreateRequest.Po at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at test/librbd/object_map/$(DEPDIR)/unittest_librbd-test_mock_SnapshotRemoveRequest.Po at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at test/librbd/object_map/$(DEPDIR)/unittest_librbd-test_mock_SnapshotRollbackRequest.Po at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at test/librbd/object_map/$(DEPDIR)/unittest_librbd-test_mock_UnlockRequest.Po at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at test/librbd/object_map/$(DEPDIR)/unittest_librbd-test_mock_UpdateRequest.Po at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at test/librbd/operation/$(DEPDIR)/unittest_librbd-test_mock_SnapshotCreateRequest.Po at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at test/librbd/operation/$(DEPDIR)/unittest_librbd-test_mock_SnapshotProtectRequest.Po at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at test/librbd/operation/$(DEPDIR)/unittest_librbd-test_mock_SnapshotRemoveRequest.Po at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at test/librbd/operation/$(DEPDIR)/unittest_librbd-test_mock_SnapshotUnprotectRequest.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/mds/$(DEPDIR)/unittest_mds_authcap-TestMDSAuthCaps.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/messenger/$(DEPDIR)/simple_client-simple_client.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/messenger/$(DEPDIR)/simple_client-simple_dispatcher.Po at am__quote@
@@ -17349,10 +17882,13 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote at tools/rbd/action/$(DEPDIR)/Import.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at tools/rbd/action/$(DEPDIR)/ImportDiff.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at tools/rbd/action/$(DEPDIR)/Info.Po at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at tools/rbd/action/$(DEPDIR)/Journal.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at tools/rbd/action/$(DEPDIR)/Kernel.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at tools/rbd/action/$(DEPDIR)/List.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at tools/rbd/action/$(DEPDIR)/Lock.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at tools/rbd/action/$(DEPDIR)/MergeDiff.Po at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at tools/rbd/action/$(DEPDIR)/MirrorPool.Po at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at tools/rbd/action/$(DEPDIR)/Nbd.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at tools/rbd/action/$(DEPDIR)/ObjectMap.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at tools/rbd/action/$(DEPDIR)/Remove.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at tools/rbd/action/$(DEPDIR)/Rename.Po at am__quote@
@@ -17360,6 +17896,7 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote at tools/rbd/action/$(DEPDIR)/Snap.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at tools/rbd/action/$(DEPDIR)/Status.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at tools/rbd/action/$(DEPDIR)/Watch.Po at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at tools/rbd_nbd/$(DEPDIR)/rbd_nbd-rbd-nbd.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at tracing/$(DEPDIR)/libos_tp_la-objectstore.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at tracing/$(DEPDIR)/libosd_tp_la-oprequest.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at tracing/$(DEPDIR)/libosd_tp_la-osd.Plo at am__quote@
@@ -20729,6 +21266,13 @@ rgw/librgw_la-rgw_op.lo: rgw/rgw_op.cc
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(librgw_la_CXXFLAGS) $(CXXFLAGS) -c -o rgw/librgw_la-rgw_op.lo `test -f 'rgw/rgw_op.cc' || echo '$(srcdir)/'`rgw/rgw_op.cc
 
+rgw/librgw_la-rgw_basic_types.lo: rgw/rgw_basic_types.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(librgw_la_CXXFLAGS) $(CXXFLAGS) -MT rgw/librgw_la-rgw_basic_types.lo -MD -MP -MF rgw/$(DEPDIR)/librgw_la-rgw_basic_types.Tpo -c -o rgw/librgw_la-rgw_basic_types.lo `test -f 'rgw/rgw_basic_types.cc' || echo '$(srcdir)/'`rgw/rgw_basic_types.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) rgw/$(DEPDIR)/librgw_la-rgw_basic_types.Tpo rgw/$(DEPDIR)/librgw_la-rgw_basic_types.Plo
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='rgw/rgw_basic_types.cc' object='rgw/librgw_la-rgw_basic_types.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(librgw_la_CXXFLAGS) $(CXXFLAGS) -c -o rgw/librgw_la-rgw_basic_types.lo `test -f 'rgw/rgw_basic_types.cc' || echo '$(srcdir)/'`rgw/rgw_basic_types.cc
+
 rgw/librgw_la-rgw_common.lo: rgw/rgw_common.cc
 @am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(librgw_la_CXXFLAGS) $(CXXFLAGS) -MT rgw/librgw_la-rgw_common.lo -MD -MP -MF rgw/$(DEPDIR)/librgw_la-rgw_common.Tpo -c -o rgw/librgw_la-rgw_common.lo `test -f 'rgw/rgw_common.cc' || echo '$(srcdir)/'`rgw/rgw_common.cc
 @am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) rgw/$(DEPDIR)/librgw_la-rgw_common.Tpo rgw/$(DEPDIR)/librgw_la-rgw_common.Plo
@@ -21359,6 +21903,20 @@ rgw/ceph_dencoder-rgw_acl.obj: rgw/rgw_acl.cc
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ceph_dencoder_CXXFLAGS) $(CXXFLAGS) -c -o rgw/ceph_dencoder-rgw_acl.obj `if test -f 'rgw/rgw_acl.cc'; then $(CYGPATH_W) 'rgw/rgw_acl.cc'; else $(CYGPATH_W) '$(srcdir)/rgw/rgw_acl.cc'; fi`
 
+rgw/ceph_dencoder-rgw_basic_types.o: rgw/rgw_basic_types.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ceph_dencoder_CXXFLAGS) $(CXXFLAGS) -MT rgw/ceph_dencoder-rgw_basic_types.o -MD -MP -MF rgw/$(DEPDIR)/ceph_dencoder-rgw_basic_types.Tpo -c -o rgw/ceph_dencoder-rgw_basic_types.o `test -f 'rgw/rgw_basic_types.cc' || echo '$(srcdir)/'`rgw/rgw_basic_types.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) rgw/$(DEPDIR)/ceph_dencoder-rgw_basic_types.Tpo rgw/$(DEPDIR)/ceph_dencoder-rgw_basic_types.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='rgw/rgw_basic_types.cc' object='rgw/ceph_dencoder-rgw_basic_types.o' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ceph_dencoder_CXXFLAGS) $(CXXFLAGS) -c -o rgw/ceph_dencoder-rgw_basic_types.o `test -f 'rgw/rgw_basic_types.cc' || echo '$(srcdir)/'`rgw/rgw_basic_types.cc
+
+rgw/ceph_dencoder-rgw_basic_types.obj: rgw/rgw_basic_types.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ceph_dencoder_CXXFLAGS) $(CXXFLAGS) -MT rgw/ceph_dencoder-rgw_basic_types.obj -MD -MP -MF rgw/$(DEPDIR)/ceph_dencoder-rgw_basic_types.Tpo -c -o rgw/ceph_dencoder-rgw_basic_types.obj `if test -f 'rgw/rgw_basic_types.cc'; then $(CYGPATH_W) 'rgw/rgw_basic_types.cc'; else $(CYGPATH_W) '$(srcdir)/rgw/rgw_basic_types.cc'; fi`
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) rgw/$(DEPDIR)/ceph_dencoder-rgw_basic_types.Tpo rgw/$(DEPDIR)/ceph_dencoder-rgw_basic_types.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='rgw/rgw_basic_types.cc' object='rgw/ceph_dencoder-rgw_basic_types.obj' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ceph_dencoder_CXXFLAGS) $(CXXFLAGS) -c -o rgw/ceph_dencoder-rgw_basic_types.obj `if test -f 'rgw/rgw_basic_types.cc'; then $(CYGPATH_W) 'rgw/rgw_basic_types.cc'; else $(CYGPATH_W) '$(srcdir)/rgw/rgw_basic_types.cc'; fi`
+
 rgw/ceph_dencoder-rgw_common.o: rgw/rgw_common.cc
 @am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ceph_dencoder_CXXFLAGS) $(CXXFLAGS) -MT rgw/ceph_dencoder-rgw_common.o -MD -MP -MF rgw/$(DEPDIR)/ceph_dencoder-rgw_common.Tpo -c -o rgw/ceph_dencoder-rgw_common.o `test -f 'rgw/rgw_common.cc' || echo '$(srcdir)/'`rgw/rgw_common.cc
 @am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) rgw/$(DEPDIR)/ceph_dencoder-rgw_common.Tpo rgw/$(DEPDIR)/ceph_dencoder-rgw_common.Po
@@ -22367,6 +22925,20 @@ rbd_fuse/rbd_fuse-rbd-fuse.obj: rbd_fuse/rbd-fuse.cc
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(rbd_fuse_CXXFLAGS) $(CXXFLAGS) -c -o rbd_fuse/rbd_fuse-rbd-fuse.obj `if test -f 'rbd_fuse/rbd-fuse.cc'; then $(CYGPATH_W) 'rbd_fuse/rbd-fuse.cc'; else $(CYGPATH_W) '$(srcdir)/rbd_fuse/rbd-fuse.cc'; fi`
 
+tools/rbd_nbd/rbd_nbd-rbd-nbd.o: tools/rbd_nbd/rbd-nbd.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(rbd_nbd_CXXFLAGS) $(CXXFLAGS) -MT tools/rbd_nbd/rbd_nbd-rbd-nbd.o -MD -MP -MF tools/rbd_nbd/$(DEPDIR)/rbd_nbd-rbd-nbd.Tpo -c -o tools/rbd_nbd/rbd_nbd-rbd-nbd.o `test -f 'tools/rbd_nbd/rbd-nbd.cc' || echo '$(srcdir)/'`tools/rbd_nbd/rbd-nbd.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) tools/rbd_nbd/$(DEPDIR)/rbd_nbd-rbd-nbd.Tpo tools/rbd_nbd/$(DEPDIR)/rbd_nbd-rbd-nbd.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='tools/rbd_nbd/rbd-nbd.cc' object='tools/rbd_nbd/rbd_nbd-rbd-nbd.o' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(rbd_nbd_CXXFLAGS) $(CXXFLAGS) -c -o tools/rbd_nbd/rbd_nbd-rbd-nbd.o `test -f 'tools/rbd_nbd/rbd-nbd.cc' || echo '$(srcdir)/'`tools/rbd_nbd/rbd-nbd.cc
+
+tools/rbd_nbd/rbd_nbd-rbd-nbd.obj: tools/rbd_nbd/rbd-nbd.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(rbd_nbd_CXXFLAGS) $(CXXFLAGS) -MT tools/rbd_nbd/rbd_nbd-rbd-nbd.obj -MD -MP -MF tools/rbd_nbd/$(DEPDIR)/rbd_nbd-rbd-nbd.Tpo -c -o tools/rbd_nbd/rbd_nbd-rbd-nbd.obj `if test -f 'tools/rbd_nbd/rbd-nbd.cc'; then $(CYGPATH_W) 'tools/rbd_nbd/rbd-nbd.cc'; else $(CYGPATH_W) '$(srcdir)/tools/rbd_nbd/rbd-nbd.cc'; fi`
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) tools/rbd_nbd/$(DEPDIR)/rbd_nbd-rbd-nbd.Tpo tools/rbd_nbd/$(DEPDIR)/rbd_nbd-rbd-nbd.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='tools/rbd_nbd/rbd-nbd.cc' object='tools/rbd_nbd/rbd_nbd-rbd-nbd.obj' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(rbd_nbd_CXXFLAGS) $(CXXFLAGS) -c -o tools/rbd_nbd/rbd_nbd-rbd-nbd.obj `if test -f 'tools/rbd_nbd/rbd-nbd.cc'; then $(CYGPATH_W) 'tools/rbd_nbd/rbd-nbd.cc'; else $(CYGPATH_W) '$(srcdir)/tools/rbd_nbd/rbd-nbd.cc'; fi`
+
 test/messenger/simple_client-simple_client.o: test/messenger/simple_client.cc
 @am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(simple_client_CXXFLAGS) $(CXXFLAGS) -MT test/messenger/simple_client-simple_client.o -MD -MP -MF test/messenger/$(DEPDIR)/simple_client-simple_client.Tpo -c -o test/messenger/simple_client-simple_client.o `test -f 'test/messenger/simple_client.cc' || echo '$(srcdir)/'`test/messenger/simple_client.cc
 @am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/messenger/$(DEPDIR)/simple_client-simple_client.Tpo test/messenger/$(DEPDIR)/simple_client-simple_client.Po
@@ -22829,6 +23401,20 @@ rgw/test_build_librgw-rgw_op.obj: rgw/rgw_op.cc
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_build_librgw_CXXFLAGS) $(CXXFLAGS) -c -o rgw/test_build_librgw-rgw_op.obj `if test -f 'rgw/rgw_op.cc'; then $(CYGPATH_W) 'rgw/rgw_op.cc'; else $(CYGPATH_W) '$(srcdir)/rgw/rgw_op.cc'; fi`
 
+rgw/test_build_librgw-rgw_basic_types.o: rgw/rgw_basic_types.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_build_librgw_CXXFLAGS) $(CXXFLAGS) -MT rgw/test_build_librgw-rgw_basic_types.o -MD -MP -MF rgw/$(DEPDIR)/test_build_librgw-rgw_basic_types.Tpo -c -o rgw/test_build_librgw-rgw_basic_types.o `test -f 'rgw/rgw_basic_types.cc' || echo '$(srcdir)/'`rgw/rgw_basic_types.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) rgw/$(DEPDIR)/test_build_librgw-rgw_basic_types.Tpo rgw/$(DEPDIR)/test_build_librgw-rgw_basic_types.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='rgw/rgw_basic_types.cc' object='rgw/test_build_librgw-rgw_basic_types.o' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_build_librgw_CXXFLAGS) $(CXXFLAGS) -c -o rgw/test_build_librgw-rgw_basic_types.o `test -f 'rgw/rgw_basic_types.cc' || echo '$(srcdir)/'`rgw/rgw_basic_types.cc
+
+rgw/test_build_librgw-rgw_basic_types.obj: rgw/rgw_basic_types.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_build_librgw_CXXFLAGS) $(CXXFLAGS) -MT rgw/test_build_librgw-rgw_basic_types.obj -MD -MP -MF rgw/$(DEPDIR)/test_build_librgw-rgw_basic_types.Tpo -c -o rgw/test_build_librgw-rgw_basic_types.obj `if test -f 'rgw/rgw_basic_types.cc'; then $(CYGPATH_W) 'rgw/rgw_basic_types.cc'; else $(CYGPATH_W) '$(srcdir)/rgw/rgw_basic_types.cc'; fi`
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) rgw/$(DEPDIR)/test_build_librgw-rgw_basic_types.Tpo rgw/$(DEPDIR)/test_build_librgw-rgw_basic_types.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='rgw/rgw_basic_types.cc' object='rgw/test_build_librgw-rgw_basic_types.obj' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_build_librgw_CXXFLAGS) $(CXXFLAGS) -c -o rgw/test_build_librgw-rgw_basic_types.obj `if test -f 'rgw/rgw_basic_types.cc'; then $(CYGPATH_W) 'rgw/rgw_basic_types.cc'; else $(CYGPATH_W) '$(srcdir)/rgw/rgw_basic_types.cc'; fi`
+
 rgw/test_build_librgw-rgw_common.o: rgw/rgw_common.cc
 @am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_build_librgw_CXXFLAGS) $(CXXFLAGS) -MT rgw/test_build_librgw-rgw_common.o -MD -MP -MF rgw/$(DEPDIR)/test_build_librgw-rgw_common.Tpo -c -o rgw/test_build_librgw-rgw_common.o `test -f 'rgw/rgw_common.cc' || echo '$(srcdir)/'`rgw/rgw_common.cc
 @am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) rgw/$(DEPDIR)/test_build_librgw-rgw_common.Tpo rgw/$(DEPDIR)/test_build_librgw-rgw_common.Po
@@ -24355,6 +24941,230 @@ test/librbd/unittest_librbd-test_mock_fixture.obj: test/librbd/test_mock_fixture
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_librbd_CXXFLAGS) $(CXXFLAGS) -c -o test/librbd/unittest_librbd-test_mock_fixture.obj `if test -f 'test/librbd/test_mock_fixture.cc'; then $(CYGPATH_W) 'test/librbd/test_mock_fixture.cc'; else $(CYGPATH_W) '$(srcdir)/test/librbd/test_mock_fixture.cc'; fi`
 
+test/librbd/unittest_librbd-test_mock_ExclusiveLock.o: test/librbd/test_mock_ExclusiveLock.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_librbd_CXXFLAGS) $(CXXFLAGS) -MT test/librbd/unittest_librbd-test_mock_ExclusiveLock.o -MD -MP -MF test/librbd/$(DEPDIR)/unittest_librbd-test_mock_ExclusiveLock.Tpo -c -o test/librbd/unittest_librbd-test_mock_ExclusiveLock.o `test -f 'test/librbd/test_mock_ExclusiveLock.cc' || echo '$(srcdir)/'`test/librbd/test_mock_ExclusiveLock.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/librbd/$(DEPDIR)/unittest_librbd-test_mock_ExclusiveLock.Tpo test/librbd/$(DEPDIR)/unittest_librbd-test_mock_ExclusiveLock.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/librbd/test_mock_ExclusiveLock.cc' object='test/librbd/unittest_librbd-test_mock_ExclusiveLock.o' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_librbd_CXXFLAGS) $(CXXFLAGS) -c -o test/librbd/unittest_librbd-test_mock_ExclusiveLock.o `test -f 'test/librbd/test_mock_ExclusiveLock.cc' || echo '$(srcdir)/'`test/librbd/test_mock_ExclusiveLock.cc
+
+test/librbd/unittest_librbd-test_mock_ExclusiveLock.obj: test/librbd/test_mock_ExclusiveLock.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_librbd_CXXFLAGS) $(CXXFLAGS) -MT test/librbd/unittest_librbd-test_mock_ExclusiveLock.obj -MD -MP -MF test/librbd/$(DEPDIR)/unittest_librbd-test_mock_ExclusiveLock.Tpo -c -o test/librbd/unittest_librbd-test_mock_ExclusiveLock.obj `if test -f 'test/librbd/test_mock_ExclusiveLock.cc'; then $(CYGPATH_W) 'test/librbd/test_mock_ExclusiveLock.cc'; else $(CYGPATH_W) '$(srcdir)/te [...]
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/librbd/$(DEPDIR)/unittest_librbd-test_mock_ExclusiveLock.Tpo test/librbd/$(DEPDIR)/unittest_librbd-test_mock_ExclusiveLock.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/librbd/test_mock_ExclusiveLock.cc' object='test/librbd/unittest_librbd-test_mock_ExclusiveLock.obj' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_librbd_CXXFLAGS) $(CXXFLAGS) -c -o test/librbd/unittest_librbd-test_mock_ExclusiveLock.obj `if test -f 'test/librbd/test_mock_ExclusiveLock.cc'; then $(CYGPATH_W) 'test/librbd/test_mock_ExclusiveLock.cc'; else $(CYGPATH_W) '$(srcdir)/test/librbd/test_mock_ExclusiveLock.cc'; fi`
+
+test/librbd/exclusive_lock/unittest_librbd-test_mock_AcquireRequest.o: test/librbd/exclusive_lock/test_mock_AcquireRequest.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_librbd_CXXFLAGS) $(CXXFLAGS) -MT test/librbd/exclusive_lock/unittest_librbd-test_mock_AcquireRequest.o -MD -MP -MF test/librbd/exclusive_lock/$(DEPDIR)/unittest_librbd-test_mock_AcquireRequest.Tpo -c -o test/librbd/exclusive_lock/unittest_librbd-test_mock_AcquireRequest.o `test -f 'test/librbd/exclusive_lock/test_mock_AcquireRequest.cc' || echo '$(srcdir)/'`test/librbd/ex [...]
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/librbd/exclusive_lock/$(DEPDIR)/unittest_librbd-test_mock_AcquireRequest.Tpo test/librbd/exclusive_lock/$(DEPDIR)/unittest_librbd-test_mock_AcquireRequest.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/librbd/exclusive_lock/test_mock_AcquireRequest.cc' object='test/librbd/exclusive_lock/unittest_librbd-test_mock_AcquireRequest.o' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_librbd_CXXFLAGS) $(CXXFLAGS) -c -o test/librbd/exclusive_lock/unittest_librbd-test_mock_AcquireRequest.o `test -f 'test/librbd/exclusive_lock/test_mock_AcquireRequest.cc' || echo '$(srcdir)/'`test/librbd/exclusive_lock/test_mock_AcquireRequest.cc
+
+test/librbd/exclusive_lock/unittest_librbd-test_mock_AcquireRequest.obj: test/librbd/exclusive_lock/test_mock_AcquireRequest.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_librbd_CXXFLAGS) $(CXXFLAGS) -MT test/librbd/exclusive_lock/unittest_librbd-test_mock_AcquireRequest.obj -MD -MP -MF test/librbd/exclusive_lock/$(DEPDIR)/unittest_librbd-test_mock_AcquireRequest.Tpo -c -o test/librbd/exclusive_lock/unittest_librbd-test_mock_AcquireRequest.obj `if test -f 'test/librbd/exclusive_lock/test_mock_AcquireRequest.cc'; then $(CYGPATH_W) 'test/lib [...]
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/librbd/exclusive_lock/$(DEPDIR)/unittest_librbd-test_mock_AcquireRequest.Tpo test/librbd/exclusive_lock/$(DEPDIR)/unittest_librbd-test_mock_AcquireRequest.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/librbd/exclusive_lock/test_mock_AcquireRequest.cc' object='test/librbd/exclusive_lock/unittest_librbd-test_mock_AcquireRequest.obj' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_librbd_CXXFLAGS) $(CXXFLAGS) -c -o test/librbd/exclusive_lock/unittest_librbd-test_mock_AcquireRequest.obj `if test -f 'test/librbd/exclusive_lock/test_mock_AcquireRequest.cc'; then $(CYGPATH_W) 'test/librbd/exclusive_lock/test_mock_AcquireRequest.cc'; else $(CYGPATH_W) '$(srcdir)/test/librbd/exclusive_lock/test_mock_AcquireRequest.cc'; fi`
+
+test/librbd/exclusive_lock/unittest_librbd-test_mock_ReleaseRequest.o: test/librbd/exclusive_lock/test_mock_ReleaseRequest.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_librbd_CXXFLAGS) $(CXXFLAGS) -MT test/librbd/exclusive_lock/unittest_librbd-test_mock_ReleaseRequest.o -MD -MP -MF test/librbd/exclusive_lock/$(DEPDIR)/unittest_librbd-test_mock_ReleaseRequest.Tpo -c -o test/librbd/exclusive_lock/unittest_librbd-test_mock_ReleaseRequest.o `test -f 'test/librbd/exclusive_lock/test_mock_ReleaseRequest.cc' || echo '$(srcdir)/'`test/librbd/ex [...]
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/librbd/exclusive_lock/$(DEPDIR)/unittest_librbd-test_mock_ReleaseRequest.Tpo test/librbd/exclusive_lock/$(DEPDIR)/unittest_librbd-test_mock_ReleaseRequest.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/librbd/exclusive_lock/test_mock_ReleaseRequest.cc' object='test/librbd/exclusive_lock/unittest_librbd-test_mock_ReleaseRequest.o' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_librbd_CXXFLAGS) $(CXXFLAGS) -c -o test/librbd/exclusive_lock/unittest_librbd-test_mock_ReleaseRequest.o `test -f 'test/librbd/exclusive_lock/test_mock_ReleaseRequest.cc' || echo '$(srcdir)/'`test/librbd/exclusive_lock/test_mock_ReleaseRequest.cc
+
+test/librbd/exclusive_lock/unittest_librbd-test_mock_ReleaseRequest.obj: test/librbd/exclusive_lock/test_mock_ReleaseRequest.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_librbd_CXXFLAGS) $(CXXFLAGS) -MT test/librbd/exclusive_lock/unittest_librbd-test_mock_ReleaseRequest.obj -MD -MP -MF test/librbd/exclusive_lock/$(DEPDIR)/unittest_librbd-test_mock_ReleaseRequest.Tpo -c -o test/librbd/exclusive_lock/unittest_librbd-test_mock_ReleaseRequest.obj `if test -f 'test/librbd/exclusive_lock/test_mock_ReleaseRequest.cc'; then $(CYGPATH_W) 'test/lib [...]
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/librbd/exclusive_lock/$(DEPDIR)/unittest_librbd-test_mock_ReleaseRequest.Tpo test/librbd/exclusive_lock/$(DEPDIR)/unittest_librbd-test_mock_ReleaseRequest.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/librbd/exclusive_lock/test_mock_ReleaseRequest.cc' object='test/librbd/exclusive_lock/unittest_librbd-test_mock_ReleaseRequest.obj' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_librbd_CXXFLAGS) $(CXXFLAGS) -c -o test/librbd/exclusive_lock/unittest_librbd-test_mock_ReleaseRequest.obj `if test -f 'test/librbd/exclusive_lock/test_mock_ReleaseRequest.cc'; then $(CYGPATH_W) 'test/librbd/exclusive_lock/test_mock_ReleaseRequest.cc'; else $(CYGPATH_W) '$(srcdir)/test/librbd/exclusive_lock/test_mock_ReleaseRequest.cc'; fi`
+
+test/librbd/object_map/unittest_librbd-test_mock_InvalidateRequest.o: test/librbd/object_map/test_mock_InvalidateRequest.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_librbd_CXXFLAGS) $(CXXFLAGS) -MT test/librbd/object_map/unittest_librbd-test_mock_InvalidateRequest.o -MD -MP -MF test/librbd/object_map/$(DEPDIR)/unittest_librbd-test_mock_InvalidateRequest.Tpo -c -o test/librbd/object_map/unittest_librbd-test_mock_InvalidateRequest.o `test -f 'test/librbd/object_map/test_mock_InvalidateRequest.cc' || echo '$(srcdir)/'`test/librbd/object [...]
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/librbd/object_map/$(DEPDIR)/unittest_librbd-test_mock_InvalidateRequest.Tpo test/librbd/object_map/$(DEPDIR)/unittest_librbd-test_mock_InvalidateRequest.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/librbd/object_map/test_mock_InvalidateRequest.cc' object='test/librbd/object_map/unittest_librbd-test_mock_InvalidateRequest.o' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_librbd_CXXFLAGS) $(CXXFLAGS) -c -o test/librbd/object_map/unittest_librbd-test_mock_InvalidateRequest.o `test -f 'test/librbd/object_map/test_mock_InvalidateRequest.cc' || echo '$(srcdir)/'`test/librbd/object_map/test_mock_InvalidateRequest.cc
+
+test/librbd/object_map/unittest_librbd-test_mock_InvalidateRequest.obj: test/librbd/object_map/test_mock_InvalidateRequest.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_librbd_CXXFLAGS) $(CXXFLAGS) -MT test/librbd/object_map/unittest_librbd-test_mock_InvalidateRequest.obj -MD -MP -MF test/librbd/object_map/$(DEPDIR)/unittest_librbd-test_mock_InvalidateRequest.Tpo -c -o test/librbd/object_map/unittest_librbd-test_mock_InvalidateRequest.obj `if test -f 'test/librbd/object_map/test_mock_InvalidateRequest.cc'; then $(CYGPATH_W) 'test/librbd/ [...]
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/librbd/object_map/$(DEPDIR)/unittest_librbd-test_mock_InvalidateRequest.Tpo test/librbd/object_map/$(DEPDIR)/unittest_librbd-test_mock_InvalidateRequest.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/librbd/object_map/test_mock_InvalidateRequest.cc' object='test/librbd/object_map/unittest_librbd-test_mock_InvalidateRequest.obj' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_librbd_CXXFLAGS) $(CXXFLAGS) -c -o test/librbd/object_map/unittest_librbd-test_mock_InvalidateRequest.obj `if test -f 'test/librbd/object_map/test_mock_InvalidateRequest.cc'; then $(CYGPATH_W) 'test/librbd/object_map/test_mock_InvalidateRequest.cc'; else $(CYGPATH_W) '$(srcdir)/test/librbd/object_map/test_mock_InvalidateRequest.cc'; fi`
+
+test/librbd/object_map/unittest_librbd-test_mock_LockRequest.o: test/librbd/object_map/test_mock_LockRequest.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_librbd_CXXFLAGS) $(CXXFLAGS) -MT test/librbd/object_map/unittest_librbd-test_mock_LockRequest.o -MD -MP -MF test/librbd/object_map/$(DEPDIR)/unittest_librbd-test_mock_LockRequest.Tpo -c -o test/librbd/object_map/unittest_librbd-test_mock_LockRequest.o `test -f 'test/librbd/object_map/test_mock_LockRequest.cc' || echo '$(srcdir)/'`test/librbd/object_map/test_mock_LockRequest.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/librbd/object_map/$(DEPDIR)/unittest_librbd-test_mock_LockRequest.Tpo test/librbd/object_map/$(DEPDIR)/unittest_librbd-test_mock_LockRequest.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/librbd/object_map/test_mock_LockRequest.cc' object='test/librbd/object_map/unittest_librbd-test_mock_LockRequest.o' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_librbd_CXXFLAGS) $(CXXFLAGS) -c -o test/librbd/object_map/unittest_librbd-test_mock_LockRequest.o `test -f 'test/librbd/object_map/test_mock_LockRequest.cc' || echo '$(srcdir)/'`test/librbd/object_map/test_mock_LockRequest.cc
+
+test/librbd/object_map/unittest_librbd-test_mock_LockRequest.obj: test/librbd/object_map/test_mock_LockRequest.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_librbd_CXXFLAGS) $(CXXFLAGS) -MT test/librbd/object_map/unittest_librbd-test_mock_LockRequest.obj -MD -MP -MF test/librbd/object_map/$(DEPDIR)/unittest_librbd-test_mock_LockRequest.Tpo -c -o test/librbd/object_map/unittest_librbd-test_mock_LockRequest.obj `if test -f 'test/librbd/object_map/test_mock_LockRequest.cc'; then $(CYGPATH_W) 'test/librbd/object_map/test_mock_Loc [...]
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/librbd/object_map/$(DEPDIR)/unittest_librbd-test_mock_LockRequest.Tpo test/librbd/object_map/$(DEPDIR)/unittest_librbd-test_mock_LockRequest.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/librbd/object_map/test_mock_LockRequest.cc' object='test/librbd/object_map/unittest_librbd-test_mock_LockRequest.obj' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_librbd_CXXFLAGS) $(CXXFLAGS) -c -o test/librbd/object_map/unittest_librbd-test_mock_LockRequest.obj `if test -f 'test/librbd/object_map/test_mock_LockRequest.cc'; then $(CYGPATH_W) 'test/librbd/object_map/test_mock_LockRequest.cc'; else $(CYGPATH_W) '$(srcdir)/test/librbd/object_map/test_mock_LockRequest.cc'; fi`
+
+test/librbd/object_map/unittest_librbd-test_mock_RefreshRequest.o: test/librbd/object_map/test_mock_RefreshRequest.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_librbd_CXXFLAGS) $(CXXFLAGS) -MT test/librbd/object_map/unittest_librbd-test_mock_RefreshRequest.o -MD -MP -MF test/librbd/object_map/$(DEPDIR)/unittest_librbd-test_mock_RefreshRequest.Tpo -c -o test/librbd/object_map/unittest_librbd-test_mock_RefreshRequest.o `test -f 'test/librbd/object_map/test_mock_RefreshRequest.cc' || echo '$(srcdir)/'`test/librbd/object_map/test_mo [...]
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/librbd/object_map/$(DEPDIR)/unittest_librbd-test_mock_RefreshRequest.Tpo test/librbd/object_map/$(DEPDIR)/unittest_librbd-test_mock_RefreshRequest.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/librbd/object_map/test_mock_RefreshRequest.cc' object='test/librbd/object_map/unittest_librbd-test_mock_RefreshRequest.o' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_librbd_CXXFLAGS) $(CXXFLAGS) -c -o test/librbd/object_map/unittest_librbd-test_mock_RefreshRequest.o `test -f 'test/librbd/object_map/test_mock_RefreshRequest.cc' || echo '$(srcdir)/'`test/librbd/object_map/test_mock_RefreshRequest.cc
+
+test/librbd/object_map/unittest_librbd-test_mock_RefreshRequest.obj: test/librbd/object_map/test_mock_RefreshRequest.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_librbd_CXXFLAGS) $(CXXFLAGS) -MT test/librbd/object_map/unittest_librbd-test_mock_RefreshRequest.obj -MD -MP -MF test/librbd/object_map/$(DEPDIR)/unittest_librbd-test_mock_RefreshRequest.Tpo -c -o test/librbd/object_map/unittest_librbd-test_mock_RefreshRequest.obj `if test -f 'test/librbd/object_map/test_mock_RefreshRequest.cc'; then $(CYGPATH_W) 'test/librbd/object_map/t [...]
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/librbd/object_map/$(DEPDIR)/unittest_librbd-test_mock_RefreshRequest.Tpo test/librbd/object_map/$(DEPDIR)/unittest_librbd-test_mock_RefreshRequest.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/librbd/object_map/test_mock_RefreshRequest.cc' object='test/librbd/object_map/unittest_librbd-test_mock_RefreshRequest.obj' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_librbd_CXXFLAGS) $(CXXFLAGS) -c -o test/librbd/object_map/unittest_librbd-test_mock_RefreshRequest.obj `if test -f 'test/librbd/object_map/test_mock_RefreshRequest.cc'; then $(CYGPATH_W) 'test/librbd/object_map/test_mock_RefreshRequest.cc'; else $(CYGPATH_W) '$(srcdir)/test/librbd/object_map/test_mock_RefreshRequest.cc'; fi`
+
+test/librbd/object_map/unittest_librbd-test_mock_ResizeRequest.o: test/librbd/object_map/test_mock_ResizeRequest.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_librbd_CXXFLAGS) $(CXXFLAGS) -MT test/librbd/object_map/unittest_librbd-test_mock_ResizeRequest.o -MD -MP -MF test/librbd/object_map/$(DEPDIR)/unittest_librbd-test_mock_ResizeRequest.Tpo -c -o test/librbd/object_map/unittest_librbd-test_mock_ResizeRequest.o `test -f 'test/librbd/object_map/test_mock_ResizeRequest.cc' || echo '$(srcdir)/'`test/librbd/object_map/test_mock_R [...]
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/librbd/object_map/$(DEPDIR)/unittest_librbd-test_mock_ResizeRequest.Tpo test/librbd/object_map/$(DEPDIR)/unittest_librbd-test_mock_ResizeRequest.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/librbd/object_map/test_mock_ResizeRequest.cc' object='test/librbd/object_map/unittest_librbd-test_mock_ResizeRequest.o' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_librbd_CXXFLAGS) $(CXXFLAGS) -c -o test/librbd/object_map/unittest_librbd-test_mock_ResizeRequest.o `test -f 'test/librbd/object_map/test_mock_ResizeRequest.cc' || echo '$(srcdir)/'`test/librbd/object_map/test_mock_ResizeRequest.cc
+
+test/librbd/object_map/unittest_librbd-test_mock_ResizeRequest.obj: test/librbd/object_map/test_mock_ResizeRequest.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_librbd_CXXFLAGS) $(CXXFLAGS) -MT test/librbd/object_map/unittest_librbd-test_mock_ResizeRequest.obj -MD -MP -MF test/librbd/object_map/$(DEPDIR)/unittest_librbd-test_mock_ResizeRequest.Tpo -c -o test/librbd/object_map/unittest_librbd-test_mock_ResizeRequest.obj `if test -f 'test/librbd/object_map/test_mock_ResizeRequest.cc'; then $(CYGPATH_W) 'test/librbd/object_map/test_ [...]
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/librbd/object_map/$(DEPDIR)/unittest_librbd-test_mock_ResizeRequest.Tpo test/librbd/object_map/$(DEPDIR)/unittest_librbd-test_mock_ResizeRequest.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/librbd/object_map/test_mock_ResizeRequest.cc' object='test/librbd/object_map/unittest_librbd-test_mock_ResizeRequest.obj' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_librbd_CXXFLAGS) $(CXXFLAGS) -c -o test/librbd/object_map/unittest_librbd-test_mock_ResizeRequest.obj `if test -f 'test/librbd/object_map/test_mock_ResizeRequest.cc'; then $(CYGPATH_W) 'test/librbd/object_map/test_mock_ResizeRequest.cc'; else $(CYGPATH_W) '$(srcdir)/test/librbd/object_map/test_mock_ResizeRequest.cc'; fi`
+
+test/librbd/object_map/unittest_librbd-test_mock_SnapshotCreateRequest.o: test/librbd/object_map/test_mock_SnapshotCreateRequest.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_librbd_CXXFLAGS) $(CXXFLAGS) -MT test/librbd/object_map/unittest_librbd-test_mock_SnapshotCreateRequest.o -MD -MP -MF test/librbd/object_map/$(DEPDIR)/unittest_librbd-test_mock_SnapshotCreateRequest.Tpo -c -o test/librbd/object_map/unittest_librbd-test_mock_SnapshotCreateRequest.o `test -f 'test/librbd/object_map/test_mock_SnapshotCreateRequest.cc' || echo '$(srcdir)/'`te [...]
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/librbd/object_map/$(DEPDIR)/unittest_librbd-test_mock_SnapshotCreateRequest.Tpo test/librbd/object_map/$(DEPDIR)/unittest_librbd-test_mock_SnapshotCreateRequest.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/librbd/object_map/test_mock_SnapshotCreateRequest.cc' object='test/librbd/object_map/unittest_librbd-test_mock_SnapshotCreateRequest.o' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_librbd_CXXFLAGS) $(CXXFLAGS) -c -o test/librbd/object_map/unittest_librbd-test_mock_SnapshotCreateRequest.o `test -f 'test/librbd/object_map/test_mock_SnapshotCreateRequest.cc' || echo '$(srcdir)/'`test/librbd/object_map/test_mock_SnapshotCreateRequest.cc
+
+test/librbd/object_map/unittest_librbd-test_mock_SnapshotCreateRequest.obj: test/librbd/object_map/test_mock_SnapshotCreateRequest.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_librbd_CXXFLAGS) $(CXXFLAGS) -MT test/librbd/object_map/unittest_librbd-test_mock_SnapshotCreateRequest.obj -MD -MP -MF test/librbd/object_map/$(DEPDIR)/unittest_librbd-test_mock_SnapshotCreateRequest.Tpo -c -o test/librbd/object_map/unittest_librbd-test_mock_SnapshotCreateRequest.obj `if test -f 'test/librbd/object_map/test_mock_SnapshotCreateRequest.cc'; then $(CYGPATH_ [...]
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/librbd/object_map/$(DEPDIR)/unittest_librbd-test_mock_SnapshotCreateRequest.Tpo test/librbd/object_map/$(DEPDIR)/unittest_librbd-test_mock_SnapshotCreateRequest.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/librbd/object_map/test_mock_SnapshotCreateRequest.cc' object='test/librbd/object_map/unittest_librbd-test_mock_SnapshotCreateRequest.obj' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_librbd_CXXFLAGS) $(CXXFLAGS) -c -o test/librbd/object_map/unittest_librbd-test_mock_SnapshotCreateRequest.obj `if test -f 'test/librbd/object_map/test_mock_SnapshotCreateRequest.cc'; then $(CYGPATH_W) 'test/librbd/object_map/test_mock_SnapshotCreateRequest.cc'; else $(CYGPATH_W) '$(srcdir)/test/librbd/object_map/test_mock_SnapshotCreateRequest.cc'; fi`
+
+test/librbd/object_map/unittest_librbd-test_mock_SnapshotRemoveRequest.o: test/librbd/object_map/test_mock_SnapshotRemoveRequest.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_librbd_CXXFLAGS) $(CXXFLAGS) -MT test/librbd/object_map/unittest_librbd-test_mock_SnapshotRemoveRequest.o -MD -MP -MF test/librbd/object_map/$(DEPDIR)/unittest_librbd-test_mock_SnapshotRemoveRequest.Tpo -c -o test/librbd/object_map/unittest_librbd-test_mock_SnapshotRemoveRequest.o `test -f 'test/librbd/object_map/test_mock_SnapshotRemoveRequest.cc' || echo '$(srcdir)/'`te [...]
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/librbd/object_map/$(DEPDIR)/unittest_librbd-test_mock_SnapshotRemoveRequest.Tpo test/librbd/object_map/$(DEPDIR)/unittest_librbd-test_mock_SnapshotRemoveRequest.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/librbd/object_map/test_mock_SnapshotRemoveRequest.cc' object='test/librbd/object_map/unittest_librbd-test_mock_SnapshotRemoveRequest.o' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_librbd_CXXFLAGS) $(CXXFLAGS) -c -o test/librbd/object_map/unittest_librbd-test_mock_SnapshotRemoveRequest.o `test -f 'test/librbd/object_map/test_mock_SnapshotRemoveRequest.cc' || echo '$(srcdir)/'`test/librbd/object_map/test_mock_SnapshotRemoveRequest.cc
+
+test/librbd/object_map/unittest_librbd-test_mock_SnapshotRemoveRequest.obj: test/librbd/object_map/test_mock_SnapshotRemoveRequest.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_librbd_CXXFLAGS) $(CXXFLAGS) -MT test/librbd/object_map/unittest_librbd-test_mock_SnapshotRemoveRequest.obj -MD -MP -MF test/librbd/object_map/$(DEPDIR)/unittest_librbd-test_mock_SnapshotRemoveRequest.Tpo -c -o test/librbd/object_map/unittest_librbd-test_mock_SnapshotRemoveRequest.obj `if test -f 'test/librbd/object_map/test_mock_SnapshotRemoveRequest.cc'; then $(CYGPATH_ [...]
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/librbd/object_map/$(DEPDIR)/unittest_librbd-test_mock_SnapshotRemoveRequest.Tpo test/librbd/object_map/$(DEPDIR)/unittest_librbd-test_mock_SnapshotRemoveRequest.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/librbd/object_map/test_mock_SnapshotRemoveRequest.cc' object='test/librbd/object_map/unittest_librbd-test_mock_SnapshotRemoveRequest.obj' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_librbd_CXXFLAGS) $(CXXFLAGS) -c -o test/librbd/object_map/unittest_librbd-test_mock_SnapshotRemoveRequest.obj `if test -f 'test/librbd/object_map/test_mock_SnapshotRemoveRequest.cc'; then $(CYGPATH_W) 'test/librbd/object_map/test_mock_SnapshotRemoveRequest.cc'; else $(CYGPATH_W) '$(srcdir)/test/librbd/object_map/test_mock_SnapshotRemoveRequest.cc'; fi`
+
+test/librbd/object_map/unittest_librbd-test_mock_SnapshotRollbackRequest.o: test/librbd/object_map/test_mock_SnapshotRollbackRequest.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_librbd_CXXFLAGS) $(CXXFLAGS) -MT test/librbd/object_map/unittest_librbd-test_mock_SnapshotRollbackRequest.o -MD -MP -MF test/librbd/object_map/$(DEPDIR)/unittest_librbd-test_mock_SnapshotRollbackRequest.Tpo -c -o test/librbd/object_map/unittest_librbd-test_mock_SnapshotRollbackRequest.o `test -f 'test/librbd/object_map/test_mock_SnapshotRollbackRequest.cc' || echo '$(srcd [...]
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/librbd/object_map/$(DEPDIR)/unittest_librbd-test_mock_SnapshotRollbackRequest.Tpo test/librbd/object_map/$(DEPDIR)/unittest_librbd-test_mock_SnapshotRollbackRequest.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/librbd/object_map/test_mock_SnapshotRollbackRequest.cc' object='test/librbd/object_map/unittest_librbd-test_mock_SnapshotRollbackRequest.o' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_librbd_CXXFLAGS) $(CXXFLAGS) -c -o test/librbd/object_map/unittest_librbd-test_mock_SnapshotRollbackRequest.o `test -f 'test/librbd/object_map/test_mock_SnapshotRollbackRequest.cc' || echo '$(srcdir)/'`test/librbd/object_map/test_mock_SnapshotRollbackRequest.cc
+
+test/librbd/object_map/unittest_librbd-test_mock_SnapshotRollbackRequest.obj: test/librbd/object_map/test_mock_SnapshotRollbackRequest.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_librbd_CXXFLAGS) $(CXXFLAGS) -MT test/librbd/object_map/unittest_librbd-test_mock_SnapshotRollbackRequest.obj -MD -MP -MF test/librbd/object_map/$(DEPDIR)/unittest_librbd-test_mock_SnapshotRollbackRequest.Tpo -c -o test/librbd/object_map/unittest_librbd-test_mock_SnapshotRollbackRequest.obj `if test -f 'test/librbd/object_map/test_mock_SnapshotRollbackRequest.cc'; then $( [...]
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/librbd/object_map/$(DEPDIR)/unittest_librbd-test_mock_SnapshotRollbackRequest.Tpo test/librbd/object_map/$(DEPDIR)/unittest_librbd-test_mock_SnapshotRollbackRequest.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/librbd/object_map/test_mock_SnapshotRollbackRequest.cc' object='test/librbd/object_map/unittest_librbd-test_mock_SnapshotRollbackRequest.obj' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_librbd_CXXFLAGS) $(CXXFLAGS) -c -o test/librbd/object_map/unittest_librbd-test_mock_SnapshotRollbackRequest.obj `if test -f 'test/librbd/object_map/test_mock_SnapshotRollbackRequest.cc'; then $(CYGPATH_W) 'test/librbd/object_map/test_mock_SnapshotRollbackRequest.cc'; else $(CYGPATH_W) '$(srcdir)/test/librbd/object_map/test_mock_SnapshotRollbackRequest.cc'; fi`
+
+test/librbd/object_map/unittest_librbd-test_mock_UnlockRequest.o: test/librbd/object_map/test_mock_UnlockRequest.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_librbd_CXXFLAGS) $(CXXFLAGS) -MT test/librbd/object_map/unittest_librbd-test_mock_UnlockRequest.o -MD -MP -MF test/librbd/object_map/$(DEPDIR)/unittest_librbd-test_mock_UnlockRequest.Tpo -c -o test/librbd/object_map/unittest_librbd-test_mock_UnlockRequest.o `test -f 'test/librbd/object_map/test_mock_UnlockRequest.cc' || echo '$(srcdir)/'`test/librbd/object_map/test_mock_U [...]
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/librbd/object_map/$(DEPDIR)/unittest_librbd-test_mock_UnlockRequest.Tpo test/librbd/object_map/$(DEPDIR)/unittest_librbd-test_mock_UnlockRequest.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/librbd/object_map/test_mock_UnlockRequest.cc' object='test/librbd/object_map/unittest_librbd-test_mock_UnlockRequest.o' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_librbd_CXXFLAGS) $(CXXFLAGS) -c -o test/librbd/object_map/unittest_librbd-test_mock_UnlockRequest.o `test -f 'test/librbd/object_map/test_mock_UnlockRequest.cc' || echo '$(srcdir)/'`test/librbd/object_map/test_mock_UnlockRequest.cc
+
+test/librbd/object_map/unittest_librbd-test_mock_UnlockRequest.obj: test/librbd/object_map/test_mock_UnlockRequest.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_librbd_CXXFLAGS) $(CXXFLAGS) -MT test/librbd/object_map/unittest_librbd-test_mock_UnlockRequest.obj -MD -MP -MF test/librbd/object_map/$(DEPDIR)/unittest_librbd-test_mock_UnlockRequest.Tpo -c -o test/librbd/object_map/unittest_librbd-test_mock_UnlockRequest.obj `if test -f 'test/librbd/object_map/test_mock_UnlockRequest.cc'; then $(CYGPATH_W) 'test/librbd/object_map/test_ [...]
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/librbd/object_map/$(DEPDIR)/unittest_librbd-test_mock_UnlockRequest.Tpo test/librbd/object_map/$(DEPDIR)/unittest_librbd-test_mock_UnlockRequest.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/librbd/object_map/test_mock_UnlockRequest.cc' object='test/librbd/object_map/unittest_librbd-test_mock_UnlockRequest.obj' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_librbd_CXXFLAGS) $(CXXFLAGS) -c -o test/librbd/object_map/unittest_librbd-test_mock_UnlockRequest.obj `if test -f 'test/librbd/object_map/test_mock_UnlockRequest.cc'; then $(CYGPATH_W) 'test/librbd/object_map/test_mock_UnlockRequest.cc'; else $(CYGPATH_W) '$(srcdir)/test/librbd/object_map/test_mock_UnlockRequest.cc'; fi`
+
+test/librbd/object_map/unittest_librbd-test_mock_UpdateRequest.o: test/librbd/object_map/test_mock_UpdateRequest.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_librbd_CXXFLAGS) $(CXXFLAGS) -MT test/librbd/object_map/unittest_librbd-test_mock_UpdateRequest.o -MD -MP -MF test/librbd/object_map/$(DEPDIR)/unittest_librbd-test_mock_UpdateRequest.Tpo -c -o test/librbd/object_map/unittest_librbd-test_mock_UpdateRequest.o `test -f 'test/librbd/object_map/test_mock_UpdateRequest.cc' || echo '$(srcdir)/'`test/librbd/object_map/test_mock_U [...]
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/librbd/object_map/$(DEPDIR)/unittest_librbd-test_mock_UpdateRequest.Tpo test/librbd/object_map/$(DEPDIR)/unittest_librbd-test_mock_UpdateRequest.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/librbd/object_map/test_mock_UpdateRequest.cc' object='test/librbd/object_map/unittest_librbd-test_mock_UpdateRequest.o' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_librbd_CXXFLAGS) $(CXXFLAGS) -c -o test/librbd/object_map/unittest_librbd-test_mock_UpdateRequest.o `test -f 'test/librbd/object_map/test_mock_UpdateRequest.cc' || echo '$(srcdir)/'`test/librbd/object_map/test_mock_UpdateRequest.cc
+
+test/librbd/object_map/unittest_librbd-test_mock_UpdateRequest.obj: test/librbd/object_map/test_mock_UpdateRequest.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_librbd_CXXFLAGS) $(CXXFLAGS) -MT test/librbd/object_map/unittest_librbd-test_mock_UpdateRequest.obj -MD -MP -MF test/librbd/object_map/$(DEPDIR)/unittest_librbd-test_mock_UpdateRequest.Tpo -c -o test/librbd/object_map/unittest_librbd-test_mock_UpdateRequest.obj `if test -f 'test/librbd/object_map/test_mock_UpdateRequest.cc'; then $(CYGPATH_W) 'test/librbd/object_map/test_ [...]
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/librbd/object_map/$(DEPDIR)/unittest_librbd-test_mock_UpdateRequest.Tpo test/librbd/object_map/$(DEPDIR)/unittest_librbd-test_mock_UpdateRequest.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/librbd/object_map/test_mock_UpdateRequest.cc' object='test/librbd/object_map/unittest_librbd-test_mock_UpdateRequest.obj' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_librbd_CXXFLAGS) $(CXXFLAGS) -c -o test/librbd/object_map/unittest_librbd-test_mock_UpdateRequest.obj `if test -f 'test/librbd/object_map/test_mock_UpdateRequest.cc'; then $(CYGPATH_W) 'test/librbd/object_map/test_mock_UpdateRequest.cc'; else $(CYGPATH_W) '$(srcdir)/test/librbd/object_map/test_mock_UpdateRequest.cc'; fi`
+
+test/librbd/operation/unittest_librbd-test_mock_SnapshotCreateRequest.o: test/librbd/operation/test_mock_SnapshotCreateRequest.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_librbd_CXXFLAGS) $(CXXFLAGS) -MT test/librbd/operation/unittest_librbd-test_mock_SnapshotCreateRequest.o -MD -MP -MF test/librbd/operation/$(DEPDIR)/unittest_librbd-test_mock_SnapshotCreateRequest.Tpo -c -o test/librbd/operation/unittest_librbd-test_mock_SnapshotCreateRequest.o `test -f 'test/librbd/operation/test_mock_SnapshotCreateRequest.cc' || echo '$(srcdir)/'`test/l [...]
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/librbd/operation/$(DEPDIR)/unittest_librbd-test_mock_SnapshotCreateRequest.Tpo test/librbd/operation/$(DEPDIR)/unittest_librbd-test_mock_SnapshotCreateRequest.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/librbd/operation/test_mock_SnapshotCreateRequest.cc' object='test/librbd/operation/unittest_librbd-test_mock_SnapshotCreateRequest.o' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_librbd_CXXFLAGS) $(CXXFLAGS) -c -o test/librbd/operation/unittest_librbd-test_mock_SnapshotCreateRequest.o `test -f 'test/librbd/operation/test_mock_SnapshotCreateRequest.cc' || echo '$(srcdir)/'`test/librbd/operation/test_mock_SnapshotCreateRequest.cc
+
+test/librbd/operation/unittest_librbd-test_mock_SnapshotCreateRequest.obj: test/librbd/operation/test_mock_SnapshotCreateRequest.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_librbd_CXXFLAGS) $(CXXFLAGS) -MT test/librbd/operation/unittest_librbd-test_mock_SnapshotCreateRequest.obj -MD -MP -MF test/librbd/operation/$(DEPDIR)/unittest_librbd-test_mock_SnapshotCreateRequest.Tpo -c -o test/librbd/operation/unittest_librbd-test_mock_SnapshotCreateRequest.obj `if test -f 'test/librbd/operation/test_mock_SnapshotCreateRequest.cc'; then $(CYGPATH_W) ' [...]
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/librbd/operation/$(DEPDIR)/unittest_librbd-test_mock_SnapshotCreateRequest.Tpo test/librbd/operation/$(DEPDIR)/unittest_librbd-test_mock_SnapshotCreateRequest.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/librbd/operation/test_mock_SnapshotCreateRequest.cc' object='test/librbd/operation/unittest_librbd-test_mock_SnapshotCreateRequest.obj' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_librbd_CXXFLAGS) $(CXXFLAGS) -c -o test/librbd/operation/unittest_librbd-test_mock_SnapshotCreateRequest.obj `if test -f 'test/librbd/operation/test_mock_SnapshotCreateRequest.cc'; then $(CYGPATH_W) 'test/librbd/operation/test_mock_SnapshotCreateRequest.cc'; else $(CYGPATH_W) '$(srcdir)/test/librbd/operation/test_mock_SnapshotCreateRequest.cc'; fi`
+
+test/librbd/operation/unittest_librbd-test_mock_SnapshotProtectRequest.o: test/librbd/operation/test_mock_SnapshotProtectRequest.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_librbd_CXXFLAGS) $(CXXFLAGS) -MT test/librbd/operation/unittest_librbd-test_mock_SnapshotProtectRequest.o -MD -MP -MF test/librbd/operation/$(DEPDIR)/unittest_librbd-test_mock_SnapshotProtectRequest.Tpo -c -o test/librbd/operation/unittest_librbd-test_mock_SnapshotProtectRequest.o `test -f 'test/librbd/operation/test_mock_SnapshotProtectRequest.cc' || echo '$(srcdir)/'`te [...]
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/librbd/operation/$(DEPDIR)/unittest_librbd-test_mock_SnapshotProtectRequest.Tpo test/librbd/operation/$(DEPDIR)/unittest_librbd-test_mock_SnapshotProtectRequest.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/librbd/operation/test_mock_SnapshotProtectRequest.cc' object='test/librbd/operation/unittest_librbd-test_mock_SnapshotProtectRequest.o' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_librbd_CXXFLAGS) $(CXXFLAGS) -c -o test/librbd/operation/unittest_librbd-test_mock_SnapshotProtectRequest.o `test -f 'test/librbd/operation/test_mock_SnapshotProtectRequest.cc' || echo '$(srcdir)/'`test/librbd/operation/test_mock_SnapshotProtectRequest.cc
+
+test/librbd/operation/unittest_librbd-test_mock_SnapshotProtectRequest.obj: test/librbd/operation/test_mock_SnapshotProtectRequest.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_librbd_CXXFLAGS) $(CXXFLAGS) -MT test/librbd/operation/unittest_librbd-test_mock_SnapshotProtectRequest.obj -MD -MP -MF test/librbd/operation/$(DEPDIR)/unittest_librbd-test_mock_SnapshotProtectRequest.Tpo -c -o test/librbd/operation/unittest_librbd-test_mock_SnapshotProtectRequest.obj `if test -f 'test/librbd/operation/test_mock_SnapshotProtectRequest.cc'; then $(CYGPATH_ [...]
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/librbd/operation/$(DEPDIR)/unittest_librbd-test_mock_SnapshotProtectRequest.Tpo test/librbd/operation/$(DEPDIR)/unittest_librbd-test_mock_SnapshotProtectRequest.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/librbd/operation/test_mock_SnapshotProtectRequest.cc' object='test/librbd/operation/unittest_librbd-test_mock_SnapshotProtectRequest.obj' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_librbd_CXXFLAGS) $(CXXFLAGS) -c -o test/librbd/operation/unittest_librbd-test_mock_SnapshotProtectRequest.obj `if test -f 'test/librbd/operation/test_mock_SnapshotProtectRequest.cc'; then $(CYGPATH_W) 'test/librbd/operation/test_mock_SnapshotProtectRequest.cc'; else $(CYGPATH_W) '$(srcdir)/test/librbd/operation/test_mock_SnapshotProtectRequest.cc'; fi`
+
+test/librbd/operation/unittest_librbd-test_mock_SnapshotRemoveRequest.o: test/librbd/operation/test_mock_SnapshotRemoveRequest.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_librbd_CXXFLAGS) $(CXXFLAGS) -MT test/librbd/operation/unittest_librbd-test_mock_SnapshotRemoveRequest.o -MD -MP -MF test/librbd/operation/$(DEPDIR)/unittest_librbd-test_mock_SnapshotRemoveRequest.Tpo -c -o test/librbd/operation/unittest_librbd-test_mock_SnapshotRemoveRequest.o `test -f 'test/librbd/operation/test_mock_SnapshotRemoveRequest.cc' || echo '$(srcdir)/'`test/l [...]
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/librbd/operation/$(DEPDIR)/unittest_librbd-test_mock_SnapshotRemoveRequest.Tpo test/librbd/operation/$(DEPDIR)/unittest_librbd-test_mock_SnapshotRemoveRequest.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/librbd/operation/test_mock_SnapshotRemoveRequest.cc' object='test/librbd/operation/unittest_librbd-test_mock_SnapshotRemoveRequest.o' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_librbd_CXXFLAGS) $(CXXFLAGS) -c -o test/librbd/operation/unittest_librbd-test_mock_SnapshotRemoveRequest.o `test -f 'test/librbd/operation/test_mock_SnapshotRemoveRequest.cc' || echo '$(srcdir)/'`test/librbd/operation/test_mock_SnapshotRemoveRequest.cc
+
+test/librbd/operation/unittest_librbd-test_mock_SnapshotRemoveRequest.obj: test/librbd/operation/test_mock_SnapshotRemoveRequest.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_librbd_CXXFLAGS) $(CXXFLAGS) -MT test/librbd/operation/unittest_librbd-test_mock_SnapshotRemoveRequest.obj -MD -MP -MF test/librbd/operation/$(DEPDIR)/unittest_librbd-test_mock_SnapshotRemoveRequest.Tpo -c -o test/librbd/operation/unittest_librbd-test_mock_SnapshotRemoveRequest.obj `if test -f 'test/librbd/operation/test_mock_SnapshotRemoveRequest.cc'; then $(CYGPATH_W) ' [...]
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/librbd/operation/$(DEPDIR)/unittest_librbd-test_mock_SnapshotRemoveRequest.Tpo test/librbd/operation/$(DEPDIR)/unittest_librbd-test_mock_SnapshotRemoveRequest.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/librbd/operation/test_mock_SnapshotRemoveRequest.cc' object='test/librbd/operation/unittest_librbd-test_mock_SnapshotRemoveRequest.obj' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_librbd_CXXFLAGS) $(CXXFLAGS) -c -o test/librbd/operation/unittest_librbd-test_mock_SnapshotRemoveRequest.obj `if test -f 'test/librbd/operation/test_mock_SnapshotRemoveRequest.cc'; then $(CYGPATH_W) 'test/librbd/operation/test_mock_SnapshotRemoveRequest.cc'; else $(CYGPATH_W) '$(srcdir)/test/librbd/operation/test_mock_SnapshotRemoveRequest.cc'; fi`
+
+test/librbd/operation/unittest_librbd-test_mock_SnapshotUnprotectRequest.o: test/librbd/operation/test_mock_SnapshotUnprotectRequest.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_librbd_CXXFLAGS) $(CXXFLAGS) -MT test/librbd/operation/unittest_librbd-test_mock_SnapshotUnprotectRequest.o -MD -MP -MF test/librbd/operation/$(DEPDIR)/unittest_librbd-test_mock_SnapshotUnprotectRequest.Tpo -c -o test/librbd/operation/unittest_librbd-test_mock_SnapshotUnprotectRequest.o `test -f 'test/librbd/operation/test_mock_SnapshotUnprotectRequest.cc' || echo '$(srcd [...]
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/librbd/operation/$(DEPDIR)/unittest_librbd-test_mock_SnapshotUnprotectRequest.Tpo test/librbd/operation/$(DEPDIR)/unittest_librbd-test_mock_SnapshotUnprotectRequest.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/librbd/operation/test_mock_SnapshotUnprotectRequest.cc' object='test/librbd/operation/unittest_librbd-test_mock_SnapshotUnprotectRequest.o' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_librbd_CXXFLAGS) $(CXXFLAGS) -c -o test/librbd/operation/unittest_librbd-test_mock_SnapshotUnprotectRequest.o `test -f 'test/librbd/operation/test_mock_SnapshotUnprotectRequest.cc' || echo '$(srcdir)/'`test/librbd/operation/test_mock_SnapshotUnprotectRequest.cc
+
+test/librbd/operation/unittest_librbd-test_mock_SnapshotUnprotectRequest.obj: test/librbd/operation/test_mock_SnapshotUnprotectRequest.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_librbd_CXXFLAGS) $(CXXFLAGS) -MT test/librbd/operation/unittest_librbd-test_mock_SnapshotUnprotectRequest.obj -MD -MP -MF test/librbd/operation/$(DEPDIR)/unittest_librbd-test_mock_SnapshotUnprotectRequest.Tpo -c -o test/librbd/operation/unittest_librbd-test_mock_SnapshotUnprotectRequest.obj `if test -f 'test/librbd/operation/test_mock_SnapshotUnprotectRequest.cc'; then $( [...]
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/librbd/operation/$(DEPDIR)/unittest_librbd-test_mock_SnapshotUnprotectRequest.Tpo test/librbd/operation/$(DEPDIR)/unittest_librbd-test_mock_SnapshotUnprotectRequest.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/librbd/operation/test_mock_SnapshotUnprotectRequest.cc' object='test/librbd/operation/unittest_librbd-test_mock_SnapshotUnprotectRequest.obj' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_librbd_CXXFLAGS) $(CXXFLAGS) -c -o test/librbd/operation/unittest_librbd-test_mock_SnapshotUnprotectRequest.obj `if test -f 'test/librbd/operation/test_mock_SnapshotUnprotectRequest.cc'; then $(CYGPATH_W) 'test/librbd/operation/test_mock_SnapshotUnprotectRequest.cc'; else $(CYGPATH_W) '$(srcdir)/test/librbd/operation/test_mock_SnapshotUnprotectRequest.cc'; fi`
+
 log/unittest_log-test.o: log/test.cc
 @am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_log_CXXFLAGS) $(CXXFLAGS) -MT log/unittest_log-test.o -MD -MP -MF log/$(DEPDIR)/unittest_log-test.Tpo -c -o log/unittest_log-test.o `test -f 'log/test.cc' || echo '$(srcdir)/'`log/test.cc
 @am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) log/$(DEPDIR)/unittest_log-test.Tpo log/$(DEPDIR)/unittest_log-test.Po
@@ -25153,6 +25963,10 @@ clean-libtool:
 	-rm -rf librados/.libs librados/_libs
 	-rm -rf libradosstriper/.libs libradosstriper/_libs
 	-rm -rf librbd/.libs librbd/_libs
+	-rm -rf librbd/exclusive_lock/.libs librbd/exclusive_lock/_libs
+	-rm -rf librbd/image/.libs librbd/image/_libs
+	-rm -rf librbd/object_map/.libs librbd/object_map/_libs
+	-rm -rf librbd/operation/.libs librbd/operation/_libs
 	-rm -rf log/.libs log/_libs
 	-rm -rf mds/.libs mds/_libs
 	-rm -rf mon/.libs mon/_libs
@@ -26643,6 +27457,14 @@ distclean-generic:
 	-rm -f libradosstriper/$(am__dirstamp)
 	-rm -f librbd/$(DEPDIR)/$(am__dirstamp)
 	-rm -f librbd/$(am__dirstamp)
+	-rm -f librbd/exclusive_lock/$(DEPDIR)/$(am__dirstamp)
+	-rm -f librbd/exclusive_lock/$(am__dirstamp)
+	-rm -f librbd/image/$(DEPDIR)/$(am__dirstamp)
+	-rm -f librbd/image/$(am__dirstamp)
+	-rm -f librbd/object_map/$(DEPDIR)/$(am__dirstamp)
+	-rm -f librbd/object_map/$(am__dirstamp)
+	-rm -f librbd/operation/$(DEPDIR)/$(am__dirstamp)
+	-rm -f librbd/operation/$(am__dirstamp)
 	-rm -f log/$(DEPDIR)/$(am__dirstamp)
 	-rm -f log/$(am__dirstamp)
 	-rm -f mds/$(DEPDIR)/$(am__dirstamp)
@@ -26731,6 +27553,12 @@ distclean-generic:
 	-rm -f test/libradosstriper/$(am__dirstamp)
 	-rm -f test/librbd/$(DEPDIR)/$(am__dirstamp)
 	-rm -f test/librbd/$(am__dirstamp)
+	-rm -f test/librbd/exclusive_lock/$(DEPDIR)/$(am__dirstamp)
+	-rm -f test/librbd/exclusive_lock/$(am__dirstamp)
+	-rm -f test/librbd/object_map/$(DEPDIR)/$(am__dirstamp)
+	-rm -f test/librbd/object_map/$(am__dirstamp)
+	-rm -f test/librbd/operation/$(DEPDIR)/$(am__dirstamp)
+	-rm -f test/librbd/operation/$(am__dirstamp)
 	-rm -f test/mds/$(DEPDIR)/$(am__dirstamp)
 	-rm -f test/mds/$(am__dirstamp)
 	-rm -f test/messenger/$(DEPDIR)/$(am__dirstamp)
@@ -26761,6 +27589,8 @@ distclean-generic:
 	-rm -f tools/rbd/$(am__dirstamp)
 	-rm -f tools/rbd/action/$(DEPDIR)/$(am__dirstamp)
 	-rm -f tools/rbd/action/$(am__dirstamp)
+	-rm -f tools/rbd_nbd/$(DEPDIR)/$(am__dirstamp)
+	-rm -f tools/rbd_nbd/$(am__dirstamp)
 	-rm -f tracing/$(DEPDIR)/$(am__dirstamp)
 	-rm -f tracing/$(am__dirstamp)
 
@@ -26778,7 +27608,7 @@ clean-am: clean-binPROGRAMS clean-checkPROGRAMS \
 	clean-sbinPROGRAMS clean-su_sbinPROGRAMS mostlyclean-am
 
 distclean: distclean-recursive
-	-rm -rf ./$(DEPDIR) arch/$(DEPDIR) auth/$(DEPDIR) auth/cephx/$(DEPDIR) auth/none/$(DEPDIR) auth/unknown/$(DEPDIR) civetweb/src/$(DEPDIR) client/$(DEPDIR) cls/cephfs/$(DEPDIR) cls/hello/$(DEPDIR) cls/journal/$(DEPDIR) cls/lock/$(DEPDIR) cls/log/$(DEPDIR) cls/numops/$(DEPDIR) cls/rbd/$(DEPDIR) cls/refcount/$(DEPDIR) cls/replica_log/$(DEPDIR) cls/rgw/$(DEPDIR) cls/statelog/$(DEPDIR) cls/timeindex/$(DEPDIR) cls/user/$(DEPDIR) cls/version/$(DEPDIR) common/$(DEPDIR) compressor/$(DEPDIR) crush [...]
+	-rm -rf ./$(DEPDIR) arch/$(DEPDIR) auth/$(DEPDIR) auth/cephx/$(DEPDIR) auth/none/$(DEPDIR) auth/unknown/$(DEPDIR) civetweb/src/$(DEPDIR) client/$(DEPDIR) cls/cephfs/$(DEPDIR) cls/hello/$(DEPDIR) cls/journal/$(DEPDIR) cls/lock/$(DEPDIR) cls/log/$(DEPDIR) cls/numops/$(DEPDIR) cls/rbd/$(DEPDIR) cls/refcount/$(DEPDIR) cls/replica_log/$(DEPDIR) cls/rgw/$(DEPDIR) cls/statelog/$(DEPDIR) cls/timeindex/$(DEPDIR) cls/user/$(DEPDIR) cls/version/$(DEPDIR) common/$(DEPDIR) compressor/$(DEPDIR) crush [...]
 	-rm -f Makefile
 distclean-am: clean-am distclean-compile distclean-generic \
 	distclean-hdr distclean-tags
@@ -26811,7 +27641,7 @@ install-dvi-am:
 
 install-exec-am: install-binPROGRAMS install-binSCRIPTS \
 	install-ceph_libexecSCRIPTS install-dist_binSCRIPTS \
-	install-libLTLIBRARIES install-sbinPROGRAMS \
+	install-exec-local install-libLTLIBRARIES install-sbinPROGRAMS \
 	install-sbinSCRIPTS
 
 install-html: install-html-recursive
@@ -26835,7 +27665,7 @@ install-ps-am:
 installcheck-am:
 
 maintainer-clean: maintainer-clean-recursive
-	-rm -rf ./$(DEPDIR) arch/$(DEPDIR) auth/$(DEPDIR) auth/cephx/$(DEPDIR) auth/none/$(DEPDIR) auth/unknown/$(DEPDIR) civetweb/src/$(DEPDIR) client/$(DEPDIR) cls/cephfs/$(DEPDIR) cls/hello/$(DEPDIR) cls/journal/$(DEPDIR) cls/lock/$(DEPDIR) cls/log/$(DEPDIR) cls/numops/$(DEPDIR) cls/rbd/$(DEPDIR) cls/refcount/$(DEPDIR) cls/replica_log/$(DEPDIR) cls/rgw/$(DEPDIR) cls/statelog/$(DEPDIR) cls/timeindex/$(DEPDIR) cls/user/$(DEPDIR) cls/version/$(DEPDIR) common/$(DEPDIR) compressor/$(DEPDIR) crush [...]
+	-rm -rf ./$(DEPDIR) arch/$(DEPDIR) auth/$(DEPDIR) auth/cephx/$(DEPDIR) auth/none/$(DEPDIR) auth/unknown/$(DEPDIR) civetweb/src/$(DEPDIR) client/$(DEPDIR) cls/cephfs/$(DEPDIR) cls/hello/$(DEPDIR) cls/journal/$(DEPDIR) cls/lock/$(DEPDIR) cls/log/$(DEPDIR) cls/numops/$(DEPDIR) cls/rbd/$(DEPDIR) cls/refcount/$(DEPDIR) cls/replica_log/$(DEPDIR) cls/rgw/$(DEPDIR) cls/statelog/$(DEPDIR) cls/timeindex/$(DEPDIR) cls/user/$(DEPDIR) cls/version/$(DEPDIR) common/$(DEPDIR) compressor/$(DEPDIR) crush [...]
 	-rm -f Makefile
 maintainer-clean-am: distclean-am maintainer-clean-generic
 
@@ -26885,14 +27715,14 @@ uninstall-am: uninstall-bash_completionDATA uninstall-binPROGRAMS \
 	install-ceph_sbinSCRIPTS install-data install-data-am \
 	install-data-local install-dist_binSCRIPTS install-docDATA \
 	install-dvi install-dvi-am install-erasure_codelibLTLIBRARIES \
-	install-exec install-exec-am install-html install-html-am \
-	install-info install-info-am install-libLTLIBRARIES \
-	install-libcephfs_includeDATA install-librbd_includeDATA \
-	install-man install-pdf install-pdf-am install-ps \
-	install-ps-am install-pythonPYTHON install-rados_includeDATA \
-	install-radoslibLTLIBRARIES install-radosstriper_includeDATA \
-	install-sbinPROGRAMS install-sbinSCRIPTS \
-	install-shell_commonSCRIPTS install-strip \
+	install-exec install-exec-am install-exec-local install-html \
+	install-html-am install-info install-info-am \
+	install-libLTLIBRARIES install-libcephfs_includeDATA \
+	install-librbd_includeDATA install-man install-pdf \
+	install-pdf-am install-ps install-ps-am install-pythonPYTHON \
+	install-rados_includeDATA install-radoslibLTLIBRARIES \
+	install-radosstriper_includeDATA install-sbinPROGRAMS \
+	install-sbinSCRIPTS install-shell_commonSCRIPTS install-strip \
 	install-su_sbinPROGRAMS install-su_sbinSCRIPTS installcheck \
 	installcheck-am installdirs installdirs-am maintainer-clean \
 	maintainer-clean-generic mostlyclean mostlyclean-compile \
@@ -26923,22 +27753,19 @@ export PYTHONPATH=$(top_srcdir)/src/pybind
 		else
 				HARDENING_CFLAGS += -fstack-protector
 		endif
- at CLANG_FALSE@	AM_COMMON_CFLAGS += -rdynamic
+ at CLANG_FALSE@	AM_COMMON_CFLAGS += ${RDYNAMIC_FLAG}
 @SOLARIS_TRUE@	AM_COMMON_CFLAGS += -Wno-unused-local-typedefs
 @CLANG_FALSE@	AM_CXXFLAGS += -Wstrict-null-sentinel
 
-# solaris harding
- at SOLARIS_TRUE@	AM_CXXFLAGS += -lssp_nonshared
-
 @NO_GIT_VERSION_TRUE at export NO_VERSION="yes"
 
-all-local::
+ceph-detect-init-all:
 	cd $(srcdir)/ceph-detect-init ; python setup.py build
 
-clean-local::
+ceph-detect-init-clean:
 	cd $(srcdir)/ceph-detect-init ; python setup.py clean ; rm -fr wheelhouse .tox build .coverage *.egg-info
 
-install-data-local::
+ceph-detect-init-install-data:
 	cd $(srcdir)/ceph-detect-init ; \
 	if test "$(DESTDIR)" ; then \
 		if lsb_release -si | grep --quiet 'Ubuntu\|Debian\|Devuan' ; then \
@@ -26975,6 +27802,8 @@ erasure-code/shec/ErasureCodePluginSelectShec.cc: ./ceph_ver.h
 
 @WITH_BETTER_YASM_ELF64_TRUE at erasure-code/isa/ErasureCodePluginIsa.cc: ./ceph_ver.h
 erasure-code/ErasureCodePlugin.cc: ./ceph_ver.h
+
+common/PluginRegistry.cc: ./ceph_ver.h
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at test/erasure-code/ErasureCodePluginExample.cc: ./ceph_ver.h
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at test/erasure-code/ErasureCodePluginMissingEntryPoint.cc: ./ceph_ver.h
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at test/erasure-code/ErasureCodePluginHangs.cc: ./ceph_ver.h
@@ -26994,6 +27823,35 @@ unittests:: $(check_PROGRAMS)
 
 @WITH_LTTNG_TRUE at tracing/%.h: tracing/%.tp
 @WITH_LTTNG_TRUE@	$(LTTNG_GEN_TP_PROG) $< -o tracing/$*.h
+
+ at ENABLE_CLIENT_TRUE@@WITH_CYTHON_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at pybind-all: librbd.la ${srcdir}/ceph_ver.h
+ at ENABLE_CLIENT_TRUE@@WITH_CYTHON_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	cd $(srcdir)/pybind; $(PY_DISTUTILS) build \
+ at ENABLE_CLIENT_TRUE@@WITH_CYTHON_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	--build-base $(shell readlink -f $(builddir))/build \
+ at ENABLE_CLIENT_TRUE@@WITH_CYTHON_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	--verbose
+
+ at ENABLE_CLIENT_TRUE@@WITH_CYTHON_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at pybind-clean: ${srcdir}/ceph_ver.h
+ at ENABLE_CLIENT_TRUE@@WITH_CYTHON_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	cd $(srcdir)/pybind; $(PY_DISTUTILS) clean \
+ at ENABLE_CLIENT_TRUE@@WITH_CYTHON_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	--build-base $(shell readlink -f $(builddir))/build \
+ at ENABLE_CLIENT_TRUE@@WITH_CYTHON_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	--verbose
+
+ at ENABLE_CLIENT_TRUE@@WITH_CYTHON_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at pybind-install-exec: ${srcdir}/ceph_ver.h
+ at ENABLE_CLIENT_TRUE@@WITH_CYTHON_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	if test "$(DESTDIR)" ; then \
+ at ENABLE_CLIENT_TRUE@@WITH_CYTHON_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@		if lsb_release -si | grep --quiet 'Ubuntu\|Debian\|Devuan' ; then \
+ at ENABLE_CLIENT_TRUE@@WITH_CYTHON_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@			options=--install-layout=deb ; \
+ at ENABLE_CLIENT_TRUE@@WITH_CYTHON_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@		else \
+ at ENABLE_CLIENT_TRUE@@WITH_CYTHON_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@			options=--prefix=/usr ; \
+ at ENABLE_CLIENT_TRUE@@WITH_CYTHON_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@		fi ; \
+ at ENABLE_CLIENT_TRUE@@WITH_CYTHON_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@		root="--root=$(DESTDIR)" ; \
+ at ENABLE_CLIENT_TRUE@@WITH_CYTHON_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	else \
+ at ENABLE_CLIENT_TRUE@@WITH_CYTHON_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@		options=--prefix=$(prefix) ; \
+ at ENABLE_CLIENT_TRUE@@WITH_CYTHON_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	fi ; \
+ at ENABLE_CLIENT_TRUE@@WITH_CYTHON_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	cd $(srcdir)/pybind; $(PY_DISTUTILS) build \
+ at ENABLE_CLIENT_TRUE@@WITH_CYTHON_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	--build-base $(shell readlink -f $(builddir))/build \
+ at ENABLE_CLIENT_TRUE@@WITH_CYTHON_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	install \
+ at ENABLE_CLIENT_TRUE@@WITH_CYTHON_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$$options $$root \
+ at ENABLE_CLIENT_TRUE@@WITH_CYTHON_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	--single-version-externally-managed \
+ at ENABLE_CLIENT_TRUE@@WITH_CYTHON_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	--record /dev/null \
+ at ENABLE_CLIENT_TRUE@@WITH_CYTHON_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	--verbose
 $(shell_scripts): Makefile
 $(shell_scripts): %: %.in
 	rm -f $@ $@.tmp
@@ -27045,7 +27903,7 @@ dist-hook:
 
 # cleaning
 
-clean-local::
+base-clean-local::
 	rm -f *.so 
 	find . -name '*.gcno' -o -name '*.gcda' -o -name '*.lcov' -o -name "*.o" -o -name "*.lo" | xargs rm -f
 	rm -f ceph java/java/com/ceph/crush/Bucket.class
@@ -27066,7 +27924,7 @@ uninstall-coverage:
 check-coverage:
 @ENABLE_COVERAGE_TRUE@	-test/coverage.sh -d $(srcdir) -o check-coverage make check
 
-install-data-local:: install-coverage
+base-install-data-local:: install-coverage
 	-mkdir -p $(DESTDIR)$(sysconfdir)/ceph
 	-mkdir -p $(DESTDIR)$(localstatedir)/log/ceph
 	-mkdir -p $(DESTDIR)$(localstatedir)/lib/ceph/tmp
@@ -27106,6 +27964,13 @@ coverity-submit:
 @ENABLE_CLIENT_TRUE@	chmod a-w $@.tmp
 @ENABLE_CLIENT_TRUE@	mv $@.tmp $@
 
+# local targets
+
+all-local: $(LOCAL_ALL)
+clean-local: $(LOCAL_CLEAN)
+install-exec-local: $(LOCAL_INSTALLEXEC)
+install-data-local: $(LOCAL_INSTALLDATA)
+
 # Tell versions [3.59,3.63) of GNU make to not export all variables.
 # Otherwise a system limit (for SysV at least) may be exceeded.
 .NOEXPORT:
diff --git a/src/acconfig.h.in b/src/acconfig.h.in
index 0090f4f..fd61271 100644
--- a/src/acconfig.h.in
+++ b/src/acconfig.h.in
@@ -59,6 +59,12 @@
 /* Define to 1 if you have the <dlfcn.h> header file. */
 #undef HAVE_DLFCN_H
 
+/* Have eventfd extension. */
+#undef HAVE_EVENTFD
+
+/* Define to 1 if you have the <execinfo.h> header file. */
+#undef HAVE_EXECINFO_H
+
 /* Define to 1 if you have fdatasync. */
 #undef HAVE_FDATASYNC
 
@@ -232,6 +238,9 @@
 /* Define if you have pthread_spin_init */
 #undef HAVE_PTHREAD_SPINLOCK
 
+/* Define if you have res_nquery */
+#undef HAVE_RES_NQUERY
+
 /* Define to 1 if you have sched.h. */
 #undef HAVE_SCHED
 
@@ -293,6 +302,9 @@
    */
 #undef HAVE_SYS_DIR_H
 
+/* Define to 1 if you have the <sys/eventfd.h> header file. */
+#undef HAVE_SYS_EVENTFD_H
+
 /* Define to 1 if you have the <sys/file.h> header file. */
 #undef HAVE_SYS_FILE_H
 
diff --git a/src/auth/Crypto.cc b/src/auth/Crypto.cc
index 24c4bd0..000a5b4 100644
--- a/src/auth/Crypto.cc
+++ b/src/auth/Crypto.cc
@@ -243,7 +243,6 @@ static int nss_aes_operation(CK_ATTRIBUTE_TYPE op,
 			 out_tmp.length()-written);
   PK11_DestroyContext(ectx, PR_TRUE);
   if (ret != SECSuccess) {
-    PK11_DestroyContext(ectx, PR_TRUE);
     if (error) {
       ostringstream oss;
       oss << "NSS AES final round failed: " << PR_GetError();
@@ -420,6 +419,8 @@ int CryptoKey::_set_secret(int t, const bufferptr& s)
     if (error.length()) {
       return -EIO;
     }
+  } else {
+      return -EOPNOTSUPP;
   }
   type = t;
   secret = s;
diff --git a/src/auth/cephx/CephxServiceHandler.cc b/src/auth/cephx/CephxServiceHandler.cc
index d65ac79..7b6212a 100644
--- a/src/auth/cephx/CephxServiceHandler.cc
+++ b/src/auth/cephx/CephxServiceHandler.cc
@@ -139,6 +139,13 @@ int CephxServiceHandler::handle_request(bufferlist::iterator& indata, bufferlist
 
       if (!key_server->get_service_caps(entity_name, CEPH_ENTITY_TYPE_MON, caps)) {
         ldout(cct, 0) << " could not get mon caps for " << entity_name << dendl;
+        ret = -EACCES;
+      } else {
+        char *caps_str = caps.caps.c_str();
+        if (!caps_str || !caps_str[0]) {
+          ldout(cct,0) << "mon caps null for " << entity_name << dendl;
+          ret = -EACCES;
+        }
       }
     }
     break;
@@ -183,8 +190,10 @@ int CephxServiceHandler::handle_request(bufferlist::iterator& indata, bufferlist
     {
       ldout(cct, 10) << "handle_request getting rotating secret for " << entity_name << dendl;
       build_cephx_response_header(cephx_header.request_type, 0, result_bl);
-      key_server->get_rotating_encrypted(entity_name, result_bl);
-      ret = 0;
+      if (!key_server->get_rotating_encrypted(entity_name, result_bl)) {
+        ret = -EPERM;
+        break;
+      }
     }
     break;
 
diff --git a/src/ceph-detect-init/Makefile.am b/src/ceph-detect-init/Makefile.am
index a2c885a..932f755 100644
--- a/src/ceph-detect-init/Makefile.am
+++ b/src/ceph-detect-init/Makefile.am
@@ -53,13 +53,13 @@ EXTRA_DIST += \
 	ceph-detect-init/tests/test_all.py \
 	ceph-detect-init/tox.ini
 
-all-local::
+ceph-detect-init-all:
 	cd $(srcdir)/ceph-detect-init ; python setup.py build
 
-clean-local::
+ceph-detect-init-clean:
 	cd $(srcdir)/ceph-detect-init ; python setup.py clean ; rm -fr wheelhouse .tox build .coverage *.egg-info
 
-install-data-local::
+ceph-detect-init-install-data:
 	cd $(srcdir)/ceph-detect-init ; \
 	if test "$(DESTDIR)" ; then \
 		if lsb_release -si | grep --quiet 'Ubuntu\|Debian\|Devuan' ; then \
@@ -70,3 +70,7 @@ install-data-local::
 		root="--root=$(DESTDIR)" ; \
 	fi ; \
 	python setup.py install $$root $$options
+
+LOCAL_ALL += ceph-detect-init-all
+LOCAL_CLEAN += ceph-detect-init-clean
+LOCAL_INSTALLDATA += ceph-detect-init-install-data
diff --git a/src/ceph-detect-init/tests/test_all.py b/src/ceph-detect-init/tests/test_all.py
index 069a0ed..4c408f9 100644
--- a/src/ceph-detect-init/tests/test_all.py
+++ b/src/ceph-detect-init/tests/test_all.py
@@ -50,6 +50,9 @@ class TestCephDetectInit(testtools.TestCase):
         with mock.patch('ceph_detect_init.debian.distro',
                         'ubuntu'):
             self.assertEqual('upstart', debian.choose_init())
+            with mock.patch('ceph_detect_init.debian.codename',
+                            'vivid'):
+                self.assertEqual('systemd', debian.choose_init())
 
     def test_fedora(self):
         with mock.patch('ceph_detect_init.fedora.release',
diff --git a/src/ceph-disk b/src/ceph-disk
index 7f4a009..0a91fa9 100755
--- a/src/ceph-disk
+++ b/src/ceph-disk
@@ -2870,9 +2870,7 @@ def get_journal_osd_uuid(path):
         out = _check_output(
             args=[
                 'ceph-osd',
-                '-i', '0',   # this is ignored
-                '--get-journal-uuid',
-                '--osd-journal',
+                '--get-device-fsid',
                 path,
                 ],
             close_fds=True,
diff --git a/src/ceph.in b/src/ceph.in
index bd2a6dd..1142b20 100755
--- a/src/ceph.in
+++ b/src/ceph.in
@@ -718,6 +718,10 @@ def main():
     except KeyboardInterrupt:
         print >> sys.stderr, 'Cluster connection aborted'
         return 1
+    except rados.PermissionDeniedError as e:
+        print >> sys.stderr, 'Error connecting to cluster: {0}'.\
+            format(e.__class__.__name__)
+        return errno.EACCES
     except Exception as e:
         print >> sys.stderr, 'Error connecting to cluster: {0}'.\
             format(e.__class__.__name__)
diff --git a/src/ceph_fuse.cc b/src/ceph_fuse.cc
index 4d166eb..6d5c946 100644
--- a/src/ceph_fuse.cc
+++ b/src/ceph_fuse.cc
@@ -37,9 +37,32 @@ using namespace std;
 #include <sys/types.h>
 #include <fcntl.h>
 
+#include <fuse.h>
+
+static void fuse_usage()
+{
+  // Print libfuse's own option help by parsing a synthetic "ceph-fuse -h" command line.
+  // argv is a stack array (nothing to malloc/free); FUSE_ARGS_INIT leaves args.allocated
+  // at 0, so fuse_opt_free_args() only releases storage libfuse itself allocated.
+  const char *argv[] = { "ceph-fuse", "-h" };
+  struct fuse_args args = FUSE_ARGS_INIT(2, (char**)argv);
+  if (fuse_parse_cmdline(&args, NULL, NULL, NULL) == -1) {
+    derr << "fuse_parse_cmdline failed." << dendl;
+    fuse_opt_free_args(&args);
+    return;  // bail out: nothing further to release
+  }
+
+  fuse_opt_free_args(&args);
+}
 void usage()
 {
-  cerr << "usage: ceph-fuse [-m mon-ip-addr:mon-port] <mount point>" << std::endl;
+  cout <<
+"usage: ceph-fuse [-m mon-ip-addr:mon-port] <mount point> [OPTIONS]\n"
+"  --client_mountpoint/-r <root_directory>\n"
+"                    use root_directory as the mounted root, rather than the full Ceph tree.\n"
+"\n";
+  fuse_usage();
+  generic_client_usage();
 }
 
 int main(int argc, const char **argv, const char *envp[]) {
@@ -57,6 +80,9 @@ int main(int argc, const char **argv, const char *envp[]) {
     } else if (ceph_argparse_flag(args, i, "--localize-reads", (char*)NULL)) {
       cerr << "setting CEPH_OSD_FLAG_LOCALIZE_READS" << std::endl;
       filer_flags |= CEPH_OSD_FLAG_LOCALIZE_READS;
+    } else if (ceph_argparse_flag(args, i, "-h", "--help", (char*)NULL)) {
+      usage();
+      assert(0);
     } else {
       ++i;
     }
diff --git a/src/ceph_mds.cc b/src/ceph_mds.cc
index 8b88f58..16d71da 100644
--- a/src/ceph_mds.cc
+++ b/src/ceph_mds.cc
@@ -105,6 +105,10 @@ int main(int argc, const char **argv)
     if (ceph_argparse_double_dash(args, i)) {
       break;
     }
+    else if (ceph_argparse_flag(args, i, "--help", "-h", (char*)NULL)) {
+      usage();
+      break;
+    }
     else if (ceph_argparse_witharg(args, i, &val, "--journal-check", (char*)NULL)) {
       int r = parse_rank("journal-check", val);
       if (shadow != MDSMap::STATE_NULL) {
@@ -120,7 +124,7 @@ int main(int argc, const char **argv)
     }
     else if (ceph_argparse_witharg(args, i, &val, "--hot-standby", (char*)NULL)) {
       int r = parse_rank("hot-standby", val);
-      if (shadow) {
+      if (shadow != MDSMap::STATE_NULL) {
         dout(0) << "Error: can only select one standby state" << dendl;
         return -1;
       }
diff --git a/src/ceph_osd.cc b/src/ceph_osd.cc
index 7a429ff..fb5de36 100644
--- a/src/ceph_osd.cc
+++ b/src/ceph_osd.cc
@@ -71,8 +71,8 @@ void handle_osd_signal(int signum)
 void usage() 
 {
   cout << "usage: ceph-osd -i <osdid>\n"
-       << "  --osd-data=path   data directory\n"
-       << "  --osd-journal=path\n"
+       << "  --osd-data PATH data directory\n"
+       << "  --osd-journal PATH\n"
        << "                    journal file or block device\n"
        << "  --mkfs            create a [new] data directory\n"
        << "  --convert-filestore\n"
@@ -85,7 +85,9 @@ void usage()
        << "                    check whether a journal is allowed\n"
        << "  --check-needs-journal\n"
        << "                    check whether a journal is required\n"
-       << "  --debug_osd N     set debug level (e.g. 10)"
+       << "  --debug_osd <N>   set debug level (e.g. 10)"
+       << "  --get-device-fsid PATH\n"
+       << "                    get OSD fsid for the given block device\n"
        << std::endl;
   generic_server_usage();
   cout.flush();
@@ -130,9 +132,11 @@ int main(int argc, const char **argv)
   bool flushjournal = false;
   bool dump_journal = false;
   bool convertfilestore = false;
-  bool get_journal_fsid = false;
   bool get_osd_fsid = false;
   bool get_cluster_fsid = false;
+  bool get_journal_fsid = false;
+  bool get_device_fsid = false;
+  string device_path;
   std::string dump_pg_log;
 
   std::string val;
@@ -168,6 +172,9 @@ int main(int argc, const char **argv)
       get_osd_fsid = true;
     } else if (ceph_argparse_flag(args, i, "--get-journal-fsid", "--get-journal-uuid", (char*)NULL)) {
       get_journal_fsid = true;
+    } else if (ceph_argparse_witharg(args, i, &device_path,
+				     "--get-device-fsid", (char*)NULL)) {
+      get_device_fsid = true;
     } else {
       ++i;
     }
@@ -177,6 +184,22 @@ int main(int argc, const char **argv)
     usage();
   }
 
+  if (get_journal_fsid) {
+    device_path = g_conf->osd_journal;
+    get_device_fsid = true;
+  }
+  if (get_device_fsid) {
+    uuid_d uuid;
+    int r = ObjectStore::probe_block_device_fsid(device_path, &uuid);
+    if (r < 0) {
+      cerr << "failed to get device fsid for " << device_path
+	   << ": " << cpp_strerror(r) << std::endl;
+      exit(1);
+    }
+    cout << uuid << std::endl;
+    return 0;
+  }
+
   if (!dump_pg_log.empty()) {
     common_init_finish(g_ceph_context);
     bufferlist bl;
@@ -217,8 +240,23 @@ int main(int argc, const char **argv)
   }
 
   // the store
+  string store_type = g_conf->osd_objectstore;
+  {
+    char fn[PATH_MAX];
+    snprintf(fn, sizeof(fn), "%s/type", g_conf->osd_data.c_str());
+    int fd = ::open(fn, O_RDONLY);
+    if (fd >= 0) {
+      bufferlist bl;
+      bl.read_fd(fd, 64);
+      if (bl.length()) {
+	store_type = string(bl.c_str(), bl.length() - 1);  // drop \n
+	dout(5) << "object store type is " << store_type << dendl;
+      }
+      ::close(fd);
+    }
+  }
   ObjectStore *store = ObjectStore::create(g_ceph_context,
-					   g_conf->osd_objectstore,
+					   store_type,
 					   g_conf->osd_data,
 					   g_conf->osd_journal);
   if (!store) {
@@ -365,14 +403,6 @@ int main(int argc, const char **argv)
     exit(0);
   }
   
-  if (get_journal_fsid) {
-    uuid_d fsid;
-    int r = store->peek_journal_fsid(&fsid);
-    if (r == 0)
-      cout << fsid << std::endl;
-    exit(r);
-  }
-
   string magic;
   uuid_d cluster_fsid, osd_fsid;
   int w;
diff --git a/src/client/Client.cc b/src/client/Client.cc
index aab3052..9a0c36a 100644
--- a/src/client/Client.cc
+++ b/src/client/Client.cc
@@ -554,6 +554,12 @@ void Client::shutdown()
 {
   ldout(cct, 1) << "shutdown" << dendl;
 
+  // If we were not mounted, but were being used for sending
+  // MDS commands, we may have sessions that need closing.
+  client_lock.Lock();
+  _close_sessions();
+  client_lock.Unlock();
+
   cct->_conf->remove_observer(this);
 
   AdminSocket* admin_socket = cct->get_admin_socket();
@@ -2492,14 +2498,16 @@ void Client::send_reconnect(MetaSession *session)
       bufferlist flockbl;
       _encode_filelocks(in, flockbl);
 
-      in->caps[mds]->seq = 0;  // reset seq.
-      in->caps[mds]->issue_seq = 0;  // reset seq.
-      in->caps[mds]->mseq = 0;  // reset seq.
+      Cap *cap = in->caps[mds];
+      cap->seq = 0;  // reset seq.
+      cap->issue_seq = 0;  // reset seq.
+      cap->mseq = 0;  // reset seq.
+      cap->issued = cap->implemented;
       m->add_cap(p->first.ino, 
-		 in->caps[mds]->cap_id,
+		 cap->cap_id,
 		 path.get_ino(), path.get_path(),   // ino
 		 in->caps_wanted(), // wanted
-		 in->caps[mds]->issued,     // issued
+		 cap->issued,     // issued
 		 in->snaprealm->ino,
 		 flockbl);
 
@@ -2857,16 +2865,9 @@ void Client::put_cap_ref(Inode *in, int cap)
     if (last & CEPH_CAP_FILE_CACHE) {
       ldout(cct, 5) << "put_cap_ref dropped last FILE_CACHE ref on " << *in << dendl;
       ++put_nref;
-      // release clean pages too, if we dont want RDCACHE
-      if (!(in->caps_wanted() & CEPH_CAP_FILE_CACHE))
-	drop |= CEPH_CAP_FILE_CACHE;
-    }
-    if (drop) {
-      if (drop & CEPH_CAP_FILE_CACHE)
-	_invalidate_inode_cache(in);
-      else
-	check_caps(in, false);
     }
+    if (drop)
+      check_caps(in, false);
     if (put_nref)
       put_inode(in, put_nref);
   }
@@ -3087,6 +3088,10 @@ void Client::check_caps(Inode *in, bool is_delayed)
     wanted |= CEPH_CAP_FILE_EXCL;
   }
 
+  int implemented;
+  int issued = in->caps_issued(&implemented);
+  int revoking = implemented & ~issued;
+
   int retain = wanted | used | CEPH_CAP_PIN;
   if (!unmounting) {
     if (wanted)
@@ -3098,6 +3103,8 @@ void Client::check_caps(Inode *in, bool is_delayed)
   ldout(cct, 10) << "check_caps on " << *in
 	   << " wanted " << ccap_string(wanted)
 	   << " used " << ccap_string(used)
+	   << " issued " << ccap_string(issued)
+	   << " revoking " << ccap_string(revoking)
 	   << " is_delayed=" << is_delayed
 	   << dendl;
 
@@ -3107,6 +3114,10 @@ void Client::check_caps(Inode *in, bool is_delayed)
   if (in->caps.empty())
     return;   // guard if at end of func
 
+  if ((revoking & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) &&
+      (used & CEPH_CAP_FILE_CACHE) && !(used & CEPH_CAP_FILE_BUFFER))
+    _release(in);
+
   if (!in->cap_snaps.empty())
     flush_snaps(in);
 
@@ -3130,7 +3141,7 @@ void Client::check_caps(Inode *in, bool is_delayed)
     if (in->auth_cap && cap != in->auth_cap)
       cap_used &= ~in->auth_cap->issued;
 
-    int revoking = cap->implemented & ~cap->issued;
+    revoking = cap->implemented & ~cap->issued;
     
     ldout(cct, 10) << " cap mds." << mds
 	     << " issued " << ccap_string(cap->issued)
@@ -3385,41 +3396,36 @@ private:
   Client *client;
   InodeRef inode;
   int64_t offset, length;
-  bool keep_caps;
 public:
-  C_Client_CacheInvalidate(Client *c, Inode *in, int64_t off, int64_t len, bool keep) :
-			   client(c), inode(in), offset(off), length(len), keep_caps(keep) {
+  C_Client_CacheInvalidate(Client *c, Inode *in, int64_t off, int64_t len) :
+			   client(c), inode(in), offset(off), length(len) {
   }
   void finish(int r) {
     // _async_invalidate takes the lock when it needs to, call this back from outside of lock.
     assert(!client->client_lock.is_locked_by_me());
-    client->_async_invalidate(inode, offset, length, keep_caps);
+    client->_async_invalidate(inode, offset, length);
   }
 };
 
-void Client::_async_invalidate(InodeRef& in, int64_t off, int64_t len, bool keep_caps)
+void Client::_async_invalidate(InodeRef& in, int64_t off, int64_t len)
 {
-  ldout(cct, 10) << "_async_invalidate " << off << "~" << len << (keep_caps ? " keep_caps" : "") << dendl;
+  ldout(cct, 10) << "_async_invalidate " << off << "~" << len << dendl;
   if (use_faked_inos())
     ino_invalidate_cb(callback_handle, vinodeno_t(in->faked_ino, CEPH_NOSNAP), off, len);
   else
     ino_invalidate_cb(callback_handle, in->vino(), off, len);
 
   client_lock.Lock();
-  if (!keep_caps)
-    check_caps(in.get(), false);
   in.reset(); // put inode inside client_lock
   client_lock.Unlock();
-  ldout(cct, 10) << "_async_invalidate " << off << "~" << len << (keep_caps ? " keep_caps" : "") << " done" << dendl;
+  ldout(cct, 10) << "_async_invalidate " << off << "~" << len << " done" << dendl;
 }
 
-void Client::_schedule_invalidate_callback(Inode *in, int64_t off, int64_t len, bool keep_caps) {
+void Client::_schedule_invalidate_callback(Inode *in, int64_t off, int64_t len) {
 
   if (ino_invalidate_cb)
     // we queue the invalidate, which calls the callback and decrements the ref
-    async_ino_invalidator.queue(new C_Client_CacheInvalidate(this, in, off, len, keep_caps));
-  else if (!keep_caps)
-    check_caps(in, false);
+    async_ino_invalidator.queue(new C_Client_CacheInvalidate(this, in, off, len));
 }
 
 void Client::_invalidate_inode_cache(Inode *in)
@@ -3430,7 +3436,7 @@ void Client::_invalidate_inode_cache(Inode *in)
   if (cct->_conf->client_oc)
     objectcacher->release_set(&in->oset);
 
-  _schedule_invalidate_callback(in, 0, 0, false);
+  _schedule_invalidate_callback(in, 0, 0);
 }
 
 void Client::_invalidate_inode_cache(Inode *in, int64_t off, int64_t len)
@@ -3444,15 +3450,17 @@ void Client::_invalidate_inode_cache(Inode *in, int64_t off, int64_t len)
     objectcacher->discard_set(&in->oset, ls);
   }
 
-  _schedule_invalidate_callback(in, off, len, true);
+  _schedule_invalidate_callback(in, off, len);
 }
 
-void Client::_release(Inode *in)
+bool Client::_release(Inode *in)
 {
   ldout(cct, 20) << "_release " << *in << dendl;
   if (in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) {
     _invalidate_inode_cache(in);
+    return true;
   }
+  return false;
 }
 
 bool Client::_flush(Inode *in, Context *onfinish)
@@ -4689,7 +4697,8 @@ void Client::handle_cap_grant(MetaSession *session, Inode *in, Cap *cap, MClient
         && !_flush(in, new C_Client_FlushComplete(this, in))) {
       // waitin' for flush
     } else if ((old_caps & ~new_caps) & CEPH_CAP_FILE_CACHE) {
-      _release(in);
+      if (_release(in))
+	check = true;
     } else {
       cap->wanted = 0; // don't let check_caps skip sending a response to MDS
       check = true;
@@ -5035,15 +5044,21 @@ int Client::mount(const std::string &mount_root, bool require_mds)
   ldout(cct, 2) << "mounted: have mdsmap " << mdsmap->get_epoch() << dendl;
   if (require_mds) {
     while (1) {
-      if (mdsmap->get_epoch() > 0) {
-        if (mdsmap->get_num_mds(CEPH_MDS_STATE_ACTIVE) == 0) {
-          ldout(cct, 10) << "no mds up: epoch=" << mdsmap->get_epoch() << dendl;
-          return CEPH_FUSE_NO_MDS_UP;
-        } else {
-          break;
-        }
-      } else {
+      auto availability = mdsmap->is_cluster_available();
+      if (availability == MDSMap::STUCK_UNAVAILABLE) {
+        // Error out
+        ldout(cct, 10) << "mds cluster unavailable: epoch=" << mdsmap->get_epoch() << dendl;
+        return CEPH_FUSE_NO_MDS_UP;
+      } else if (availability == MDSMap::AVAILABLE) {
+        // Continue to mount
+        break;
+      } else if (availability == MDSMap::TRANSIENT_UNAVAILABLE) {
+        // Else, wait.  MDSMonitor will update the map to bring
+        // us to a conclusion eventually.
         wait_on_list(waiting_for_mdsmap);
+      } else {
+        // Unexpected value!
+        assert(0);
       }
     }
   }
@@ -5100,6 +5115,24 @@ int Client::mount(const std::string &mount_root, bool require_mds)
 
 // UNMOUNT
 
+void Client::_close_sessions()
+{
+  // Runs with client_lock held: the lock is handed to mount_cond.Wait() below.
+  // Repeat until mds_sessions is empty.
+  while (!mds_sessions.empty()) {
+    // ask every session that is not already closing to close
+    for (auto &entry : mds_sessions) {
+      MetaSession *session = entry.second;
+      if (session->state == MetaSession::STATE_CLOSING)
+	continue;
+      _close_mds_session(session);
+    }
+
+    ldout(cct, 2) << "waiting for " << mds_sessions.size() << " mds sessions to close" << dendl;
+    mount_cond.Wait(client_lock);  // block on mount_cond, then re-check the map
+  }
+}
+
 void Client::unmount()
 {
   Mutex::Locker lock(client_lock);
@@ -5184,21 +5217,7 @@ void Client::unmount()
     traceout.close();
   }
 
-  
-  while (!mds_sessions.empty()) {
-    // send session closes!
-    for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
-	p != mds_sessions.end();
-	++p) {
-      if (p->second->state != MetaSession::STATE_CLOSING) {
-	_close_mds_session(p->second);
-      }
-    }
-
-    // wait for sessions to close
-    ldout(cct, 2) << "waiting for " << mds_sessions.size() << " mds sessions to close" << dendl;
-    mount_cond.Wait(client_lock);
-  }
+  _close_sessions();
 
   mounted = false;
 
@@ -6256,7 +6275,6 @@ int Client::_opendir(Inode *in, dir_result_t **dirpp, int uid, int gid)
   if (!in->is_dir())
     return -ENOTDIR;
   *dirpp = new dir_result_t(in);
-  (*dirpp)->set_frag(in->dirfragtree[0]);
   if (in->dir) {
     (*dirpp)->release_count = in->dir->release_count;
     (*dirpp)->ordered_count = in->dir->ordered_count;
@@ -7132,13 +7150,7 @@ int Client::_release_fh(Fh *f)
   if (in->snapid == CEPH_NOSNAP) {
     if (in->put_open_ref(f->mode)) {
       _flush(in, new C_Client_FlushComplete(this, in));
-      // release clean pages too, if we dont want RDCACHE
-      if (in->cap_refs[CEPH_CAP_FILE_CACHE] == 0 &&
-	  !(in->caps_wanted() & CEPH_CAP_FILE_CACHE) &&
-	  !objectcacher->set_is_empty(&in->oset))
-	_invalidate_inode_cache(in);
-      else
-	check_caps(in, false);
+      check_caps(in, false);
     }
   } else {
     assert(in->snap_cap_refs > 0);
@@ -7687,7 +7699,7 @@ void Client::sync_write_commit(InodeRef& in)
 
   ldout(cct, 15) << "sync_write_commit unsafe_sync_write = " << unsafe_sync_write << dendl;
   if (unsafe_sync_write == 0 && unmounting) {
-    ldout(cct, 10) << "sync_write_comit -- no more unsafe writes, unmount can proceed" << dendl;
+    ldout(cct, 10) << "sync_write_commit -- no more unsafe writes, unmount can proceed" << dendl;
     mount_cond.Signal();
   }
 
@@ -8655,7 +8667,8 @@ int Client::lazyio_synchronize(int fd, loff_t offset, size_t count)
   Inode *in = f->inode.get();
   
   _fsync(f, true);
-  _release(in);
+  if (_release(in))
+    check_caps(in, false);
   return 0;
 }
 
diff --git a/src/client/Client.h b/src/client/Client.h
index 0482360..8b31700 100644
--- a/src/client/Client.h
+++ b/src/client/Client.h
@@ -534,6 +534,8 @@ protected:
    */
   void _handle_full_flag(int64_t pool);
 
+  void _close_sessions();
+
  public:
   void set_filer_flags(int flags);
   void clear_filer_flags(int flags);
@@ -610,11 +612,11 @@ protected:
   void _async_dentry_invalidate(vinodeno_t dirino, vinodeno_t ino, string& name);
   void _try_to_trim_inode(Inode *in);
 
-  void _schedule_invalidate_callback(Inode *in, int64_t off, int64_t len, bool keep_caps);
+  void _schedule_invalidate_callback(Inode *in, int64_t off, int64_t len);
   void _invalidate_inode_cache(Inode *in);
   void _invalidate_inode_cache(Inode *in, int64_t off, int64_t len);
-  void _async_invalidate(InodeRef& in, int64_t off, int64_t len, bool keep_caps);
-  void _release(Inode *in);
+  void _async_invalidate(InodeRef& in, int64_t off, int64_t len);
+  bool _release(Inode *in);
   
   /**
    * Initiate a flush of the data associated with the given inode.
diff --git a/src/cls/Makefile-client.am b/src/cls/Makefile-client.am
index f1f7983..642d167 100644
--- a/src/cls/Makefile-client.am
+++ b/src/cls/Makefile-client.am
@@ -41,8 +41,11 @@ libcls_rgw_client_la_SOURCES = \
 noinst_LTLIBRARIES += libcls_rgw_client.la
 DENCODER_DEPS += libcls_rgw_client.la
 
-libcls_rbd_client_la_SOURCES = cls/rbd/cls_rbd_client.cc
+libcls_rbd_client_la_SOURCES = \
+	cls/rbd/cls_rbd_client.cc \
+	cls/rbd/cls_rbd_types.cc
 noinst_LTLIBRARIES += libcls_rbd_client.la
+DENCODER_DEPS += libcls_rbd_client.la
 
 libcls_user_client_a_SOURCES = cls/user/cls_user_client.cc \
 	cls/user/cls_user_types.cc \
@@ -71,6 +74,7 @@ noinst_HEADERS += \
 	cls/numops/cls_numops_client.h \
 	cls/rbd/cls_rbd.h \
 	cls/rbd/cls_rbd_client.h \
+	cls/rbd/cls_rbd_types.h \
 	cls/refcount/cls_refcount_ops.h \
 	cls/refcount/cls_refcount_client.h \
 	cls/version/cls_version_types.h \
diff --git a/src/cls/Makefile-server.am b/src/cls/Makefile-server.am
index cf0b26a..9b081bc 100644
--- a/src/cls/Makefile-server.am
+++ b/src/cls/Makefile-server.am
@@ -10,7 +10,9 @@ libcls_numops_la_SOURCES = cls/numops/cls_numops.cc
 libcls_numops_la_LDFLAGS = ${AM_LDFLAGS} -module -avoid-version -shared -export-symbols-regex '.*__cls_.*'
 radoslib_LTLIBRARIES += libcls_numops.la
 
-libcls_rbd_la_SOURCES = cls/rbd/cls_rbd.cc
+libcls_rbd_la_SOURCES = \
+	cls/rbd/cls_rbd.cc \
+	cls/rbd/cls_rbd_types.cc
 libcls_rbd_la_LIBADD = $(PTHREAD_LIBS) $(EXTRALIBS)
 libcls_rbd_la_LDFLAGS = ${AM_LDFLAGS} -module -avoid-version -shared -export-symbols-regex '.*__cls_.*'
 radoslib_LTLIBRARIES += libcls_rbd.la
diff --git a/src/cls/cephfs/cls_cephfs.cc b/src/cls/cephfs/cls_cephfs.cc
index f58f0de..8f54e3a 100644
--- a/src/cls/cephfs/cls_cephfs.cc
+++ b/src/cls/cephfs/cls_cephfs.cc
@@ -123,6 +123,72 @@ static int accumulate_inode_metadata(cls_method_context_t hctx,
   return 0;
 }
 
+// PGLS filter specific to CephFS: it selects objects whose name ends in
+// ".00000000" and whose scrub tag xattr differs from a caller-supplied
+// value.  The rule is too special-purpose to present as a generic filter,
+// so it is offered openly as the cephfs one.
+class PGLSCephFSFilter : public PGLSFilter {
+protected:
+  std::string scrub_tag;  // tag to exclude; empty means "do not compare tags"
+public:
+  // Decode InodeTagFilterArgs from params and choose the xattr to fetch.
+  // Returns 0 on success, -EINVAL when the arguments cannot be decoded.
+  int init(bufferlist::iterator& params) {
+    InodeTagFilterArgs decoded;
+    try {
+      decoded.decode(params);
+    } catch (buffer::error &e) {
+      return -EINVAL;
+    }
+    scrub_tag = decoded.scrub_tag;
+
+    // "xattr" is not declared here -- presumably a PGLSFilter member; confirm
+    xattr = scrub_tag.empty() ? "" : "_scrub_tag";
+    return 0;
+  }
+
+  virtual ~PGLSCephFSFilter() {}
+  // NOTE(review): presumably lets filter() see objects lacking the xattr -- confirm
+  virtual bool reject_empty_xattr() { return false; }
+  // defined out of line
+  virtual bool filter(const hobject_t &obj, bufferlist& xattr_data,
+                      bufferlist& outdata);
+};
+
+bool PGLSCephFSFilter::filter(const hobject_t &obj,
+                             bufferlist& xattr_data, bufferlist& outdata)
+{
+  // Accept (return true) only objects whose name ends in ".00000000" and whose
+  // stored tag, if one can be decoded, differs from scrub_tag.  outdata is unused.
+  const std::string suffix = ".00000000";
+  const std::string &name = obj.oid.name;
+  const std::string::size_type suffix_len = suffix.length();
+
+  // too short to carry the suffix, or the tail does not match it
+  if (name.length() < suffix_len ||
+      name.compare(name.length() - suffix_len, suffix_len, suffix) != 0) {
+    return false;
+  }
+
+  // no tag requested, or no xattr payload supplied: nothing left to compare
+  if (scrub_tag.empty() || xattr_data.length() == 0) {
+    return true;
+  }
+  std::string stored_tag;
+  try {
+    bufferlist::iterator it = xattr_data.begin();
+    ::decode(stored_tag, it);
+  } catch (const buffer::error &err) {
+    return true;  // an undecodable tag is treated like a missing one
+  }
+  return stored_tag != scrub_tag;
+}
+
+PGLSFilter *inode_tag_filter()  // factory: hands back a newly allocated PGLSCephFSFilter
+{
+  return new PGLSCephFSFilter();
+}
+
 /**
  * initialize class
  *
@@ -139,5 +205,8 @@ void __cls_init()
   cls_register_cxx_method(h_class, "accumulate_inode_metadata",
 			  CLS_METHOD_WR | CLS_METHOD_RD,
 			  accumulate_inode_metadata, &h_accumulate_inode_metadata);
+
+  // A PGLS filter
+  cls_register_cxx_filter(h_class, "inode_tag", inode_tag_filter);
 }
 
diff --git a/src/cls/cephfs/cls_cephfs.h b/src/cls/cephfs/cls_cephfs.h
index d4a5f23..ca631c6 100644
--- a/src/cls/cephfs/cls_cephfs.h
+++ b/src/cls/cephfs/cls_cephfs.h
@@ -108,6 +108,26 @@ public:
   }
 };
 
+class InodeTagFilterArgs  // versioned argument structure carrying a scrub tag
+{
+  public:
+    std::string scrub_tag;  // the only field that is encoded/decoded
+  // Append scrub_tag to bl, framed by ENCODE_START(1, 1, ...) / ENCODE_FINISH.
+  void encode(bufferlist &bl) const
+  {
+    ENCODE_START(1, 1, bl);
+    ::encode(scrub_tag, bl);
+    ENCODE_FINISH(bl);
+  }
+  // Read scrub_tag back from bl; mirrors encode() field for field.
+  void decode(bufferlist::iterator &bl)
+  {
+    DECODE_START(1, bl);
+    ::decode(scrub_tag, bl);
+    DECODE_FINISH(bl);
+  }
+};
+
 class AccumulateResult
 {
 public:
diff --git a/src/cls/cephfs/cls_cephfs_client.cc b/src/cls/cephfs/cls_cephfs_client.cc
index c471fde..dc94528 100644
--- a/src/cls/cephfs/cls_cephfs_client.cc
+++ b/src/cls/cephfs/cls_cephfs_client.cc
@@ -144,3 +144,18 @@ int ClsCephFSClient::fetch_inode_accumulate_result(
   return 0;
 }
 
+void ClsCephFSClient::build_tag_filter(
+          const std::string &scrub_tag,
+          bufferlist *out_bl)
+{
+  assert(out_bl != NULL);
+
+  // Filter-specific payload: a versioned InodeTagFilterArgs
+  InodeTagFilterArgs filter_args;
+  filter_args.scrub_tag = scrub_tag;
+
+  // Wire order: the un-versioned filter name first, then the payload
+  ::encode(std::string("cephfs.inode_tag"), *out_bl);
+  filter_args.encode(*out_bl);
+}
+
diff --git a/src/cls/cephfs/cls_cephfs_client.h b/src/cls/cephfs/cls_cephfs_client.h
index 45d3c4b..ddd8456 100644
--- a/src/cls/cephfs/cls_cephfs_client.h
+++ b/src/cls/cephfs/cls_cephfs_client.h
@@ -22,5 +22,9 @@ class ClsCephFSClient
       inode_backtrace_t *backtrace,
       ceph_file_layout *layout,
       AccumulateResult *result);
+
+  static void build_tag_filter(
+      const std::string &scrub_tag,
+      bufferlist *out_bl);
 };
 
diff --git a/src/cls/journal/cls_journal_types.h b/src/cls/journal/cls_journal_types.h
index dd38b0d..9348739 100644
--- a/src/cls/journal/cls_journal_types.h
+++ b/src/cls/journal/cls_journal_types.h
@@ -5,7 +5,7 @@
 #define CEPH_CLS_JOURNAL_TYPES_H
 
 #include "include/int_types.h"
-#include "include/buffer.h"
+#include "include/buffer_fwd.h"
 #include "include/encoding.h"
 #include <iosfwd>
 #include <list>
diff --git a/src/cls/rbd/cls_rbd.cc b/src/cls/rbd/cls_rbd.cc
index f83353b..d1e7caa 100644
--- a/src/cls/rbd/cls_rbd.cc
+++ b/src/cls/rbd/cls_rbd.cc
@@ -44,6 +44,7 @@
 #include "include/rbd/object_map_types.h"
 
 #include "cls/rbd/cls_rbd.h"
+#include "cls/rbd/cls_rbd_types.h"
 
 
 /*
@@ -110,6 +111,13 @@ cls_method_handle_t h_old_snapshots_list;
 cls_method_handle_t h_old_snapshot_add;
 cls_method_handle_t h_old_snapshot_remove;
 cls_method_handle_t h_old_snapshot_rename;
+cls_method_handle_t h_mirror_is_enabled;
+cls_method_handle_t h_mirror_set_enabled;
+cls_method_handle_t h_mirror_peer_list;
+cls_method_handle_t h_mirror_peer_add;
+cls_method_handle_t h_mirror_peer_remove;
+cls_method_handle_t h_mirror_peer_set_client;
+cls_method_handle_t h_mirror_peer_set_cluster;
 
 #define RBD_MAX_KEYS_READ 64
 #define RBD_SNAP_KEY_PREFIX "snapshot_"
@@ -2506,8 +2514,6 @@ int metadata_list(cls_method_context_t hctx, bufferlist *in, bufferlist *out)
       break;
 
     map<string, bufferlist>::iterator it = raw_data.begin();
-    if (metadata_name_from_key(it->first) == last_read)
-        ++it;
     for (; it != raw_data.end(); ++it)
       data[metadata_name_from_key(it->first)].swap(it->second);
 
@@ -2903,7 +2909,339 @@ int old_snapshot_rename(cls_method_context_t hctx, bufferlist *in, bufferlist *o
   rc = cls_cxx_write_full(hctx, &newbl);
   if (rc < 0)
     return rc;
+  return 0;
+}
+
+namespace mirror {
+// Prefix of the omap keys under which peers are stored (see peer_key()).
+static const std::string PEER_KEY_PREFIX("mirror_peer_");
+
+std::string peer_key(const std::string &uuid) {
+  return PEER_KEY_PREFIX + uuid;
+}
+
+// Reads the "mirror_enabled" omap flag into *enabled (-ENOENT => false).
+int is_enabled(cls_method_context_t hctx, bool *enabled) {
+  bufferlist bl;
+  int r = cls_cxx_map_get_val(hctx, "mirror_enabled", &bl);
+  if (r < 0 && r != -ENOENT) {
+    CLS_ERR("error reading mirror enabled flag: %s",
+            cpp_strerror(r).c_str());
+    return r;
+  }
+  if (r == 0) {
+    try {
+      bufferlist::iterator bl_it = bl.begin();
+      ::decode(*enabled, bl_it);
+    } catch (const buffer::error &err) {
+      CLS_ERR("could not decode flag");
+      return -EIO;
+    }
+  } else {
+    *enabled = false;
+  }
+  return 0;
+}
+
+// Loads all peers stored under PEER_KEY_PREFIX, RBD_MAX_KEYS_READ per batch.
+int read_peers(cls_method_context_t hctx,
+               std::vector<cls::rbd::MirrorPeer> *peers) {
+  std::string last_read = PEER_KEY_PREFIX;
+  int r = RBD_MAX_KEYS_READ;
+  while (r == RBD_MAX_KEYS_READ) {
+    std::map<std::string, bufferlist> vals;
+    r = cls_cxx_map_get_vals(hctx, last_read, PEER_KEY_PREFIX.c_str(),
+                             RBD_MAX_KEYS_READ, &vals);
+    if (r < 0) {
+      CLS_ERR("error reading peers: %s", cpp_strerror(r).c_str());
+      return r;
+    }
+    for (auto &it : vals) {
+      try {
+        bufferlist::iterator bl_it = it.second.begin();
+        cls::rbd::MirrorPeer peer;
+        ::decode(peer, bl_it);
+        peers->push_back(peer);
+      } catch (const buffer::error &err) {
+        CLS_ERR("could not decode peer '%s'", it.first.c_str());
+        return -EIO;
+      }
+      last_read = it.first;  // next batch must start after this key
+    }
+  }
+  return 0;
+}
+
+// Reads and decodes the single peer stored under peer_key(uuid).
+int read_peer(cls_method_context_t hctx, const std::string &uuid,
+              cls::rbd::MirrorPeer *peer) {
+  bufferlist bl;
+  int r = cls_cxx_map_get_val(hctx, peer_key(uuid), &bl);
+  if (r < 0) {
+    CLS_ERR("error reading peer '%s': %s", uuid.c_str(),
+            cpp_strerror(r).c_str());
+    return r;
+  }
+  try {
+    bufferlist::iterator bl_it = bl.begin();
+    ::decode(*peer, bl_it);
+  } catch (const buffer::error &err) {
+    CLS_ERR("could not decode peer '%s'", uuid.c_str());
+    return -EIO;
+  }
+  return 0;
+}
+
+// Encodes peer and stores it under peer_key(uuid).
+int write_peer(cls_method_context_t hctx, const std::string &uuid,
+               const cls::rbd::MirrorPeer &peer) {
+  bufferlist bl;
+  ::encode(peer, bl);
+  int r = cls_cxx_map_set_val(hctx, peer_key(uuid), &bl);
+  if (r < 0) {
+    CLS_ERR("error writing peer '%s': %s", uuid.c_str(),
+            cpp_strerror(r).c_str());
+    return r;
+  }
+  return 0;
+}
+
+} // namespace mirror
+
+/**
+ * Input:
+ * none
+ *
+ * Output:
+ * @param bool: true if enabled (false when reading the flag yields -ENOENT)
+ * @returns 0 on success, negative error code on failure
+ */
+int mirror_is_enabled(cls_method_context_t hctx, bufferlist *in,
+                      bufferlist *out) {
+  bool enabled;  // assigned by mirror::is_enabled() whenever it returns 0
+  int r = mirror::is_enabled(hctx, &enabled);
+  if (r < 0) {
+    return r;
+  }
+
+  ::encode(enabled, *out);  // reply payload is the single encoded bool
+  return 0;
+}
+
+/**
+ * Input:
+ * @param enabled (bool): true stores the flag, false removes it
+ *
+ * Output:
+ * @returns 0 on success, -EBUSY when disabling with peers still registered,
+ *          negative error code on other failures
+ */
+int mirror_set_enabled(cls_method_context_t hctx, bufferlist *in,
+                       bufferlist *out) {
+  bool enabled;
+  try {
+    bufferlist::iterator bl_it = in->begin();
+    ::decode(enabled, bl_it);
+  } catch (const buffer::error &err) {
+    return -EINVAL;
+  }
+
+  int r;
+  if (enabled) {
+    bufferlist bl;
+    ::encode(enabled, bl);
+    r = cls_cxx_map_set_val(hctx, "mirror_enabled", &bl);
+    if (r < 0) {
+      CLS_ERR("error enabling mirroring: %s", cpp_strerror(r).c_str());
+      return r;
+    }
+  } else {
+    std::vector<cls::rbd::MirrorPeer> peers;
+    r = mirror::read_peers(hctx, &peers);  // reuse r; no shadowing local
+    if (r < 0 && r != -ENOENT) {
+      return r;
+    }
+
+    if (!peers.empty()) {
+      CLS_ERR("mirroring peers still registered");
+      return -EBUSY;
+    }
+
+    r = cls_cxx_map_remove_key(hctx, "mirror_enabled");
+    if (r < 0 && r != -ENOENT) {
+      CLS_ERR("error disabling mirroring: %s", cpp_strerror(r).c_str());
+      return r;
+    }
+  }
+  return 0;
+}
+
+/**
+ * Input:
+ * none
+ *
+ * Output:
+ * @param std::vector<cls::rbd::MirrorPeer>: every peer mirror::read_peers found
+ * @returns 0 on success, negative error code on failure
+ */
+int mirror_peer_list(cls_method_context_t hctx, bufferlist *in,
+                     bufferlist *out) {
+  std::vector<cls::rbd::MirrorPeer> peers;
+  int r = mirror::read_peers(hctx, &peers);
+  if (r < 0 && r != -ENOENT) {  // -ENOENT is not treated as a failure
+    return r;
+  }
+
+  ::encode(peers, *out);  // reply payload is the encoded vector
+  return 0;
+}
+
+/**
+ * Input:
+ * @param mirror_peer (cls::rbd::MirrorPeer)
+ *
+ * Output:
+ * @returns 0 on success, -EINVAL (bad input or mirroring disabled),
+ *          -EEXIST (duplicate cluster uuid or name), or other negative error
+ */
+int mirror_peer_add(cls_method_context_t hctx, bufferlist *in,
+                    bufferlist *out) {
+  cls::rbd::MirrorPeer mirror_peer;
+  try {
+    bufferlist::iterator it = in->begin();
+    ::decode(mirror_peer, it);
+  } catch (const buffer::error &err) {
+    return -EINVAL;
+  }
+
+  bool enabled;
+  int r = mirror::is_enabled(hctx, &enabled);
+  if (r < 0) {
+    return r;
+  }
+  if (!enabled) {
+    CLS_ERR("mirroring must be enabled on the pool");
+    return -EINVAL;
+  }
+
+  std::vector<cls::rbd::MirrorPeer> peers;
+  r = mirror::read_peers(hctx, &peers);
+  if (r < 0 && r != -ENOENT) {
+    return r;
+  }
+  // both the uuid and the name must be unique among the registered peers
+  for (auto const &peer : peers) {
+    if (peer.cluster_uuid == mirror_peer.cluster_uuid) {
+      CLS_ERR("peer cluster uuid '%s' already exists",
+              peer.cluster_uuid.c_str());
+      return -EEXIST;
+    } else if (peer.cluster_name == mirror_peer.cluster_name) {
+      CLS_ERR("peer cluster name '%s' already exists",
+              peer.cluster_name.c_str());
+      return -EEXIST;
+    }
+  }
+  bufferlist bl;
+  ::encode(mirror_peer, bl);
+  r = cls_cxx_map_set_val(hctx, mirror::peer_key(mirror_peer.cluster_uuid),
+                          &bl);
+  if (r < 0) {
+    CLS_ERR("error adding peer: %s", cpp_strerror(r).c_str());
+    return r;
+  }
+  return 0;
+}
+
+/**
+ * Input:
+ * @param cluster_uuid (std::string): uuid whose peer key is removed
+ *
+ * Output:
+ * @returns 0 on success or -ENOENT from the removal, negative error otherwise
+ */
+int mirror_peer_remove(cls_method_context_t hctx, bufferlist *in,
+                       bufferlist *out) {
+  std::string cluster_uuid;
+  try {
+    bufferlist::iterator it = in->begin();
+    ::decode(cluster_uuid, it);
+  } catch (const buffer::error &err) {
+    return -EINVAL;
+  }
+
+  int r = cls_cxx_map_remove_key(hctx, mirror::peer_key(cluster_uuid));
+  if (r < 0 && r != -ENOENT) {  // -ENOENT is deliberately not an error here
+    CLS_ERR("error removing peer: %s", cpp_strerror(r).c_str());
+    return r;
+  }
+  return 0;
+}
 
+/**
+ * Input:
+ * @param cluster_uuid (std::string): identifies the stored peer to update
+ * @param client_name (std::string): value written to the peer's client_name
+ *
+ * Output:
+ * @returns 0 on success, negative error code on failure
+ */
+int mirror_peer_set_client(cls_method_context_t hctx, bufferlist *in,
+                           bufferlist *out) {
+  std::string cluster_uuid;
+  std::string client_name;
+  try {
+    bufferlist::iterator it = in->begin();
+    ::decode(cluster_uuid, it);
+    ::decode(client_name, it);
+  } catch (const buffer::error &err) {
+    return -EINVAL;
+  }
+
+  cls::rbd::MirrorPeer peer;  // read-modify-write of the stored record
+  int r = mirror::read_peer(hctx, cluster_uuid, &peer);
+  if (r < 0) {
+    return r;
+  }
+
+  peer.client_name = client_name;
+  r = mirror::write_peer(hctx, cluster_uuid, peer);
+  if (r < 0) {
+    return r;
+  }
+  return 0;
+}
+
+/**
+ * Input:
+ * @param cluster_uuid (std::string): identifies the stored peer to update
+ * @param cluster_name (std::string): value written to the peer's cluster_name
+ *
+ * Output:
+ * @returns 0 on success, negative error code on failure
+ */
+int mirror_peer_set_cluster(cls_method_context_t hctx, bufferlist *in,
+                            bufferlist *out) {
+  std::string cluster_uuid;
+  std::string cluster_name;
+  try {
+    bufferlist::iterator it = in->begin();
+    ::decode(cluster_uuid, it);
+    ::decode(cluster_name, it);
+  } catch (const buffer::error &err) {
+    return -EINVAL;
+  }
+
+  cls::rbd::MirrorPeer peer;  // read-modify-write of the stored record
+  int r = mirror::read_peer(hctx, cluster_uuid, &peer);
+  if (r < 0) {
+    return r;
+  }
+
+  peer.cluster_name = cluster_name;  // NOTE(review): no duplicate-name check
+  r = mirror::write_peer(hctx, cluster_uuid, peer);
+  if (r < 0) {
+    return r;
+  }
   return 0;
 }
 
@@ -3064,5 +3402,25 @@ void __cls_init()
 			  CLS_METHOD_RD | CLS_METHOD_WR,
 			  old_snapshot_rename, &h_old_snapshot_rename);
 
+  /* methods for the rbd_pool_settings object */
+  cls_register_cxx_method(h_class, "mirror_is_enabled", CLS_METHOD_RD,
+                          mirror_is_enabled, &h_mirror_is_enabled);
+  cls_register_cxx_method(h_class, "mirror_set_enabled",
+                          CLS_METHOD_RD | CLS_METHOD_WR,
+                          mirror_set_enabled, &h_mirror_set_enabled);
+  cls_register_cxx_method(h_class, "mirror_peer_list", CLS_METHOD_RD,
+                          mirror_peer_list, &h_mirror_peer_list);
+  cls_register_cxx_method(h_class, "mirror_peer_add",
+                          CLS_METHOD_RD | CLS_METHOD_WR,
+                          mirror_peer_add, &h_mirror_peer_add);
+  cls_register_cxx_method(h_class, "mirror_peer_remove",
+                          CLS_METHOD_RD | CLS_METHOD_WR,
+                          mirror_peer_remove, &h_mirror_peer_remove);
+  cls_register_cxx_method(h_class, "mirror_peer_set_client",
+                          CLS_METHOD_RD | CLS_METHOD_WR,
+                          mirror_peer_set_client, &h_mirror_peer_set_client);
+  cls_register_cxx_method(h_class, "mirror_peer_set_cluster",
+                          CLS_METHOD_RD | CLS_METHOD_WR,
+                          mirror_peer_set_cluster, &h_mirror_peer_set_cluster);
   return;
 }
diff --git a/src/cls/rbd/cls_rbd.h b/src/cls/rbd/cls_rbd.h
index 5f79d5a..710b542 100644
--- a/src/cls/rbd/cls_rbd.h
+++ b/src/cls/rbd/cls_rbd.h
@@ -4,7 +4,7 @@
 #define __CEPH_CLS_RBD_H
 
 #include "include/types.h"
-#include "include/buffer.h"
+#include "include/buffer_fwd.h"
 #include "common/Formatter.h"
 #include "librbd/parent_types.h"
 
diff --git a/src/cls/rbd/cls_rbd_client.cc b/src/cls/rbd/cls_rbd_client.cc
index eec813d..d8199e5 100644
--- a/src/cls/rbd/cls_rbd_client.cc
+++ b/src/cls/rbd/cls_rbd_client.cc
@@ -1,61 +1,88 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab
 
+#include "cls/rbd/cls_rbd_client.h"
 #include "cls/lock/cls_lock_client.h"
 #include "include/buffer.h"
+#include "include/Context.h"
 #include "include/encoding.h"
 #include "include/rbd_types.h"
-
-#include "cls_rbd_client.h"
+#include "common/Cond.h"
 
 #include <errno.h>
 
 namespace librbd {
   namespace cls_client {
-    int get_immutable_metadata(librados::IoCtx *ioctx, const std::string &oid,
-			       std::string *object_prefix, uint8_t *order)
-    {
-      assert(object_prefix);
-      assert(order);
 
-      librados::ObjectReadOperation op;
-      bufferlist bl, empty;
+    void get_immutable_metadata_start(librados::ObjectReadOperation *op) {
+      bufferlist bl, empty_bl;
       snapid_t snap = CEPH_NOSNAP;
       ::encode(snap, bl);
-      op.exec("rbd", "get_size", bl);
-      op.exec("rbd", "get_object_prefix", empty);
-      
-
-      bufferlist outbl;
-      int r = ioctx->operate(oid, &op, &outbl);
-      if (r < 0)
-	return r;
+      op->exec("rbd", "get_size", bl);
+      op->exec("rbd", "get_object_prefix", empty_bl);
+    }
 
+    int get_immutable_metadata_finish(bufferlist::iterator *it,
+                                      std::string *object_prefix,
+                                      uint8_t *order) {
       try {
-	bufferlist::iterator iter = outbl.begin();
 	uint64_t size;
 	// get_size
-	::decode(*order, iter);
-	::decode(size, iter);
+	::decode(*order, *it);
+	::decode(size, *it);
 	// get_object_prefix
-	::decode(*object_prefix, iter);
+	::decode(*object_prefix, *it);
       } catch (const buffer::error &err) {
 	return -EBADMSG;
       }
-
       return 0;
+
     }
 
-    int get_mutable_metadata(librados::IoCtx *ioctx, const std::string &oid,
-			     bool read_only, uint64_t *size, uint64_t *features,
-			     uint64_t *incompatible_features,
-			     map<rados::cls::lock::locker_id_t,
-				 rados::cls::lock::locker_info_t> *lockers,
-                             bool *exclusive_lock,
-			     string *lock_tag,
-			     ::SnapContext *snapc,
-			     parent_info *parent)
+    int get_immutable_metadata(librados::IoCtx *ioctx, const std::string &oid,
+			       std::string *object_prefix, uint8_t *order)
     {
+      librados::ObjectReadOperation op;
+      get_immutable_metadata_start(&op);
+
+      bufferlist out_bl;
+      int r = ioctx->operate(oid, &op, &out_bl);
+      if (r < 0) {
+        return r;
+      }
+
+      bufferlist::iterator it = out_bl.begin();
+      return get_immutable_metadata_finish(&it, object_prefix, order);
+    }
+
+    void get_mutable_metadata_start(librados::ObjectReadOperation *op,
+                                    bool read_only) {
+      snapid_t snap = CEPH_NOSNAP;
+      bufferlist size_bl;
+      ::encode(snap, size_bl);
+      op->exec("rbd", "get_size", size_bl);
+
+      bufferlist features_bl;
+      ::encode(snap, features_bl);
+      ::encode(read_only, features_bl);
+      op->exec("rbd", "get_features", features_bl);
+
+      bufferlist empty_bl;
+      op->exec("rbd", "get_snapcontext", empty_bl);
+
+      bufferlist parent_bl;
+      ::encode(snap, parent_bl);
+      op->exec("rbd", "get_parent", parent_bl);
+      rados::cls::lock::get_lock_info_start(op, RBD_LOCK_NAME);
+    }
+
+    int get_mutable_metadata_finish(bufferlist::iterator *it,
+                                    uint64_t *size, uint64_t *features,
+                                    uint64_t *incompatible_features,
+                                    std::map<rados::cls::lock::locker_id_t,
+                                             rados::cls::lock::locker_info_t> *lockers,
+                                    bool *exclusive_lock, std::string *lock_tag,
+                                    ::SnapContext *snapc, parent_info *parent) {
       assert(size);
       assert(features);
       assert(incompatible_features);
@@ -64,63 +91,64 @@ namespace librbd {
       assert(snapc);
       assert(parent);
 
-      librados::ObjectReadOperation op;
-      bufferlist sizebl, featuresbl, parentbl, empty;
-      snapid_t snap = CEPH_NOSNAP;
-      ::encode(snap, sizebl);
-      op.exec("rbd", "get_size", sizebl);
-
-      ::encode(snap, featuresbl);
-      ::encode(read_only, featuresbl);
-      op.exec("rbd", "get_features", featuresbl);
-
-      op.exec("rbd", "get_snapcontext", empty);
-
-      ::encode(snap, parentbl);
-      op.exec("rbd", "get_parent", parentbl);
-      rados::cls::lock::get_lock_info_start(&op, RBD_LOCK_NAME);
-
-      bufferlist outbl;
-      int r = ioctx->operate(oid, &op, &outbl);
-      if (r < 0)
-	return r;
-
       try {
-	bufferlist::iterator iter = outbl.begin();
 	uint8_t order;
 	// get_size
-	::decode(order, iter);
-	::decode(*size, iter);
+	::decode(order, *it);
+	::decode(*size, *it);
 	// get_features
-	::decode(*features, iter);
-	::decode(*incompatible_features, iter);
+	::decode(*features, *it);
+	::decode(*incompatible_features, *it);
 	// get_snapcontext
-	::decode(*snapc, iter);
+	::decode(*snapc, *it);
 	// get_parent
-	::decode(parent->spec.pool_id, iter);
-	::decode(parent->spec.image_id, iter);
-	::decode(parent->spec.snap_id, iter);
-	::decode(parent->overlap, iter);
+	::decode(parent->spec.pool_id, *it);
+	::decode(parent->spec.image_id, *it);
+	::decode(parent->spec.snap_id, *it);
+	::decode(parent->overlap, *it);
 
 	// get_lock_info
 	ClsLockType lock_type = LOCK_NONE;
-	r = rados::cls::lock::get_lock_info_finish(&iter, lockers, &lock_type,
-						   lock_tag);
-
-	// see comment in ictx_refresh().  Ugly conflation of
-	// EOPNOTSUPP and EIO.
-
-	if (r < 0 && ((r != -EOPNOTSUPP) && (r != -EIO)))
-	  return r;
-
-	*exclusive_lock = (lock_type == LOCK_EXCLUSIVE);
+	int r = rados::cls::lock::get_lock_info_finish(it, lockers, &lock_type,
+						       lock_tag);
+        if (r == -EOPNOTSUPP) {
+          r = 0;
+        }
+        if (r == 0) {
+	  *exclusive_lock = (lock_type == LOCK_EXCLUSIVE);
+        }
       } catch (const buffer::error &err) {
 	return -EBADMSG;
       }
-
       return 0;
     }
 
+    int get_mutable_metadata(librados::IoCtx *ioctx, const std::string &oid,
+			     bool read_only, uint64_t *size, uint64_t *features,
+			     uint64_t *incompatible_features,
+			     map<rados::cls::lock::locker_id_t,
+				 rados::cls::lock::locker_info_t> *lockers,
+                             bool *exclusive_lock,
+			     string *lock_tag,
+			     ::SnapContext *snapc,
+			     parent_info *parent)
+    {
+      librados::ObjectReadOperation op;
+      get_mutable_metadata_start(&op, read_only);
+
+      bufferlist out_bl;
+      int r = ioctx->operate(oid, &op, &out_bl);
+      if (r < 0) {
+        return r;
+      }
+
+      bufferlist::iterator it = out_bl.begin();
+      return get_mutable_metadata_finish(&it, size, features,
+                                         incompatible_features, lockers,
+                                         exclusive_lock, lock_tag, snapc,
+                                         parent);
+    }
+
     int create_image(librados::IoCtx *ioctx, const std::string &oid,
 		     uint64_t size, uint8_t order, uint64_t features,
 		     const std::string &object_prefix)
@@ -256,35 +284,28 @@ namespace librbd {
       return ioctx->exec(oid, "rbd", "set_parent", inbl, outbl);
     }
 
-    int get_flags(librados::IoCtx *ioctx, const std::string &oid,
-		  uint64_t *flags, const std::vector<snapid_t> &snap_ids,
-		  vector<uint64_t> *snap_flags)
-    {
-      bufferlist inbl;
-      ::encode(static_cast<snapid_t>(CEPH_NOSNAP), inbl);
+    void get_flags_start(librados::ObjectReadOperation *op,
+                         const std::vector<snapid_t> &snap_ids) {
+      bufferlist in_bl;
+      ::encode(static_cast<snapid_t>(CEPH_NOSNAP), in_bl);
 
-      librados::ObjectReadOperation op;
-      op.exec("rbd", "get_flags", inbl);
+      op->exec("rbd", "get_flags", in_bl);
       for (size_t i = 0; i < snap_ids.size(); ++i) {
-	bufferlist snapbl;
-	::encode(snap_ids[i], snapbl);
-	op.exec("rbd", "get_flags", snapbl);
+        bufferlist snap_bl;
+        ::encode(snap_ids[i], snap_bl);
+        op->exec("rbd", "get_flags", snap_bl);
       }
 
-      snap_flags->clear();
-      snap_flags->resize(snap_ids.size());
-
-      bufferlist outbl;
-      int r = ioctx->operate(oid, &op, &outbl);
-      if (r < 0) {
-        return r;
-      }
+    }
 
+    int get_flags_finish(bufferlist::iterator *it, uint64_t *flags,
+                         const std::vector<snapid_t> &snap_ids,
+                         std::vector<uint64_t> *snap_flags) {
+      snap_flags->resize(snap_ids.size());
       try {
-        bufferlist::iterator iter = outbl.begin();
-        ::decode(*flags, iter);
-	for (size_t i = 0; i < snap_ids.size(); ++i) {
-	  ::decode((*snap_flags)[i], iter);
+        ::decode(*flags, *it);
+	for (size_t i = 0; i < snap_flags->size(); ++i) {
+	  ::decode((*snap_flags)[i], *it);
 	}
       } catch (const buffer::error &err) {
         return -EBADMSG;
@@ -292,6 +313,23 @@ namespace librbd {
       return 0;
     }
 
+    int get_flags(librados::IoCtx *ioctx, const std::string &oid,
+		  uint64_t *flags, const std::vector<snapid_t> &snap_ids,
+		  vector<uint64_t> *snap_flags)
+    {
+      librados::ObjectReadOperation op;
+      get_flags_start(&op, snap_ids);
+
+      bufferlist out_bl;
+      int r = ioctx->operate(oid, &op, &out_bl);
+      if (r < 0) {
+        return r;
+      }
+
+      bufferlist::iterator it = out_bl.begin();
+      return get_flags_finish(&it, flags, snap_ids, snap_flags);
+    }
+
     void set_flags(librados::ObjectWriteOperation *op, snapid_t snap_id,
                    uint64_t flags, uint64_t mask)
     {
@@ -346,32 +384,39 @@ namespace librbd {
       return ioctx->operate(oid, &op);
     }
 
-    int get_children(librados::IoCtx *ioctx, const std::string &oid,
-		     parent_spec pspec, set<string>& children)
-    {
-      bufferlist in, out;
-      ::encode(pspec.pool_id, in);
-      ::encode(pspec.image_id, in);
-      ::encode(pspec.snap_id, in);
+    void get_children_start(librados::ObjectReadOperation *op,
+                            const parent_spec &pspec) {
+      bufferlist in_bl;
+      ::encode(pspec.pool_id, in_bl);
+      ::encode(pspec.image_id, in_bl);
+      ::encode(pspec.snap_id, in_bl);
+      op->exec("rbd", "get_children", in_bl);
+    }
 
-      int r = ioctx->exec(oid, "rbd", "get_children", in, out);
-      if (r < 0)
-	return r;
-      bufferlist::iterator it = out.begin();
+    int get_children_finish(bufferlist::iterator *it,
+                            std::set<std::string>* children) {
       try {
-	::decode(children, it);
+        ::decode(*children, *it);
       } catch (const buffer::error &err) {
-	return -EBADMSG;
+        return -EBADMSG;
       }
       return 0;
     }
 
-    int snapshot_add(librados::IoCtx *ioctx, const std::string &oid,
-		     snapid_t snap_id, const std::string &snap_name)
+    int get_children(librados::IoCtx *ioctx, const std::string &oid,
+		     parent_spec pspec, set<string>& children)
     {
-      librados::ObjectWriteOperation op;
-      snapshot_add(&op, snap_id, snap_name);
-      return ioctx->operate(oid, &op);
+      librados::ObjectReadOperation op;
+      get_children_start(&op, pspec);
+
+      bufferlist out_bl;
+      int r = ioctx->operate(oid, &op, &out_bl);
+      if (r < 0) {
+        return r;
+      }
+
+      bufferlist::iterator it = out_bl.begin();
+      return get_children_finish(&it, &children);
     }
 
     void snapshot_add(librados::ObjectWriteOperation *op, snapid_t snap_id,
@@ -383,23 +428,13 @@ namespace librbd {
       op->exec("rbd", "snapshot_add", bl);
     }
 
-    int snapshot_remove(librados::IoCtx *ioctx, const std::string &oid,
-			snapid_t snap_id)
+    void snapshot_remove(librados::ObjectWriteOperation *op, snapid_t snap_id)
     {
-      bufferlist bl, bl2;
+      bufferlist bl;
       ::encode(snap_id, bl);
-
-      return ioctx->exec(oid, "rbd", "snapshot_remove", bl, bl2);
+      op->exec("rbd", "snapshot_remove", bl);
     }
 
-    int snapshot_rename(librados::IoCtx *ioctx, const std::string &oid,
-			 snapid_t src_snap_id,
-		         const std::string &dst_name)
-    {
-      librados::ObjectWriteOperation op;
-      snapshot_rename(&op, src_snap_id, dst_name);
-      return ioctx->operate(oid, &op);
-    }
     void snapshot_rename(librados::ObjectWriteOperation *op,
 			 snapid_t src_snap_id,
 		         const std::string &dst_name)
@@ -409,6 +444,7 @@ namespace librbd {
       ::encode(dst_name, bl);
       op->exec("rbd", "snapshot_rename", bl);
     }
+
     int get_snapcontext(librados::IoCtx *ioctx, const std::string &oid,
 			::SnapContext *snapc)
     {
@@ -431,127 +467,148 @@ namespace librbd {
       return 0;
     }
 
-    int snapshot_list(librados::IoCtx *ioctx, const std::string &oid,
-		      const std::vector<snapid_t> &ids,
-		      std::vector<string> *names,
-		      std::vector<uint64_t> *sizes,
-		      std::vector<parent_info> *parents,
-		      std::vector<uint8_t> *protection_statuses)
-    {
-      names->clear();
+    void snapshot_list_start(librados::ObjectReadOperation *op,
+                             const std::vector<snapid_t> &ids) {
+      for (vector<snapid_t>::const_iterator it = ids.begin();
+           it != ids.end(); ++it) {
+        snapid_t snap_id = it->val;
+        bufferlist bl1, bl2, bl3, bl4;
+        ::encode(snap_id, bl1);
+        op->exec("rbd", "get_snapshot_name", bl1);
+        ::encode(snap_id, bl2);
+        op->exec("rbd", "get_size", bl2);
+        ::encode(snap_id, bl3);
+        op->exec("rbd", "get_parent", bl3);
+        ::encode(snap_id, bl4);
+        op->exec("rbd", "get_protection_status", bl4);
+      }
+    }
+
+    int snapshot_list_finish(bufferlist::iterator *it,
+                             const std::vector<snapid_t> &ids,
+                             std::vector<string> *names,
+                             std::vector<uint64_t> *sizes,
+                             std::vector<parent_info> *parents,
+                             std::vector<uint8_t> *protection_statuses) {
       names->resize(ids.size());
-      sizes->clear();
       sizes->resize(ids.size());
-      parents->clear();
       parents->resize(ids.size());
-      protection_statuses->clear();
       protection_statuses->resize(ids.size());
-
-      librados::ObjectReadOperation op;
-      for (vector<snapid_t>::const_iterator it = ids.begin();
-	   it != ids.end(); ++it) {
-	snapid_t snap_id = it->val;
-	bufferlist bl1, bl2, bl3, bl4;
-	::encode(snap_id, bl1);
-	op.exec("rbd", "get_snapshot_name", bl1);
-	::encode(snap_id, bl2);
-	op.exec("rbd", "get_size", bl2);
-	::encode(snap_id, bl3);
-	op.exec("rbd", "get_parent", bl3);
-	::encode(snap_id, bl4);
-	op.exec("rbd", "get_protection_status", bl4);
-      }
-
-      bufferlist outbl;
-      int r = ioctx->operate(oid, &op, &outbl);
-      if (r < 0)
-	return r;
-
       try {
-	bufferlist::iterator iter = outbl.begin();
-	for (size_t i = 0; i < ids.size(); ++i) {
+	for (size_t i = 0; i < names->size(); ++i) {
 	  uint8_t order;
 	  // get_snapshot_name
-	  ::decode((*names)[i], iter);
+	  ::decode((*names)[i], *it);
 	  // get_size
-	  ::decode(order, iter);
-	  ::decode((*sizes)[i], iter);
+	  ::decode(order, *it);
+	  ::decode((*sizes)[i], *it);
 	  // get_parent
-	  ::decode((*parents)[i].spec.pool_id, iter);
-	  ::decode((*parents)[i].spec.image_id, iter);
-	  ::decode((*parents)[i].spec.snap_id, iter);
-	  ::decode((*parents)[i].overlap, iter);
+	  ::decode((*parents)[i].spec.pool_id, *it);
+	  ::decode((*parents)[i].spec.image_id, *it);
+	  ::decode((*parents)[i].spec.snap_id, *it);
+	  ::decode((*parents)[i].overlap, *it);
 	  // get_protection_status
-	  ::decode((*protection_statuses)[i], iter);
+	  ::decode((*protection_statuses)[i], *it);
 	}
       } catch (const buffer::error &err) {
-	return -EBADMSG;
+        return -EBADMSG;
       }
-
       return 0;
     }
 
-    int old_snapshot_add(librados::IoCtx *ioctx, const std::string &oid,
-			 snapid_t snap_id, const std::string &snap_name)
+    int snapshot_list(librados::IoCtx *ioctx, const std::string &oid,
+		      const std::vector<snapid_t> &ids,
+		      std::vector<string> *names,
+		      std::vector<uint64_t> *sizes,
+		      std::vector<parent_info> *parents,
+		      std::vector<uint8_t> *protection_statuses)
     {
-      bufferlist bl, bl2;
+      librados::ObjectReadOperation op;
+      snapshot_list_start(&op, ids);
+
+      bufferlist out_bl;
+      int r = ioctx->operate(oid, &op, &out_bl);
+      if (r < 0) {
+        return r;
+      }
+
+      bufferlist::iterator it = out_bl.begin();
+      return snapshot_list_finish(&it, ids, names, sizes, parents,
+                                  protection_statuses);
+    }
+
+    void old_snapshot_add(librados::ObjectWriteOperation *op,
+			  snapid_t snap_id, const std::string &snap_name)
+    {
+      bufferlist bl;
       ::encode(snap_name, bl);
       ::encode(snap_id, bl);
+      op->exec("rbd", "snap_add", bl);
+    }
 
-      return ioctx->exec(oid, "rbd", "snap_add", bl, bl2);
+    void old_snapshot_remove(librados::ObjectWriteOperation *op,
+			     const std::string &snap_name)
+    {
+      bufferlist bl;
+      ::encode(snap_name, bl);
+      op->exec("rbd", "snap_remove", bl);
     }
 
-    int old_snapshot_rename(librados::IoCtx *ioctx, const std::string &oid,
-			    snapid_t src_snap_id ,
-			    const std::string &dst_name)
+    void old_snapshot_rename(librados::ObjectWriteOperation *op,
+			     snapid_t src_snap_id, const std::string &dst_name)
     {
-      bufferlist bl, bl2;
+      bufferlist bl;
       ::encode(src_snap_id, bl);
       ::encode(dst_name, bl);
-
-      return ioctx->exec(oid, "rbd", "snap_rename", bl, bl2);
+      op->exec("rbd", "snap_rename", bl);
     }
-    int old_snapshot_remove(librados::IoCtx *ioctx, const std::string &oid,
-			    const std::string &snap_name)
-    {
-      bufferlist bl, bl2;
-      ::encode(snap_name, bl);
 
-      return ioctx->exec(oid, "rbd", "snap_remove", bl, bl2);
+    void old_snapshot_list_start(librados::ObjectReadOperation *op) {
+      bufferlist in_bl;
+      op->exec("rbd", "snap_list", in_bl);
     }
 
-    int old_snapshot_list(librados::IoCtx *ioctx, const std::string &oid,
-			  std::vector<string> *names,
-			  std::vector<uint64_t> *sizes,
-			  ::SnapContext *snapc)
-    {
-      bufferlist bl, outbl;
-      int r = ioctx->exec(oid, "rbd", "snap_list", bl, outbl);
-      if (r < 0)
-	return r;
-
-      bufferlist::iterator iter = outbl.begin();
+    int old_snapshot_list_finish(bufferlist::iterator *it,
+                                 std::vector<string> *names,
+                                 std::vector<uint64_t> *sizes,
+                                 ::SnapContext *snapc) {
       try {
 	uint32_t num_snaps;
-	::decode(snapc->seq, iter);
-	::decode(num_snaps, iter);
+	::decode(snapc->seq, *it);
+	::decode(num_snaps, *it);
 
 	names->resize(num_snaps);
 	sizes->resize(num_snaps);
 	snapc->snaps.resize(num_snaps);
-
 	for (uint32_t i = 0; i < num_snaps; ++i) {
-	  ::decode(snapc->snaps[i], iter);
-	  ::decode((*sizes)[i], iter);
-	  ::decode((*names)[i], iter);
+	  ::decode(snapc->snaps[i], *it);
+	  ::decode((*sizes)[i], *it);
+	  ::decode((*names)[i], *it);
 	}
       } catch (const buffer::error &err) {
-	return -EBADMSG;
+        return -EBADMSG;
       }
-
       return 0;
     }
 
+    int old_snapshot_list(librados::IoCtx *ioctx, const std::string &oid,
+			  std::vector<string> *names,
+			  std::vector<uint64_t> *sizes,
+			  ::SnapContext *snapc)
+    {
+      librados::ObjectReadOperation op;
+      old_snapshot_list_start(&op);
+
+      bufferlist out_bl;
+      int r = ioctx->operate(oid, &op, &out_bl);
+      if (r < 0) {
+        return r;
+      }
+
+      bufferlist::iterator it = out_bl.begin();
+      return old_snapshot_list_finish(&it, names, sizes, snapc);
+    }
+
     int copyup(librados::IoCtx *ioctx, const std::string &oid,
 	       bufferlist data) {
       bufferlist out;
@@ -581,38 +638,57 @@ namespace librbd {
     int set_protection_status(librados::IoCtx *ioctx, const std::string &oid,
 			      snapid_t snap_id, uint8_t protection_status)
     {
-      bufferlist in, out;
+      // TODO remove
+      librados::ObjectWriteOperation op;
+      set_protection_status(&op, snap_id, protection_status);
+      return ioctx->operate(oid, &op);
+    }
+
+    void set_protection_status(librados::ObjectWriteOperation *op,
+                               snapid_t snap_id, uint8_t protection_status)
+    {
+      bufferlist in;
       ::encode(snap_id, in);
       ::encode(protection_status, in);
-      return ioctx->exec(oid, "rbd", "set_protection_status", in, out);
+      op->exec("rbd", "set_protection_status", in);
     }
 
-    int get_stripe_unit_count(librados::IoCtx *ioctx, const std::string &oid,
-			      uint64_t *stripe_unit, uint64_t *stripe_count)
-    {
+    void get_stripe_unit_count_start(librados::ObjectReadOperation *op) {  // adds the "rbd"/"get_stripe_unit_count" class call to *op
+      bufferlist empty_bl;  // left empty: no input payload is encoded
+      op->exec("rbd", "get_stripe_unit_count", empty_bl);
+    }
+
+    int get_stripe_unit_count_finish(bufferlist::iterator *it,
+                                     uint64_t *stripe_unit,
+                                     uint64_t *stripe_count) {
       assert(stripe_unit);
       assert(stripe_count);
 
-      librados::ObjectReadOperation op;
-      bufferlist empty;
-      op.exec("rbd", "get_stripe_unit_count", empty);
-
-      bufferlist outbl;
-      int r = ioctx->operate(oid, &op, &outbl);
-      if (r < 0)
-	return r;
-
       try {
-	bufferlist::iterator iter = outbl.begin();
-	::decode(*stripe_unit, iter);
-	::decode(*stripe_count, iter);
+	::decode(*stripe_unit, *it);
+	::decode(*stripe_count, *it);
       } catch (const buffer::error &err) {
 	return -EBADMSG;
       }
-
       return 0;
     }
 
+    int get_stripe_unit_count(librados::IoCtx *ioctx, const std::string &oid,
+			      uint64_t *stripe_unit, uint64_t *stripe_count)
+    {  // wrapper: get_stripe_unit_count_start + operate + get_stripe_unit_count_finish
+      librados::ObjectReadOperation op;
+      get_stripe_unit_count_start(&op);
+
+      bufferlist out_bl;
+      int r = ioctx->operate(oid, &op, &out_bl);
+      if (r < 0) {
+        return r;  // operate() failed: the output pointers are not touched
+      }
+
+      bufferlist::iterator it = out_bl.begin();
+      return get_stripe_unit_count_finish(&it, stripe_unit, stripe_count);  // 0 or -EBADMSG
+    }
+
     int set_stripe_unit_count(librados::IoCtx *ioctx, const std::string &oid,
 			      uint64_t stripe_unit, uint64_t stripe_count)
     {
@@ -625,23 +701,35 @@ namespace librbd {
 
     /************************ rbd_id object methods ************************/
 
-    int get_id(librados::IoCtx *ioctx, const std::string &oid, std::string *id)
-    {
-      bufferlist in, out;
-      int r = ioctx->exec(oid, "rbd", "get_id", in, out);
-      if (r < 0)
-	return r;
+    void get_id_start(librados::ObjectReadOperation *op) {  // adds the "rbd"/"get_id" class call to *op
+      bufferlist empty_bl;  // left empty: no input payload is encoded
+      op->exec("rbd", "get_id", empty_bl);
+    }
 
-      bufferlist::iterator iter = out.begin();
+    int get_id_finish(bufferlist::iterator *it, std::string *id) {
       try {
-	::decode(*id, iter);
+	::decode(*id, *it);
       } catch (const buffer::error &err) {
 	return -EBADMSG;
       }
-
       return 0;
     }
 
+    int get_id(librados::IoCtx *ioctx, const std::string &oid, std::string *id)
+    {  // wrapper: get_id_start + operate + get_id_finish
+      librados::ObjectReadOperation op;
+      get_id_start(&op);
+
+      bufferlist out_bl;
+      int r = ioctx->operate(oid, &op, &out_bl);
+      if (r < 0) {
+        return r;  // operate() failed: *id is not touched
+      }
+
+      bufferlist::iterator it = out_bl.begin();
+      return get_id_finish(&it, id);  // 0 or -EBADMSG
+    }
+
     int set_id(librados::IoCtx *ioctx, const std::string &oid, std::string id)
     {
       bufferlist in, out;
@@ -728,34 +816,46 @@ namespace librbd {
       return ioctx->exec(oid, "rbd", "dir_remove_image", in, out);
     }
 
-    int dir_rename_image(librados::IoCtx *ioctx, const std::string &oid,
+    void dir_rename_image(librados::ObjectWriteOperation *op,
 			 const std::string &src, const std::string &dest,
 			 const std::string &id)
     {
-      bufferlist in, out;
+      bufferlist in;
       ::encode(src, in);
       ::encode(dest, in);
       ::encode(id, in);
-      return ioctx->exec(oid, "rbd", "dir_rename_image", in, out);
+      op->exec("rbd", "dir_rename_image", in);
+    }
+
+    void object_map_load_start(librados::ObjectReadOperation *op) {  // adds the "rbd"/"object_map_load" class call to *op
+      bufferlist in_bl;  // left empty: no input payload is encoded
+      op->exec("rbd", "object_map_load", in_bl);
+    }
+
+    int object_map_load_finish(bufferlist::iterator *it,
+                               ceph::BitVector<2> *object_map) {  // decodes one BitVector<2> from *it into *object_map
+      try {
+        ::decode(*object_map, *it);
+      } catch (const buffer::error &err) {
+        return -EBADMSG;  // decode threw buffer::error
+      }
+      return 0;
     }
 
     int object_map_load(librados::IoCtx *ioctx, const std::string &oid,
 			ceph::BitVector<2> *object_map)
     {
-      bufferlist in;
-      bufferlist out;
-      int r = ioctx->exec(oid, "rbd", "object_map_load", in, out);
+      librados::ObjectReadOperation op;
+      object_map_load_start(&op);
+
+      bufferlist out_bl;
+      int r = ioctx->operate(oid, &op, &out_bl);
       if (r < 0) {
 	return r;
       }
 
-      try {
-        bufferlist::iterator iter = out.begin();
-        ::decode(*object_map, iter);
-      } catch (const buffer::error &err) {
-        return -EBADMSG;
-      }
-      return 0;
+      bufferlist::iterator it = out_bl.begin();
+      return object_map_load_finish(&it, object_map);
     }
 
     void object_map_save(librados::ObjectWriteOperation *rados_op,
@@ -868,5 +968,121 @@ namespace librbd {
       return 0;
     }
 
+    int mirror_is_enabled(librados::IoCtx *ioctx, bool *enabled) {  // returns 0 with *enabled set, or a negative errno
+      bufferlist in_bl;  // left empty: no input payload is encoded
+      bufferlist out_bl;
+      int r = ioctx->exec(RBD_POOL_SETTINGS, "rbd", "mirror_is_enabled", in_bl,
+                          out_bl);
+      if (r == -ENOENT) {  // -ENOENT is reported to the caller as "disabled", not as an error
+        *enabled = false;
+        return 0;
+      } else if (r < 0) {
+        return r;  // any other failure is passed through; *enabled is not written
+      }
+
+      try {
+        bufferlist::iterator bl_it = out_bl.begin();
+        ::decode(*enabled, bl_it);
+      } catch (const buffer::error &err) {
+        return -EBADMSG;  // decode threw buffer::error
+      }
+      return 0;
+    }
+
+    int mirror_set_enabled(librados::IoCtx *ioctx, bool enabled) {  // calls "rbd"/"mirror_set_enabled" on RBD_POOL_SETTINGS
+      bufferlist in_bl;
+      ::encode(enabled, in_bl);  // request payload is the encoded flag
+
+      bufferlist out_bl;  // passed to exec(); its contents are never read here
+      int r = ioctx->exec(RBD_POOL_SETTINGS, "rbd", "mirror_set_enabled", in_bl,
+                          out_bl);
+      if (r < 0) {
+        return r;
+      }
+      return 0;  // any non-negative result from exec() is reported as 0
+    }
+
+    int mirror_peer_list(librados::IoCtx *ioctx,
+                         std::vector<cls::rbd::MirrorPeer> *peers) {  // returns 0 with *peers filled, or a negative errno
+      bufferlist in_bl;  // left empty: no input payload is encoded
+      bufferlist out_bl;
+      int r = ioctx->exec(RBD_POOL_SETTINGS, "rbd", "mirror_peer_list", in_bl,
+                          out_bl);
+      if (r < 0) {
+        return r;  // exec() failed: *peers is not touched
+      }
+
+      peers->clear();  // only reached after exec() succeeded
+      try {
+        bufferlist::iterator bl_it = out_bl.begin();
+        ::decode(*peers, bl_it);
+      } catch (const buffer::error &err) {
+        return -EBADMSG;  // decode threw buffer::error
+      }
+      return 0;
+    }
+
+    int mirror_peer_add(librados::IoCtx *ioctx, const std::string &cluster_uuid,
+                        const std::string &cluster_name,
+                        const std::string &client_name) {  // calls "rbd"/"mirror_peer_add" on RBD_POOL_SETTINGS
+      cls::rbd::MirrorPeer peer(cluster_uuid, cluster_name, client_name);
+      bufferlist in_bl;
+      ::encode(peer, in_bl);  // request payload is the encoded MirrorPeer
+
+      bufferlist out_bl;  // passed to exec(); its contents are never read here
+      int r = ioctx->exec(RBD_POOL_SETTINGS, "rbd", "mirror_peer_add", in_bl,
+                          out_bl);
+      if (r < 0) {
+        return r;
+      }
+      return 0;  // any non-negative result from exec() is reported as 0
+    }
+
+    int mirror_peer_remove(librados::IoCtx *ioctx,
+                           const std::string &cluster_uuid) {  // calls "rbd"/"mirror_peer_remove" on RBD_POOL_SETTINGS
+      bufferlist in_bl;
+      ::encode(cluster_uuid, in_bl);  // request payload is the encoded uuid string
+
+      bufferlist out_bl;  // passed to exec(); its contents are never read here
+      int r = ioctx->exec(RBD_POOL_SETTINGS, "rbd", "mirror_peer_remove", in_bl,
+                          out_bl);
+      if (r < 0) {
+        return r;
+      }
+      return 0;  // any non-negative result from exec() is reported as 0
+    }
+
+    int mirror_peer_set_client(librados::IoCtx *ioctx,
+                               const std::string &cluster_uuid,
+                               const std::string &client_name) {  // calls "rbd"/"mirror_peer_set_client" on RBD_POOL_SETTINGS
+      bufferlist in_bl;
+      ::encode(cluster_uuid, in_bl);  // payload order: cluster_uuid first,
+      ::encode(client_name, in_bl);  // then client_name
+
+      bufferlist out_bl;  // passed to exec(); its contents are never read here
+      int r = ioctx->exec(RBD_POOL_SETTINGS, "rbd", "mirror_peer_set_client",
+                          in_bl, out_bl);
+      if (r < 0) {
+        return r;
+      }
+      return 0;  // any non-negative result from exec() is reported as 0
+    }
+
+    int mirror_peer_set_cluster(librados::IoCtx *ioctx,
+                                const std::string &cluster_uuid,
+                                const std::string &cluster_name) {  // calls "rbd"/"mirror_peer_set_cluster" on RBD_POOL_SETTINGS
+      bufferlist in_bl;
+      ::encode(cluster_uuid, in_bl);  // payload order: cluster_uuid first,
+      ::encode(cluster_name, in_bl);  // then cluster_name
+
+      bufferlist out_bl;  // passed to exec(); its contents are never read here
+      int r = ioctx->exec(RBD_POOL_SETTINGS, "rbd", "mirror_peer_set_cluster",
+                          in_bl, out_bl);
+      if (r < 0) {
+        return r;
+      }
+      return 0;  // any non-negative result from exec() is reported as 0
+    }
+
   } // namespace cls_client
 } // namespace librbd
diff --git a/src/cls/rbd/cls_rbd_client.h b/src/cls/rbd/cls_rbd_client.h
index 6235fea..928158a 100644
--- a/src/cls/rbd/cls_rbd_client.h
+++ b/src/cls/rbd/cls_rbd_client.h
@@ -5,6 +5,7 @@
 #define CEPH_LIBRBD_CLS_RBD_CLIENT_H
 
 #include "cls/lock/cls_lock_types.h"
+#include "cls/rbd/cls_rbd_types.h"
 #include "common/bit_vector.hpp"
 #include "common/snap_types.h"
 #include "include/rados/librados.hpp"
@@ -14,11 +15,27 @@
 #include <string>
 #include <vector>
 
+class Context;
+
 namespace librbd {
   namespace cls_client {
     // high-level interface to the header
+    void get_immutable_metadata_start(librados::ObjectReadOperation *op);
+    int get_immutable_metadata_finish(bufferlist::iterator *it,
+                                      std::string *object_prefix,
+                                      uint8_t *order);
     int get_immutable_metadata(librados::IoCtx *ioctx, const std::string &oid,
 			       std::string *object_prefix, uint8_t *order);
+
+    void get_mutable_metadata_start(librados::ObjectReadOperation *op,
+                                    bool read_only);
+    int get_mutable_metadata_finish(bufferlist::iterator *it,
+                                    uint64_t *size, uint64_t *features,
+                                    uint64_t *incompatible_features,
+                                    std::map<rados::cls::lock::locker_id_t,
+                                             rados::cls::lock::locker_info_t> *lockers,
+                                    bool *exclusive_lock, std::string *lock_tag,
+                                    ::SnapContext *snapc, parent_info *parent);
     int get_mutable_metadata(librados::IoCtx *ioctx, const std::string &oid,
 			     bool read_only, uint64_t *size, uint64_t *features,
 			     uint64_t *incompatible_features,
@@ -49,6 +66,11 @@ namespace librbd {
 		   uint64_t *parent_overlap);
     int set_parent(librados::IoCtx *ioctx, const std::string &oid,
 		   parent_spec pspec, uint64_t parent_overlap);
+    void get_flags_start(librados::ObjectReadOperation *op,
+                         const std::vector<snapid_t> &snap_ids);
+    int get_flags_finish(bufferlist::iterator *it, uint64_t *flags,
+                         const std::vector<snapid_t> &snap_ids,
+                         std::vector<uint64_t> *snap_flags);
     int get_flags(librados::IoCtx *ioctx, const std::string &oid,
 		  uint64_t *flags, const std::vector<snapid_t> &snap_ids,
 		  vector<uint64_t> *snap_flags);
@@ -62,36 +84,52 @@ namespace librbd {
 		      parent_spec pspec, const std::string &c_imageid);
     int remove_child(librados::IoCtx *ioctx, const std::string &oid,
 		     parent_spec pspec, const std::string &c_imageid);
+    void get_children_start(librados::ObjectReadOperation *op,
+                            const parent_spec &pspec);
+    int get_children_finish(bufferlist::iterator *it,
+                            std::set<string> *children);
     int get_children(librados::IoCtx *ioctx, const std::string &oid,
-		     parent_spec pspec, set<string>& children);
-    int snapshot_add(librados::IoCtx *ioctx, const std::string &oid,
-		     snapid_t snap_id, const std::string &snap_name);
+                      parent_spec pspec, set<string>& children);
     void snapshot_add(librados::ObjectWriteOperation *op, snapid_t snap_id,
 		      const std::string &snap_name);
-    int snapshot_remove(librados::IoCtx *ioctx, const std::string &oid,
-			snapid_t snap_id);
-    int snapshot_rename(librados::IoCtx *ioctx, const std::string &oid,
-			snapid_t src_snap_id,
-			const std::string &dst_name);
+    void snapshot_remove(librados::ObjectWriteOperation *op, snapid_t snap_id);
     void snapshot_rename(librados::ObjectWriteOperation *op,
 			snapid_t src_snap_id,
 			const std::string &dst_name);
     int get_snapcontext(librados::IoCtx *ioctx, const std::string &oid,
 			::SnapContext *snapc);
+
+    void snapshot_list_start(librados::ObjectReadOperation *op,
+                             const std::vector<snapid_t> &ids);
+    int snapshot_list_finish(bufferlist::iterator *it,
+                             const std::vector<snapid_t> &ids,
+                             std::vector<string> *names,
+                             std::vector<uint64_t> *sizes,
+                             std::vector<parent_info> *parents,
+                             std::vector<uint8_t> *protection_statuses);
     int snapshot_list(librados::IoCtx *ioctx, const std::string &oid,
 		      const std::vector<snapid_t> &ids,
 		      std::vector<string> *names,
 		      std::vector<uint64_t> *sizes,
 		      std::vector<parent_info> *parents,
 		      std::vector<uint8_t> *protection_statuses);
+
     int copyup(librados::IoCtx *ioctx, const std::string &oid,
 	       bufferlist data);
     int get_protection_status(librados::IoCtx *ioctx, const std::string &oid,
 			      snapid_t snap_id, uint8_t *protection_status);
     int set_protection_status(librados::IoCtx *ioctx, const std::string &oid,
 			      snapid_t snap_id, uint8_t protection_status);
+    void set_protection_status(librados::ObjectWriteOperation *op,
+                               snapid_t snap_id, uint8_t protection_status);
+
+    void get_stripe_unit_count_start(librados::ObjectReadOperation *op);
+    int get_stripe_unit_count_finish(bufferlist::iterator *it,
+                                     uint64_t *stripe_unit,
+                                     uint64_t *stripe_count);
     int get_stripe_unit_count(librados::IoCtx *ioctx, const std::string &oid,
 			      uint64_t *stripe_unit, uint64_t *stripe_count);
+
     int set_stripe_unit_count(librados::IoCtx *ioctx, const std::string &oid,
 			      uint64_t stripe_unit, uint64_t stripe_count);
     int metadata_list(librados::IoCtx *ioctx, const std::string &oid,
@@ -105,7 +143,10 @@ namespace librbd {
                      const std::string &key, string *v);
 
     // operations on rbd_id objects
+    void get_id_start(librados::ObjectReadOperation *op);
+    int get_id_finish(bufferlist::iterator *it, std::string *id);
     int get_id(librados::IoCtx *ioctx, const std::string &oid, std::string *id);
+
     int set_id(librados::IoCtx *ioctx, const std::string &oid, std::string id);
 
     // operations on rbd_directory objects
@@ -121,11 +162,14 @@ namespace librbd {
     int dir_remove_image(librados::IoCtx *ioctx, const std::string &oid,
 			 const std::string &name, const std::string &id);
     // atomic remove and add
-    int dir_rename_image(librados::IoCtx *ioctx, const std::string &oid,
-			 const std::string &src, const std::string &dest,
-			 const std::string &id);
+    void dir_rename_image(librados::ObjectWriteOperation *op,
+			  const std::string &src, const std::string &dest,
+			  const std::string &id);
 
     // operations on the rbd_object_map.$image_id object
+    void object_map_load_start(librados::ObjectReadOperation *op);
+    int object_map_load_finish(bufferlist::iterator *it,
+                               ceph::BitVector<2> *object_map);
     int object_map_load(librados::IoCtx *ioctx, const std::string &oid,
 		        ceph::BitVector<2> *object_map);
     void object_map_save(librados::ObjectWriteOperation *rados_op,
@@ -142,17 +186,40 @@ namespace librbd {
 
     // class operations on the old format, kept for
     // backwards compatability
-    int old_snapshot_add(librados::IoCtx *ioctx, const std::string &oid,
-			 snapid_t snap_id, const std::string &snap_name);
-    int old_snapshot_remove(librados::IoCtx *ioctx, const std::string &oid,
+    void old_snapshot_add(librados::ObjectWriteOperation *rados_op,
+                          snapid_t snap_id, const std::string &snap_name);
+    void old_snapshot_remove(librados::ObjectWriteOperation *rados_op,
 			    const std::string &snap_name);
+    void old_snapshot_rename(librados::ObjectWriteOperation *rados_op,
+			     snapid_t src_snap_id, const std::string &dst_name);
+
+    void old_snapshot_list_start(librados::ObjectReadOperation *op);
+    int old_snapshot_list_finish(bufferlist::iterator *it,
+                                 std::vector<string> *names,
+                                 std::vector<uint64_t> *sizes,
+                                 ::SnapContext *snapc);
     int old_snapshot_list(librados::IoCtx *ioctx, const std::string &oid,
 			  std::vector<string> *names,
 			  std::vector<uint64_t> *sizes,
 			  ::SnapContext *snapc);
-    int old_snapshot_rename(librados::IoCtx *ioctx, const std::string &oid,
-			    snapid_t src_snap_id,
-			    const std::string &dst_name);
+
+    // operations on the rbd_pool_settings object
+    int mirror_is_enabled(librados::IoCtx *ioctx, bool *enabled);
+    int mirror_set_enabled(librados::IoCtx *ioctx, bool enabled);
+    int mirror_peer_list(librados::IoCtx *ioctx,
+                         std::vector<cls::rbd::MirrorPeer> *peers);
+    int mirror_peer_add(librados::IoCtx *ioctx, const std::string &cluster_uuid,
+                        const std::string &cluster_name,
+                        const std::string &client_name);
+    int mirror_peer_remove(librados::IoCtx *ioctx,
+                           const std::string &cluster_uuid);
+    int mirror_peer_set_client(librados::IoCtx *ioctx,
+                               const std::string &cluster_uuid,
+                               const std::string &client_name);
+    int mirror_peer_set_cluster(librados::IoCtx *ioctx,
+                                const std::string &cluster_uuid,
+                                const std::string &cluster_name);
+
   } // namespace cls_client
 } // namespace librbd
 #endif // CEPH_LIBRBD_CLS_RBD_CLIENT_H
diff --git a/src/cls/rbd/cls_rbd_types.cc b/src/cls/rbd/cls_rbd_types.cc
new file mode 100644
index 0000000..dca84c5
--- /dev/null
+++ b/src/cls/rbd/cls_rbd_types.cc
@@ -0,0 +1,52 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "cls/rbd/cls_rbd_types.h"
+#include "common/Formatter.h"
+
+namespace cls {
+namespace rbd {
+
+void MirrorPeer::encode(bufferlist &bl) const {  // serializes the three strings into bl
+  ENCODE_START(1, 1, bl);  // NOTE(review): presumably (struct version, compat version) - confirm in include/encoding.h
+  ::encode(cluster_uuid, bl);  // field order matches the order MirrorPeer::decode reads them in
+  ::encode(cluster_name, bl);
+  ::encode(client_name, bl);
+  ENCODE_FINISH(bl);
+}
+
+void MirrorPeer::decode(bufferlist::iterator &it) {  // reads the three strings back from it
+  DECODE_START(1, it);  // NOTE(review): presumably the newest struct version understood - confirm in include/encoding.h
+  ::decode(cluster_uuid, it);  // field order matches the order MirrorPeer::encode writes them in
+  ::decode(cluster_name, it);
+  ::decode(client_name, it);
+  DECODE_FINISH(it);
+}
+
+void MirrorPeer::dump(Formatter *f) const {  // emits each field as a string keyed by its member name
+  f->dump_string("cluster_uuid", cluster_uuid);
+  f->dump_string("cluster_name", cluster_name);
+  f->dump_string("client_name", client_name);
+}
+
+void MirrorPeer::generate_test_instances(std::list<MirrorPeer*> &o) {  // appends two new-allocated samples to o; nothing here frees them
+  o.push_back(new MirrorPeer());  // default-constructed: all three strings empty
+  o.push_back(new MirrorPeer("uuid-123", "cluster name", "client name"));  // all three strings populated
+}
+
+bool MirrorPeer::operator==(const MirrorPeer &rhs) const {  // equal only when all three strings match
+  return (cluster_uuid == rhs.cluster_uuid &&
+          cluster_name == rhs.cluster_name &&
+          client_name == rhs.client_name);
+}
+
+std::ostream& operator<<(std::ostream& os, const MirrorPeer& peer) {  // prints "[cluster_uuid=..., cluster_name=..., client_name=...]"
+  os << "["
+     << "cluster_uuid=" << peer.cluster_uuid << ", "
+     << "cluster_name=" << peer.cluster_name << ", "
+     << "client_name=" << peer.client_name << "]";
+  return os;  // returned so further << calls can be chained
+}
+
+} // namespace rbd
+} // namespace cls
diff --git a/src/cls/rbd/cls_rbd_types.h b/src/cls/rbd/cls_rbd_types.h
new file mode 100644
index 0000000..a4564ce
--- /dev/null
+++ b/src/cls/rbd/cls_rbd_types.h
@@ -0,0 +1,50 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_CLS_RBD_TYPES_H
+#define CEPH_CLS_RBD_TYPES_H
+
+#include "include/int_types.h"
+#include "include/buffer.h"
+#include "include/encoding.h"
+#include <iosfwd>
+#include <string>
+
+namespace ceph { class Formatter; }
+
+namespace cls {
+namespace rbd {
+
+struct MirrorPeer {  // three-string record with encode/decode/dump support
+  MirrorPeer() {  // leaves all three strings empty
+  }
+  MirrorPeer(const std::string &cluster_uuid, const std::string &cluster_name,
+             const std::string &client_name)
+    : cluster_uuid(cluster_uuid), cluster_name(cluster_name),
+      client_name(client_name) {  // copies each argument into the member of the same name
+  }
+
+  std::string cluster_uuid;
+  std::string cluster_name;
+  std::string client_name;
+
+  void encode(bufferlist &bl) const;
+  void decode(bufferlist::iterator &it);
+  void dump(Formatter *f) const;
+
+  static void generate_test_instances(std::list<MirrorPeer*> &o);  // appends sample instances to o
+
+  bool operator==(const MirrorPeer &rhs) const;  // member-wise comparison of the three strings
+};
+
+std::ostream& operator<<(std::ostream& os, const MirrorPeer& peer);
+
+WRITE_CLASS_ENCODER(MirrorPeer);
+
+} // namespace rbd
+} // namespace cls
+
+using cls::rbd::encode;
+using cls::rbd::decode;
+
+#endif // CEPH_CLS_RBD_TYPES_H
diff --git a/src/common/BackTrace.h b/src/common/BackTrace.h
index b5d1e1d..4157f74 100644
--- a/src/common/BackTrace.h
+++ b/src/common/BackTrace.h
@@ -1,8 +1,11 @@
 #ifndef CEPH_BACKTRACE_H
 #define CEPH_BACKTRACE_H
 
+#include "acconfig.h"
 #include <iosfwd>
+#ifdef HAVE_EXECINFO_H
 #include <execinfo.h>
+#endif
 #include <stdlib.h>
 
 namespace ceph {
@@ -16,8 +19,14 @@ struct BackTrace {
   char **strings;
 
   BackTrace(int s) : skip(s) {
+#ifdef HAVE_EXECINFO_H
     size = backtrace(array, max);
     strings = backtrace_symbols(array, size);
+#else
+    skip = 0;  // overrides skip(s) from the initializer list; no frames are captured in this branch
+    size = 0;
+    strings = nullptr;  // ~BackTrace() calls free(strings); free(nullptr) is a no-op
+#endif
   }
   ~BackTrace() {
     free(strings);
diff --git a/src/common/ConfUtils.h b/src/common/ConfUtils.h
index e816f68..7a5b79a 100644
--- a/src/common/ConfUtils.h
+++ b/src/common/ConfUtils.h
@@ -20,7 +20,7 @@
 #include <set>
 #include <string>
 
-#include "include/buffer.h"
+#include "include/buffer_fwd.h"
 
 /*
  * Ceph configuration file support.
diff --git a/src/common/Finisher.cc b/src/common/Finisher.cc
index 7ebbe05..9b39dc8 100644
--- a/src/common/Finisher.cc
+++ b/src/common/Finisher.cc
@@ -44,9 +44,12 @@ void *Finisher::finisher_thread_entry()
   finisher_lock.Lock();
   ldout(cct, 10) << "finisher_thread start" << dendl;
 
+  utime_t start;
   while (!finisher_stop) {
     /// Every time we are woken up, we process the queue until it is empty.
     while (!finisher_queue.empty()) {
+      if (logger)
+        start = ceph_clock_now(cct);
       // To reduce lock contention, we swap out the queue to process.
       // This way other threads can submit new contexts to complete while we are working.
       vector<Context*> ls;
@@ -73,8 +76,10 @@ void *Finisher::finisher_thread_entry()
 	  c->complete(ls_rval.front().second);
 	  ls_rval.pop_front();
 	}
-	if (logger)
+	if (logger) {
 	  logger->dec(l_finisher_queue_len);
+          logger->tinc(l_finisher_complete_lat, ceph_clock_now(cct) - start);
+        }
       }
       ldout(cct, 10) << "finisher_thread done with " << ls << dendl;
       ls.clear();
diff --git a/src/common/Finisher.h b/src/common/Finisher.h
index 8767445..95db977 100644
--- a/src/common/Finisher.h
+++ b/src/common/Finisher.h
@@ -27,6 +27,7 @@ class CephContext;
 enum {
   l_finisher_first = 997082,
   l_finisher_queue_len,
+  l_finisher_complete_lat,
   l_finisher_last
 };
 
@@ -146,9 +147,11 @@ class Finisher {
     PerfCountersBuilder b(cct, string("finisher-") + name,
 			  l_finisher_first, l_finisher_last);
     b.add_u64(l_finisher_queue_len, "queue_len");
+    b.add_time_avg(l_finisher_complete_lat, "complete_latency");
     logger = b.create_perf_counters();
     cct->get_perfcounters_collection()->add(logger);
     logger->set(l_finisher_queue_len, 0);
+    logger->set(l_finisher_complete_lat, 0);
   }
 
   ~Finisher() {
diff --git a/src/common/Formatter.cc b/src/common/Formatter.cc
index 7c166ef..f5ce058 100644
--- a/src/common/Formatter.cc
+++ b/src/common/Formatter.cc
@@ -19,7 +19,9 @@
 #include "assert.h"
 #include "Formatter.h"
 #include "common/escape.h"
+#include "include/buffer.h"
 
+#include <algorithm>
 #include <iostream>
 #include <sstream>
 #include <stdarg.h>
@@ -31,6 +33,9 @@
 #include <boost/format.hpp>
 
 
+static char tolower_underscore(const char b) {  // maps ' ' to '_' and lowercases every other character
+  return ' ' == b ? '_' : std::tolower(static_cast<unsigned char>(b));  // cast: std::tolower is undefined for negative char values
+}
 
 // -----------------------
 namespace ceph {
@@ -89,6 +94,14 @@ Formatter *Formatter::create(const std::string &type,
     return (Formatter *) NULL;
 }
 
+
+void Formatter::flush(bufferlist &bl)  // defined out of line so Formatter.h only needs include/buffer_fwd.h
+{
+  std::stringstream os;
+  flush(os);  // the virtual std::ostream overload does the actual formatting
+  bl.append(os.str());  // hands the formatted text to bl
+}
+
 void Formatter::dump_format(const char *name, const char *fmt, ...)
 {
   va_list ap;
@@ -308,8 +321,9 @@ void JSONFormatter::write_raw_data(const char *data)
 const char *XMLFormatter::XML_1_DTD =
   "<?xml version=\"1.0\" encoding=\"UTF-8\"?>";
 
-XMLFormatter::XMLFormatter(bool pretty)
-: m_pretty(pretty)
+XMLFormatter::XMLFormatter(bool pretty, bool lowercased_underscored)
+: m_pretty(pretty),
+  m_lowercased_underscored(lowercased_underscored)
 {
   reset();
 }
@@ -370,6 +384,10 @@ void XMLFormatter::close_section()
   finish_pending_string();
 
   std::string section = m_sections.back();
+  if (m_lowercased_underscored) {
+    std::transform(section.begin(), section.end(), section.begin(),
+          tolower_underscore);
+  }
   m_sections.pop_back();
   print_spaces();
   m_ss << "</" << section << ">";
@@ -380,6 +398,9 @@ void XMLFormatter::close_section()
 void XMLFormatter::dump_unsigned(const char *name, uint64_t u)
 {
   std::string e(name);
+  if (m_lowercased_underscored) {
+    std::transform(e.begin(), e.end(), e.begin(), tolower_underscore);
+  }
   print_spaces();
   m_ss << "<" << e << ">" << u << "</" << e << ">";
   if (m_pretty)
@@ -389,6 +410,9 @@ void XMLFormatter::dump_unsigned(const char *name, uint64_t u)
 void XMLFormatter::dump_int(const char *name, int64_t u)
 {
   std::string e(name);
+  if (m_lowercased_underscored) {
+    std::transform(e.begin(), e.end(), e.begin(), tolower_underscore);
+  }
   print_spaces();
   m_ss << "<" << e << ">" << u << "</" << e << ">";
   if (m_pretty)
@@ -398,6 +422,9 @@ void XMLFormatter::dump_int(const char *name, int64_t u)
 void XMLFormatter::dump_float(const char *name, double d)
 {
   std::string e(name);
+  if (m_lowercased_underscored) {
+    std::transform(e.begin(), e.end(), e.begin(), tolower_underscore);
+  }
   print_spaces();
   m_ss << "<" << e << ">" << d << "</" << e << ">";
   if (m_pretty)
@@ -407,6 +434,9 @@ void XMLFormatter::dump_float(const char *name, double d)
 void XMLFormatter::dump_string(const char *name, const std::string& s)
 {
   std::string e(name);
+  if (m_lowercased_underscored) {
+    std::transform(e.begin(), e.end(), e.begin(), tolower_underscore);
+  }
   print_spaces();
   m_ss << "<" << e << ">" << escape_xml_str(s.c_str()) << "</" << e << ">";
   if (m_pretty)
@@ -416,6 +446,9 @@ void XMLFormatter::dump_string(const char *name, const std::string& s)
 void XMLFormatter::dump_string_with_attrs(const char *name, const std::string& s, const FormatterAttrs& attrs)
 {
   std::string e(name);
+  if (m_lowercased_underscored) {
+    std::transform(e.begin(), e.end(), e.begin(), tolower_underscore);
+  }
   std::string attrs_str;
   get_attrs_str(&attrs, attrs_str);
   print_spaces();
@@ -438,6 +471,9 @@ void XMLFormatter::dump_format_va(const char* name, const char *ns, bool quoted,
   vsnprintf(buf, LARGE_SIZE, fmt, ap);
 
   std::string e(name);
+  if (m_lowercased_underscored) {
+    std::transform(e.begin(), e.end(), e.begin(), tolower_underscore);
+  }
   print_spaces();
   if (ns) {
     m_ss << "<" << e << " xmlns=\"" << ns << "\">" << buf << "</" << e << ">";
@@ -481,10 +517,15 @@ void XMLFormatter::open_section_in_ns(const char *name, const char *ns, const Fo
     get_attrs_str(attrs, attrs_str);
   }
 
+  std::string e(name);
+  if (m_lowercased_underscored) {
+    std::transform(e.begin(), e.end(), e.begin(), tolower_underscore);
+  }
+
   if (ns) {
-    m_ss << "<" << name << attrs_str << " xmlns=\"" << ns << "\">";
+    m_ss << "<" << e << attrs_str << " xmlns=\"" << ns << "\">";
   } else {
-    m_ss << "<" << name << attrs_str << ">";
+    m_ss << "<" << e << attrs_str << ">";
   }
   if (m_pretty)
     m_ss << "\n";
diff --git a/src/common/Formatter.h b/src/common/Formatter.h
index 181a0e0..3784bdb 100644
--- a/src/common/Formatter.h
+++ b/src/common/Formatter.h
@@ -14,7 +14,7 @@
 #include <string>
 #include <map>
 
-#include "include/buffer.h"
+#include "include/buffer_fwd.h"
 
 namespace ceph {
 
@@ -41,12 +41,7 @@ namespace ceph {
     virtual ~Formatter();
 
     virtual void flush(std::ostream& os) = 0;
-    void flush(bufferlist &bl)
-    {
-      std::stringstream os;
-      flush(os);
-      bl.append(os.str());
-    }
+    void flush(bufferlist &bl);
     virtual void reset() = 0;
 
     virtual void open_array_section(const char *name) = 0;
@@ -133,7 +128,7 @@ namespace ceph {
   class XMLFormatter : public Formatter {
   public:
     static const char *XML_1_DTD;
-    XMLFormatter(bool pretty = false);
+    XMLFormatter(bool pretty = false, bool lowercased_underscored = false);
 
     void flush(std::ostream& os);
     void reset();
@@ -165,6 +160,7 @@ namespace ceph {
     std::stringstream m_ss, m_pending_string;
     std::deque<std::string> m_sections;
     bool m_pretty;
+    bool m_lowercased_underscored;
     std::string m_pending_string_name;
   };
 
diff --git a/src/common/Makefile.am b/src/common/Makefile.am
index 12572e6..182295f 100644
--- a/src/common/Makefile.am
+++ b/src/common/Makefile.am
@@ -68,7 +68,10 @@ libcommon_internal_la_SOURCES = \
 	common/Readahead.cc \
 	common/Cycles.cc \
 	common/ContextCompletion.cc \
-	common/TracepointProvider.cc
+	common/TracepointProvider.cc \
+	common/PluginRegistry.cc
+
+common/PluginRegistry.cc: ./ceph_ver.h
 
 if ENABLE_SERVER
 libcommon_internal_la_SOURCES += \
@@ -89,12 +92,16 @@ libcommon_internal_la_SOURCES += \
         common/solaris_errno.cc
 endif
 
-if WITH_RBD
+if AIX
+libcommon_internal_la_SOURCES += \
+        common/aix_errno.cc
+endif
+
+# used by RBD and FileStore
 if LINUX
 libcommon_internal_la_SOURCES += \
 	common/blkdev.cc
 endif
-endif
 
 if ENABLE_XIO
 libcommon_internal_la_SOURCES += \
@@ -254,7 +261,9 @@ noinst_HEADERS += \
 	common/bit_vector.hpp \
 	common/SubProcess.h \
 	common/valgrind.h \
-	common/TracepointProvider.h
+	common/TracepointProvider.h \
+	common/event_socket.h \
+	common/PluginRegistry.h
 
 if ENABLE_XIO
 noinst_HEADERS += \
diff --git a/src/common/MemoryModel.cc b/src/common/MemoryModel.cc
index ddc7fa9..336c82a 100644
--- a/src/common/MemoryModel.cc
+++ b/src/common/MemoryModel.cc
@@ -4,6 +4,7 @@
 #include "MemoryModel.h"
 #include "common/config.h"
 #include "debug.h"
+#include <malloc.h>
 
 #include <fstream>
 
diff --git a/src/common/PluginRegistry.cc b/src/common/PluginRegistry.cc
new file mode 100644
index 0000000..fb02d4a
--- /dev/null
+++ b/src/common/PluginRegistry.cc
@@ -0,0 +1,222 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph distributed storage system
+ *
+ * Copyright (C) 2013,2014 Cloudwatt <libre.licensing at cloudwatt.com>
+ * Copyright (C) 2014 Red Hat <contact at redhat.com>
+ *
+ * Author: Loic Dachary <loic at dachary.org>
+ *
+ *  This library is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU Lesser General Public
+ *  License as published by the Free Software Foundation; either
+ *  version 2.1 of the License, or (at your option) any later version.
+ * 
+ */
+
+#include <errno.h>
+#include <dlfcn.h>
+
+#include "PluginRegistry.h"
+#include "ceph_ver.h"
+#include "common/ceph_context.h"
+#include "common/errno.h"
+#include "include/str_list.h"
+
+#include "common/debug.h"
+
+#define PLUGIN_PREFIX "libceph_"
+#define PLUGIN_SUFFIX ".so"
+#define PLUGIN_INIT_FUNCTION "__ceph_plugin_init"
+#define PLUGIN_VERSION_FUNCTION "__ceph_plugin_version"
+
+#define dout_subsys ceph_subsys_context
+
+PluginRegistry::PluginRegistry(CephContext *cct) :
+  cct(cct),
+  lock("PluginRegistry::lock"),
+  loading(false),
+  disable_dlclose(false)  // the destructor deletes and dlclose()s plugins unless this is set
+{
+}
+
+PluginRegistry::~PluginRegistry()
+{
+  if (disable_dlclose)
+    return;  // plugins are then neither deleted nor dlclose()d
+
+  for (std::map<std::string,std::map<std::string, Plugin*> >::iterator i =
+	 plugins.begin();
+       i != plugins.end();
+       ++i) {
+    for (std::map<std::string,Plugin*>::iterator j = i->second.begin();
+	 j != i->second.end(); ++j) {
+      void *library = j->second->library;  // read the handle before deleting its owner
+      delete j->second;
+      dlclose(library);  // unload only after the plugin object has been destroyed
+    }
+  }
+}
+
+int PluginRegistry::remove(const std::string& type, const std::string& name)
+{
+  assert(lock.is_locked());
+  // returns -ENOENT unless a plugin is registered under (type, name)
+  std::map<std::string,std::map<std::string,Plugin*> >::iterator i =
+    plugins.find(type);
+  if (i == plugins.end())
+    return -ENOENT;
+  std::map<std::string,Plugin*>::iterator j = i->second.find(name);
+  if (j == i->second.end())
+    return -ENOENT;
+  // the dlopen() handle is a member of the Plugin, so copy it before the delete
+  ldout(cct, 1) << __func__ << " " << type << " " << name << dendl;
+  void *library = j->second->library;
+  delete j->second;
+  dlclose(library);
+  i->second.erase(j);
+  if (i->second.empty())
+    plugins.erase(i);  // drop the per-type map once its last plugin is gone
+
+  return 0;
+}
+
+int PluginRegistry::add(const std::string& type,
+			const std::string& name,
+			Plugin* plugin)
+{
+  assert(lock.is_locked());
+  std::map<std::string,std::map<std::string,Plugin*> >::iterator by_type =
+    plugins.find(type);
+  if (by_type != plugins.end() && by_type->second.count(name))
+    return -EEXIST;  // (type, name) is already registered, keep the old one
+  ldout(cct, 1) << __func__ << " " << type << " " << name
+		<< " " << plugin << dendl;
+  plugins[type][name] = plugin;
+  return 0;
+}
+
+Plugin *PluginRegistry::get_with_load(const std::string& type,
+          const std::string& name)
+{
+  Mutex::Locker l(lock);  // get() and load() both assert that the lock is held
+  Plugin* ret = get(type, name);
+  if (!ret) {
+    int err = load(type, name);  // not registered yet: try to dlopen() it
+    if (err == 0)
+      ret = get(type, name);
+  } 
+  return ret;  // 0 when the plugin is unknown and could not be loaded
+}
+
+Plugin *PluginRegistry::get(const std::string& type,
+			    const std::string& name)
+{
+  assert(lock.is_locked());
+  Plugin *ret = 0;  // stays 0 when nothing is registered under (type, name)
+
+  std::map<std::string,Plugin*>::iterator j;  // declared here so no goto jumps over an initialization
+  std::map<std::string,map<std::string,Plugin*> >::iterator i =
+    plugins.find(type);
+  if (i == plugins.end()) 
+    goto out;
+  j = i->second.find(name);
+  if (j == i->second.end()) 
+    goto out;
+  ret = j->second;
+
+ out:
+  ldout(cct, 1) << __func__ << " " << type << " " << name
+		<< " = " << ret << dendl;  // logged for hits and misses alike
+  return ret;
+}
+
+int PluginRegistry::load(const std::string &type,
+			 const std::string &name)
+{
+  // dlopen() the plugin, verify its version and run its init function, which
+  // must add() it to this registry; returns 0 or a negative errno
+  assert(lock.is_locked());
+  ldout(cct, 1) << __func__ << " " << type << " " << name << dendl;
+  std::string fname = cct->_conf->plugin_dir + "/" + type + "/" PLUGIN_PREFIX
+    + name + PLUGIN_SUFFIX;
+  void *library = dlopen(fname.c_str(), RTLD_NOW);
+  if (!library) {
+    lderr(cct) << __func__ << " failed dlopen(" << fname << "): "
+	       << dlerror() << dendl;
+    return -EIO;
+  }
+  // refuse plugins that were not built for this exact ceph version
+  const char * (*code_version)() =
+    (const char *(*)())dlsym(library, PLUGIN_VERSION_FUNCTION);
+  if (code_version == NULL) {
+    lderr(cct) << __func__ << " code_version == NULL" << dlerror() << dendl;
+    dlclose(library);  // do not leak the dlopen() handle on this error path
+    return -EXDEV;
+  }
+  if (code_version() != string(CEPH_GIT_NICE_VER)) {
+    lderr(cct) << __func__ << " plugin " << fname << " version "
+	       << code_version() << " != expected "
+	       << CEPH_GIT_NICE_VER << dendl;
+    dlclose(library);
+    return -EXDEV;
+  }
+  // the init entry point is expected to register the plugin via add()
+  int (*code_init)(CephContext *,
+		   const std::string& type,
+		   const std::string& name) =
+    (int (*)(CephContext *,
+	     const std::string& type,
+	     const std::string& name))dlsym(library, PLUGIN_INIT_FUNCTION);
+  if (code_init) {
+    int r = code_init(cct, type, name);
+    if (r != 0) {
+      lderr(cct) << __func__ << " " << fname << " "
+		 << PLUGIN_INIT_FUNCTION << "(" << cct
+		 << "," << type << "," << name << "): " << cpp_strerror(r)
+		 << dendl;
+      dlclose(library);
+      return r;
+    }
+  } else {
+    lderr(cct) << __func__ << " " << fname << " dlsym(" << PLUGIN_INIT_FUNCTION
+	       << "): " << dlerror() << dendl;
+    dlclose(library);
+    return -ENOENT;
+  }
+  // init reported success: the plugin must now be known to the registry
+  Plugin *plugin = get(type, name);
+  if (plugin == 0) {
+    lderr(cct) << __func__ << " " << fname << " "
+	       << PLUGIN_INIT_FUNCTION << "()"
+	       << "did not register plugin type " << type << " name " << name
+	       << dendl;
+    dlclose(library);
+    return -EBADF;
+  }
+  plugin->library = library;  // kept so remove() and the destructor can dlclose() it
+  ldout(cct, 1) << __func__ << ": " << type << " " << name
+		<< " loaded and registered" << dendl;
+  return 0;
+}
+
+/*
+int ErasureCodePluginRegistry::preload(const std::string &plugins,
+				       const std::string &directory,
+				       ostream &ss)
+{
+  Mutex::Locker l(lock);
+  list<string> plugins_list;
+  get_str_list(plugins, plugins_list);
+  for (list<string>::iterator i = plugins_list.begin();
+       i != plugins_list.end();
+       ++i) {
+    ErasureCodePlugin *plugin;
+    int r = load(*i, directory, &plugin, ss);
+    if (r)
+      return r;
+  }
+  return 0;
+}
+*/
diff --git a/src/common/PluginRegistry.h b/src/common/PluginRegistry.h
new file mode 100644
index 0000000..6757ce1
--- /dev/null
+++ b/src/common/PluginRegistry.h
@@ -0,0 +1,70 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph distributed storage system
+ *
+ * Copyright (C) 2013,2014 Cloudwatt <libre.licensing at cloudwatt.com>
+ * Copyright (C) 2014 Red Hat <contact at redhat.com>
+ *
+ * Author: Loic Dachary <loic at dachary.org>
+ *
+ *  This library is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU Lesser General Public
+ *  License as published by the Free Software Foundation; either
+ *  version 2.1 of the License, or (at your option) any later version.
+ * 
+ */
+
+#ifndef CEPH_COMMON_PLUGINREGISTRY_H
+#define CEPH_COMMON_PLUGINREGISTRY_H
+
+#include <string>
+#include <map>
+
+#include "common/Mutex.h"
+
+class CephContext;
+
+extern "C" {
+  const char *__ceph_plugin_version();
+  int __ceph_plugin_init(CephContext *cct,
+			 const std::string& type,
+			 const std::string& name);
+}
+
+namespace ceph {
+
+  class Plugin {
+  public:
+    void *library;  // dlopen() handle of the shared object; 0 until a loader sets it
+    CephContext *cct;
+    // library is zeroed so dlclose() is never handed an uninitialized pointer
+    Plugin(CephContext *cct) : library(0), cct(cct) {}
+    virtual ~Plugin() {}
+  };
+
+  class PluginRegistry {
+  public:
+    CephContext *cct;
+    Mutex lock;  // add(), remove(), get() and load() assert that it is held
+    bool loading;
+    bool disable_dlclose;  // when set, the destructor leaves every plugin loaded
+    std::map<std::string,std::map<std::string,Plugin*> > plugins;  // type -> name -> plugin
+
+    PluginRegistry(CephContext *cct);
+    ~PluginRegistry();
+
+    int add(const std::string& type, const std::string& name,
+	    Plugin *factory);
+    int remove(const std::string& type, const std::string& name);
+    Plugin *get(const std::string& type, const std::string& name);
+    Plugin *get_with_load(const std::string& type, const std::string& name);  // takes lock itself
+
+    int load(const std::string& type,
+	     const std::string& name);
+    int preload();  // NOTE(review): no definition visible in PluginRegistry.cc - confirm
+    int preload(const std::string& type);
+  };
+}
+
+#endif
diff --git a/src/common/PrebufferedStreambuf.cc b/src/common/PrebufferedStreambuf.cc
index d6db8bb..fe27ef5 100644
--- a/src/common/PrebufferedStreambuf.cc
+++ b/src/common/PrebufferedStreambuf.cc
@@ -1,6 +1,6 @@
 
 #include "common/PrebufferedStreambuf.h"
-
+#include <string.h>
 PrebufferedStreambuf::PrebufferedStreambuf(char *buf, size_t len)
   : m_buf(buf), m_buf_len(len)
 {
@@ -15,7 +15,7 @@ PrebufferedStreambuf::int_type PrebufferedStreambuf::overflow(int_type c)
 {
   int old_len = m_overflow.size();
   if (old_len == 0) {
-    m_overflow.resize(m_buf_len);
+    m_overflow.resize(80);
   } else {
     m_overflow.resize(old_len * 2);
   }
@@ -61,3 +61,42 @@ std::string PrebufferedStreambuf::get_str() const
     return std::string(m_buf, this->pptr() - m_buf);
   }  
 }
+// number of chars written so far, counting m_buf plus any overflow
+size_t PrebufferedStreambuf::size() const
+{
+  // without overflow the put pointer is measured from the start of m_buf
+  if (m_overflow.empty())
+    return this->pptr() - m_buf;
+  // otherwise it is measured from the start of m_overflow, after m_buf_len chars
+  return m_buf_len + this->pptr() - &m_overflow[0];
+}
+
+// Copies the content into dst like ::snprintf would: at most avail - 1 chars
+// are written, followed by a terminating NUL.  Nothing at all is written when
+// avail is 0.  Returns the full content length, which may exceed what fit.
+int PrebufferedStreambuf::snprintf(char* dst, size_t avail) const
+{
+  size_t len_a;  // chars held in the fixed buffer m_buf
+  size_t len_b;  // chars held in m_overflow
+  if (m_overflow.size() > 0) {
+    len_a = m_buf_len;
+    len_b = this->pptr() - &m_overflow[0];
+  } else {
+    len_a = this->pptr() - m_buf;
+    len_b = 0;
+  }
+  const size_t total = len_a + len_b;
+
+  // with avail == 0 "avail - 1" would wrap around to a huge copy length and
+  // the terminator would land out of bounds, so dst is left untouched then
+  if (avail > 0) {
+    // n chars fit in front of the NUL; the first n_a of them come from m_buf
+    const size_t n = (avail - 1 < total) ? avail - 1 : total;
+    const size_t n_a = (n < len_a) ? n : len_a;
+    memcpy(dst, m_buf, n_a);
+    memcpy(dst + n_a, m_overflow.c_str(), n - n_a);
+    dst[n] = 0;
+  }
+
+  return total;
+}
diff --git a/src/common/PrebufferedStreambuf.h b/src/common/PrebufferedStreambuf.h
index 80a89aa..ac12381 100644
--- a/src/common/PrebufferedStreambuf.h
+++ b/src/common/PrebufferedStreambuf.h
@@ -37,6 +37,12 @@ public:
 
   /// return a string copy (inefficiently)
   std::string get_str() const;
-};    
+
+  // returns current size of content
+  size_t size() const;
+
+  // extracts up to avail chars of content
+  int snprintf(char* dst, size_t avail) const;
+};
 
 #endif
diff --git a/src/common/Readahead.cc b/src/common/Readahead.cc
index b1ee2e0..55f74db 100644
--- a/src/common/Readahead.cc
+++ b/src/common/Readahead.cc
@@ -18,8 +18,7 @@ Readahead::Readahead()
     m_readahead_trigger_pos(0),
     m_readahead_size(0),
     m_pending(0),
-    m_pending_lock("Readahead::m_pending_lock"),
-    m_pending_cond() {
+    m_pending_lock("Readahead::m_pending_lock") {
 }
 
 Readahead::~Readahead() {
@@ -135,19 +134,34 @@ void Readahead::dec_pending(int count) {
   assert(m_pending >= count);
   m_pending -= count;
   if (m_pending == 0) {
-    m_pending_cond.Signal();
+    std::list<Context *> pending_waiting(std::move(m_pending_waiting));
+    m_pending_lock.Unlock();
+
+    for (auto ctx : pending_waiting) {
+      ctx->complete(0);
+    }
+  } else {
+    m_pending_lock.Unlock();
   }
-  m_pending_lock.Unlock();
 }
 
 void Readahead::wait_for_pending() {
+  C_SaferCond ctx;
+  wait_for_pending(&ctx);
+  ctx.wait();
+}
+
+void Readahead::wait_for_pending(Context *ctx) {
   m_pending_lock.Lock();
-  while (m_pending > 0) {
-    m_pending_cond.Wait(m_pending_lock);
+  if (m_pending > 0) {
+    m_pending_waiting.push_back(ctx);  // enqueue before unlocking: dec_pending() drains the list under this lock
+    m_pending_lock.Unlock();
+    return;
   }
   m_pending_lock.Unlock();
-}
 
+  ctx->complete(0);
+}
 void Readahead::set_trigger_requests(int trigger_requests) {
   m_lock.Lock();
   m_trigger_requests = trigger_requests;
diff --git a/src/common/Readahead.h b/src/common/Readahead.h
index ffce39f..75822c5 100644
--- a/src/common/Readahead.h
+++ b/src/common/Readahead.h
@@ -6,6 +6,7 @@
 
 #include "Mutex.h"
 #include "Cond.h"
+#include <list>
 
 /**
    This class provides common state and logic for code that needs to perform readahead
@@ -70,6 +71,7 @@ public:
      Waits until the pending count reaches 0.
    */
   void wait_for_pending();
+  void wait_for_pending(Context *ctx);
 
   /**
      Sets the number of sequential requests necessary to trigger readahead.
@@ -146,8 +148,8 @@ private:
   /// Lock for m_pending
   Mutex m_pending_lock;
 
-  /// Signalled when m_pending reaches 0
-  Cond m_pending_cond;
+  /// Waiters for pending readahead
+  std::list<Context *> m_pending_waiting;
 };
 
 #endif
diff --git a/src/common/SubProcess.h b/src/common/SubProcess.h
index 8b8ffc7..82070ac 100644
--- a/src/common/SubProcess.h
+++ b/src/common/SubProcess.h
@@ -30,6 +30,7 @@
 
 #include <sstream>
 #include <vector>
+#include <iostream>
 
 #include <include/assert.h>
 #include <common/errno.h>
@@ -40,7 +41,7 @@
  *
  * Example:
  *
- *   SubProcess cat("cat", true, true);
+ *   SubProcess cat("cat", SubProcess::PIPE, SubProcess::PIPE);
  *   if (cat.spawn() != 0) {
  *     std::cerr << "cat failed: " << cat.err() << std::endl;
  *     return false;
@@ -56,8 +57,16 @@
 
 class SubProcess {
 public:
-  SubProcess(const char *cmd, bool pipe_stdin = false, bool pipe_stdout = false,
-	     bool pipe_stderr = false);
+  enum std_fd_op{
+    KEEP,
+    CLOSE,
+    PIPE
+  };
+public:
+  SubProcess(const char *cmd,
+             std_fd_op stdin_op = CLOSE,
+             std_fd_op stdout_op = CLOSE,
+             std_fd_op stderr_op = CLOSE);
   virtual ~SubProcess();
 
   void add_cmd_args(const char *arg, ...);
@@ -90,9 +99,9 @@ private:
 protected:
   std::string cmd;
   std::vector<std::string> cmd_args;
-  bool pipe_stdin;
-  bool pipe_stdout;
-  bool pipe_stderr;
+  std_fd_op stdin_op;
+  std_fd_op stdout_op;
+  std_fd_op stderr_op;
   int stdin_pipe_out_fd;
   int stdout_pipe_in_fd;
   int stderr_pipe_in_fd;
@@ -102,8 +111,8 @@ protected:
 
 class SubProcessTimed : public SubProcess {
 public:
-  SubProcessTimed(const char *cmd, bool pipe_stdin = false,
-		  bool pipe_stdout = false, bool pipe_stderr = false,
+  SubProcessTimed(const char *cmd, std_fd_op stdin_op = CLOSE,
+		  std_fd_op stdout_op = CLOSE, std_fd_op stderr_op = CLOSE,
 		  int timeout = 0, int sigkill = SIGKILL);
 
 protected:
@@ -114,12 +123,12 @@ private:
   int sigkill;
 };
 
-SubProcess::SubProcess(const char *cmd_, bool use_stdin, bool use_stdout, bool use_stderr) :
+inline SubProcess::SubProcess(const char *cmd_, std_fd_op stdin_op_, std_fd_op stdout_op_, std_fd_op stderr_op_) :
   cmd(cmd_),
   cmd_args(),
-  pipe_stdin(use_stdin),
-  pipe_stdout(use_stdout),
-  pipe_stderr(use_stderr),
+  stdin_op(stdin_op_),
+  stdout_op(stdout_op_),
+  stderr_op(stderr_op_),
   stdin_pipe_out_fd(-1),
   stdout_pipe_in_fd(-1),
   stderr_pipe_in_fd(-1),
@@ -127,14 +136,14 @@ SubProcess::SubProcess(const char *cmd_, bool use_stdin, bool use_stdout, bool u
   errstr() {
 }
 
-SubProcess::~SubProcess() {
+inline SubProcess::~SubProcess() {
   assert(!is_spawned());
   assert(stdin_pipe_out_fd == -1);
   assert(stdout_pipe_in_fd == -1);
   assert(stderr_pipe_in_fd == -1);
 }
 
-void SubProcess::add_cmd_args(const char *arg, ...) {
+inline void SubProcess::add_cmd_args(const char *arg, ...) {
   assert(!is_spawned());
 
   va_list ap;
@@ -147,34 +156,34 @@ void SubProcess::add_cmd_args(const char *arg, ...) {
   va_end(ap);
 }
 
-void SubProcess::add_cmd_arg(const char *arg) {
+inline void SubProcess::add_cmd_arg(const char *arg) {
   assert(!is_spawned());
 
   cmd_args.push_back(arg);
 }
 
-int SubProcess::get_stdin() const {
+inline int SubProcess::get_stdin() const {
   assert(is_spawned());
-  assert(pipe_stdin);
+  assert(stdin_op == PIPE);
 
   return stdin_pipe_out_fd;
 }
 
-int SubProcess::get_stdout() const {
+inline int SubProcess::get_stdout() const {
   assert(is_spawned());
-  assert(pipe_stdout);
+  assert(stdout_op == PIPE);
 
   return stdout_pipe_in_fd;
 }
 
-int SubProcess::get_stderr() const {
+inline int SubProcess::get_stderr() const {
   assert(is_spawned());
-  assert(pipe_stderr);
+  assert(stderr_op == PIPE);
 
   return stderr_pipe_in_fd;
 }
 
-void SubProcess::close(int &fd) {
+inline void SubProcess::close(int &fd) {
   if (fd == -1)
     return;
 
@@ -182,35 +191,35 @@ void SubProcess::close(int &fd) {
   fd = -1;
 }
 
-void SubProcess::close_stdin() {
+inline void SubProcess::close_stdin() {
   assert(is_spawned());
-  assert(pipe_stdin);
+  assert(stdin_op == PIPE);
 
   close(stdin_pipe_out_fd);
 }
 
-void SubProcess::close_stdout() {
+inline void SubProcess::close_stdout() {
   assert(is_spawned());
-  assert(pipe_stdout);
+  assert(stdout_op == PIPE);
 
   close(stdout_pipe_in_fd);
 }
 
-void SubProcess::close_stderr() {
+inline void SubProcess::close_stderr() {
   assert(is_spawned());
-  assert(pipe_stderr);
+  assert(stderr_op == PIPE);
 
   close(stderr_pipe_in_fd);
 }
 
-void SubProcess::kill(int signo) const {
+inline void SubProcess::kill(int signo) const {
   assert(is_spawned());
 
   int ret = ::kill(pid, signo);
   assert(ret == 0);
 }
 
-const char* SubProcess::err() const {
+inline const char* SubProcess::err() const {
   return errstr.str().c_str();
 }
 
@@ -233,7 +242,7 @@ protected:
   }
 };
 
-int SubProcess::spawn() {
+inline int SubProcess::spawn() {
   assert(!is_spawned());
   assert(stdin_pipe_out_fd == -1);
   assert(stdout_pipe_in_fd == -1);
@@ -247,9 +256,9 @@ int SubProcess::spawn() {
 
   int ret = 0;
 
-  if ((pipe_stdin  && ::pipe(ipipe) == -1) ||
-      (pipe_stdout && ::pipe(opipe) == -1) ||
-      (pipe_stderr && ::pipe(epipe) == -1)) {
+  if ((stdin_op == PIPE  && ::pipe(ipipe) == -1) ||
+      (stdout_op == PIPE && ::pipe(opipe) == -1) ||
+      (stderr_op == PIPE && ::pipe(epipe) == -1)) {
     ret = -errno;
     errstr << "pipe failed: " << cpp_strerror(errno);
     goto fail;
@@ -290,11 +299,11 @@ int SubProcess::spawn() {
     if (maxfd == -1)
       maxfd = 16384;
     for (int fd = 0; fd <= maxfd; fd++) {
-      if (fd == STDIN_FILENO && pipe_stdin)
+      if (fd == STDIN_FILENO && stdin_op != CLOSE)
 	continue;
-      if (fd == STDOUT_FILENO && pipe_stdout)
+      if (fd == STDOUT_FILENO && stdout_op != CLOSE)
 	continue;
-      if (fd == STDERR_FILENO && pipe_stderr)
+      if (fd == STDERR_FILENO && stderr_op != CLOSE)
 	continue;
       ::close(fd);
     }
@@ -317,7 +326,7 @@ fail:
   return ret;
 }
 
-void SubProcess::exec() {
+inline void SubProcess::exec() {
   assert(is_child());
 
   std::vector<const char *> args;
@@ -336,7 +345,7 @@ void SubProcess::exec() {
   _exit(EXIT_FAILURE);
 }
 
-int SubProcess::join() {
+inline int SubProcess::join() {
   assert(is_spawned());
 
   close(stdin_pipe_out_fd);
@@ -363,10 +372,10 @@ int SubProcess::join() {
   return EXIT_FAILURE;
 }
 
-SubProcessTimed::SubProcessTimed(const char *cmd, bool pipe_stdin,
-				 bool pipe_stdout, bool pipe_stderr,
+inline SubProcessTimed::SubProcessTimed(const char *cmd, std_fd_op stdin_op,
+				 std_fd_op stdout_op, std_fd_op stderr_op,
 				 int timeout_, int sigkill_) :
-  SubProcess(cmd, pipe_stdin, pipe_stdout, pipe_stderr),
+  SubProcess(cmd, stdin_op, stdout_op, stderr_op),
   timeout(timeout_),
   sigkill(sigkill_) {
 }
@@ -377,7 +386,7 @@ static void timeout_sighandler(int sig) {
 }
 static void dummy_sighandler(int sig) {}
 
-void SubProcessTimed::exec() {
+inline void SubProcessTimed::exec() {
   assert(is_child());
 
   if (timeout <= 0) {
diff --git a/src/common/WorkQueue.h b/src/common/WorkQueue.h
index 7c3ccb5..a6e7972 100644
--- a/src/common/WorkQueue.h
+++ b/src/common/WorkQueue.h
@@ -411,6 +411,10 @@ public:
     }
 
     virtual void process(T *item) = 0;
+    void process_finish() {
+      Mutex::Locker locker(m_pool->_lock);
+      _void_process_finish(nullptr);
+    }
 
     T *front() {
       assert(m_pool->_lock.is_locked());
@@ -423,6 +427,9 @@ public:
       Mutex::Locker pool_locker(m_pool->_lock);
       m_pool->_cond.SignalOne();
     }
+    Mutex &get_pool_lock() {
+      return m_pool->_lock;
+    }
   private:
     ThreadPool *m_pool;
     std::list<T *> m_items;
diff --git a/src/common/addr_parsing.c b/src/common/addr_parsing.c
index d50b10e..ae75557 100644
--- a/src/common/addr_parsing.c
+++ b/src/common/addr_parsing.c
@@ -16,7 +16,7 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#if defined(__FreeBSD__)
+#if defined(__FreeBSD__) || defined(_AIX)
 #include <sys/socket.h>
 #include <netinet/in.h>
 #endif
diff --git a/src/common/admin_socket.h b/src/common/admin_socket.h
index bbbaa29..bad235a 100644
--- a/src/common/admin_socket.h
+++ b/src/common/admin_socket.h
@@ -20,7 +20,7 @@
 
 #include <string>
 #include <map>
-#include "include/buffer.h"
+#include "include/buffer_fwd.h"
 #include "common/cmdparse.h"
 
 class AdminSocket;
diff --git a/src/common/aix_errno.cc b/src/common/aix_errno.cc
new file mode 100644
index 0000000..48ef728
--- /dev/null
+++ b/src/common/aix_errno.cc
@@ -0,0 +1,225 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include <errno.h>
+#include "include/types.h"
+
+
+// converts linux errno values to AIX host values; codes marked TODO map to -EPERM for now, values >= -34 or unlisted are returned unchanged
+__s32 ceph_to_host_errno(__s32 r) 
+{
+  if (r < -34) {
+    switch (r) {
+      case -35:
+        return -EDEADLK;
+      case -36:
+        return -ENAMETOOLONG;
+      case -37:
+        return -ENOLCK;
+      case -38:
+        return -ENOSYS;
+      case -39:
+        return -ENOTEMPTY;
+      case -40:
+        return -ELOOP;
+      case -42:
+        return -ENOMSG;
+      case -43:
+        return -EIDRM;
+      case -44:
+        return -ECHRNG;
+      case -45:
+        return -EL2NSYNC;
+      case -46:
+        return -EL3HLT;
+      case -47:
+        return -EL3RST;
+      case -48:
+        return -ELNRNG;
+      case -49:
+        return -EUNATCH;
+      case -51:
+        return -EL2HLT;
+      case -52:
+        return -EPERM; //TODO EBADE
+      case -53:
+        return -EPERM; //TODO EBADR
+      case -54:
+        return -EPERM; //TODO EXFULL
+      case -55:
+        return -EPERM; //TODO ENOANO
+      case -56:
+        return -EPERM; //TODO EBADRQC
+      case -57:
+        return -EPERM; //TODO EBADSLT
+      case -59:
+        return -EPERM; //TODO EBFONT
+      case -60:
+        return -ENOSTR;
+      case -61:
+        return -ENODATA;
+      case -62:
+        return -ETIME;
+      case -63:
+        return -ENOSR;
+      case -64:
+        return -EPERM; //TODO ENONET
+      case -65:
+        return -EPERM; //TODO ENOPKG
+      case -66:
+        return -EREMOTE;
+      case -67:
+        return -ENOLINK;
+      case -68:
+        return -EPERM; //TODO EADV 
+      case -69:
+        return -EPERM; //TODO ESRMNT 
+      case -70:
+        return -EPERM; //TODO ECOMM
+      case -71:
+        return -EPROTO;
+      case -72:
+        return -EMULTIHOP;
+      case -73:
+        return -EPERM; //TODO EDOTDOT 
+      case -74:
+        return -EBADMSG;
+      case -75:
+        return -EOVERFLOW;
+      case -76:
+        return -EPERM; //TODO ENOTUNIQ
+      case -77:
+        return -EPERM; //TODO EBADFD
+      case -78:
+        return -EPERM; //TODO EREMCHG
+      case -79:
+        return -EPERM; //TODO ELIBACC
+      case -80:
+        return -EPERM; //TODO ELIBBAD 
+      case -81:
+        return -EPERM; //TODO ELIBSCN
+      case -82:
+        return -EPERM; //TODO ELIBMAX
+      case -83:
+	return -EPERM; // TODO ELIBEXEC
+      case -84:
+        return -EILSEQ;
+      case -85:
+        return -ERESTART;
+      case -86:
+        return -EPERM; //ESTRPIPE; 
+      case -87:
+        return -EUSERS;
+      case -88:
+        return -ENOTSOCK;
+      case -89:
+        return -EDESTADDRREQ;
+      case -90:
+        return -EMSGSIZE;
+      case -91:
+        return -EPROTOTYPE;
+      case -92:
+        return -ENOPROTOOPT;
+      case -93:
+        return -EPROTONOSUPPORT;
+      case -94:
+        return -ESOCKTNOSUPPORT;
+      case -95:
+        return -EOPNOTSUPP;
+      case -96:
+        return -EPFNOSUPPORT;
+      case -97:
+        return -EAFNOSUPPORT;
+      case -98:
+        return -EADDRINUSE;
+      case -99:
+        return -EADDRNOTAVAIL;
+      case -100:
+        return -ENETDOWN;
+      case -101:
+        return -ENETUNREACH;
+      case -102:
+        return -ENETRESET;
+      case -103:
+        return -ECONNABORTED;
+      case -104:
+        return -ECONNRESET;
+      case -105:
+        return -ENOBUFS;
+      case -106:
+        return -EISCONN;
+      case -107:
+        return -ENOTCONN;
+      case -108:
+        return -ESHUTDOWN;
+      case -109:
+        return -ETOOMANYREFS;
+      case -110:
+        return -ETIMEDOUT;
+      case -111:
+        return -ECONNREFUSED;
+      case -112:
+        return -EHOSTDOWN;
+      case -113:
+        return -EHOSTUNREACH;
+      case -114:
+        return -EALREADY;
+      case -115:
+        return -EINPROGRESS;
+      case -116:
+        return -ESTALE;
+      case -117:
+        return -EPERM; //TODO EUCLEAN 
+      case -118:
+        return -EPERM; //TODO ENOTNAM
+      case -119:
+        return -EPERM; //TODO ENAVAIL
+      case -120:
+        return -EPERM; //TODO EISNAM
+      case -121:
+        return -EPERM; //TODO EREMOTEIO
+      case -122:
+        return -EDQUOT;
+      case -123:
+        return -EPERM; //TODO ENOMEDIUM
+      case -124:
+        return -EPERM; //TODO EMEDIUMTYPE - not used
+      case -125:
+        return -ECANCELED;
+      case -126:
+        return -EPERM; //TODO ENOKEY
+      case -127:
+        return -EPERM; //TODO EKEYEXPIRED
+      case -128:
+        return -EPERM; //TODO EKEYREVOKED
+      case -129:
+        return -EPERM; //TODO EKEYREJECTED
+      case -130:
+        return -EOWNERDEAD;
+      case -131:
+        return -ENOTRECOVERABLE;
+      case -132:
+        return -EPERM; //TODO ERFKILL
+      case -133:
+        return -EPERM; //TODO EHWPOISON
+
+      default: { 
+        break;
+      }
+    }
+  } 
+  return r; // otherwise return original value
+}
+
+
diff --git a/src/common/buffer.cc b/src/common/buffer.cc
index 6f200bc..10e01a0 100644
--- a/src/common/buffer.cc
+++ b/src/common/buffer.cc
@@ -48,19 +48,39 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER;
 #endif
 
   static atomic_t buffer_total_alloc;
+  static atomic64_t buffer_history_alloc_bytes;
+  static atomic64_t buffer_history_alloc_num;
   const bool buffer_track_alloc = get_env_bool("CEPH_BUFFER_TRACK");
 
-  void buffer::inc_total_alloc(unsigned len) {
+  namespace {
+  void inc_total_alloc(unsigned len) {
     if (buffer_track_alloc)
       buffer_total_alloc.add(len);
   }
-  void buffer::dec_total_alloc(unsigned len) {
+
+  void dec_total_alloc(unsigned len) {
     if (buffer_track_alloc)
       buffer_total_alloc.sub(len);
   }
+
+  void inc_history_alloc(uint64_t len) {
+    if (buffer_track_alloc) {
+      buffer_history_alloc_bytes.add(len);
+      buffer_history_alloc_num.inc();
+    }
+  }
+  }
+
+
   int buffer::get_total_alloc() {
     return buffer_total_alloc.read();
   }
+  uint64_t buffer::get_history_alloc_bytes() {
+    return buffer_history_alloc_bytes.read();
+  }
+  uint64_t buffer::get_history_alloc_num() {
+    return buffer_history_alloc_num.read();
+  }
 
   static atomic_t buffer_cached_crc;
   static atomic_t buffer_cached_crc_adjusted;
@@ -227,6 +247,7 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER;
 	data = 0;
       }
       inc_total_alloc(len);
+      inc_history_alloc(len);
       bdout << "raw_malloc " << this << " alloc " << (void *)data << " " << l << " " << buffer::get_total_alloc() << bendl;
     }
     raw_malloc(unsigned l, char *b) : raw(b, l) {
@@ -251,6 +272,7 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER;
       if (!data)
 	throw bad_alloc();
       inc_total_alloc(len);
+      inc_history_alloc(len);
       bdout << "raw_mmap " << this << " alloc " << (void *)data << " " << l << " " << buffer::get_total_alloc() << bendl;
     }
     ~raw_mmap_pages() {
@@ -280,6 +302,7 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER;
       if (!data)
 	throw bad_alloc();
       inc_total_alloc(len);
+      inc_history_alloc(len);
       bdout << "raw_posix_aligned " << this << " alloc " << (void *)data << " l=" << l << ", align=" << align << " total_alloc=" << buffer::get_total_alloc() << bendl;
     }
     ~raw_posix_aligned() {
@@ -307,6 +330,7 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER;
       else
 	data = realdata;
       inc_total_alloc(len+align-1);
+      inc_history_alloc(len+align-1);
       //cout << "hack aligned " << (unsigned)data
       //<< " in raw " << (unsigned)realdata
       //<< " off " << off << std::endl;
@@ -356,6 +380,7 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER;
       }
 
       inc_total_alloc(len);
+      inc_history_alloc(len);
       bdout << "raw_pipe " << this << " alloc " << len << " "
 	    << buffer::get_total_alloc() << bendl;
     }
@@ -510,6 +535,7 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER;
       else
 	data = 0;
       inc_total_alloc(len);
+      inc_history_alloc(len);
       bdout << "raw_char " << this << " alloc " << (void *)data << " " << l << " " << buffer::get_total_alloc() << bendl;
     }
     raw_char(unsigned l, char *b) : raw(b, l) {
@@ -2098,11 +2124,11 @@ void buffer::list::hexdump(std::ostream &out) const
   out.flags(original_flags);
 }
 
-std::ostream& operator<<(std::ostream& out, const buffer::raw &r) {
+std::ostream& buffer::operator<<(std::ostream& out, const buffer::raw &r) {
   return out << "buffer::raw(" << (void*)r.data << " len " << r.len << " nref " << r.nref.read() << ")";
 }
 
-std::ostream& operator<<(std::ostream& out, const buffer::ptr& bp) {
+std::ostream& buffer::operator<<(std::ostream& out, const buffer::ptr& bp) {
   if (bp.have_raw())
     out << "buffer::ptr(" << bp.offset() << "~" << bp.length()
 	<< " " << (void*)bp.c_str()
@@ -2114,7 +2140,7 @@ std::ostream& operator<<(std::ostream& out, const buffer::ptr& bp) {
   return out;
 }
 
-std::ostream& operator<<(std::ostream& out, const buffer::list& bl) {
+std::ostream& buffer::operator<<(std::ostream& out, const buffer::list& bl) {
   out << "buffer::list(len=" << bl.length() << "," << std::endl;
 
   std::list<buffer::ptr>::const_iterator it = bl.buffers().begin();
@@ -2127,9 +2153,8 @@ std::ostream& operator<<(std::ostream& out, const buffer::list& bl) {
   return out;
 }
 
-std::ostream& operator<<(std::ostream& out, const buffer::error& e)
+std::ostream& buffer::operator<<(std::ostream& out, const buffer::error& e)
 {
   return out << e.what();
 }
-
 }
diff --git a/src/common/ceph_context.cc b/src/common/ceph_context.cc
index 6e60cf7..4ead871 100644
--- a/src/common/ceph_context.cc
+++ b/src/common/ceph_context.cc
@@ -32,6 +32,7 @@
 #include "include/str_list.h"
 #include "common/Mutex.h"
 #include "common/Cond.h"
+#include "common/PluginRegistry.h"
 
 #include <iostream>
 #include <pthread.h>
@@ -413,6 +414,7 @@ CephContext::CephContext(uint32_t module_type_, int init_flags_)
     _heartbeat_map(NULL),
     _crypto_none(NULL),
     _crypto_aes(NULL),
+    _plugin_registry(NULL),
     _lockdep_obs(NULL),
     _cct_perf(NULL)
 {
@@ -438,6 +440,8 @@ CephContext::CephContext(uint32_t module_type_, int init_flags_)
   _admin_socket = new AdminSocket(this);
   _heartbeat_map = new HeartbeatMap(this);
 
+  _plugin_registry = new PluginRegistry(this);
+
   _admin_hook = new CephContextHook(this);
   _admin_socket->register_command("perfcounters_dump", "perfcounters_dump", _admin_hook, "");
   _admin_socket->register_command("1", "1", _admin_hook, "");
@@ -474,6 +478,8 @@ CephContext::~CephContext()
     _cct_perf = NULL;
   }
 
+  delete _plugin_registry;
+
   _admin_socket->unregister_command("perfcounters_dump");
   _admin_socket->unregister_command("perf dump");
   _admin_socket->unregister_command("1");
diff --git a/src/common/ceph_context.h b/src/common/ceph_context.h
index 3820a23..ab187c5 100644
--- a/src/common/ceph_context.h
+++ b/src/common/ceph_context.h
@@ -21,7 +21,7 @@
 #include <set>
 
 #include "include/assert.h"
-#include "include/buffer.h"
+#include "include/buffer_fwd.h"
 #include "include/atomic.h"
 #include "common/cmdparse.h"
 #include "include/Spinlock.h"
@@ -38,6 +38,7 @@ class CephContextObs;
 class CryptoHandler;
 
 namespace ceph {
+  class PluginRegistry;
   class HeartbeatMap;
   namespace log {
     class Log;
@@ -151,6 +152,10 @@ public:
   bool check_experimental_feature_enabled(const std::string& feature,
 					  std::ostream *message);
 
+  PluginRegistry *get_plugin_registry() {
+    return _plugin_registry;
+  }
+
 private:
   struct SingletonWrapper : boost::noncopyable {
     virtual ~SingletonWrapper() {}
@@ -213,6 +218,8 @@ private:
   ceph_spinlock_t _feature_lock;
   std::set<std::string> _experimental_features;
 
+  PluginRegistry *_plugin_registry;
+
   md_config_obs_t *_lockdep_obs;
 
   enum {
diff --git a/src/common/ceph_crypto_cms.h b/src/common/ceph_crypto_cms.h
index 5b0a7f5..11fb000 100644
--- a/src/common/ceph_crypto_cms.h
+++ b/src/common/ceph_crypto_cms.h
@@ -1,7 +1,7 @@
 #ifndef CEPH_CRYPTO_CMS_H
 #define CEPH_CRYPTO_CMS_H
 
-#include "include/buffer.h"
+#include "include/buffer_fwd.h"
 
 class CephContext;
 
diff --git a/src/common/config_opts.h b/src/common/config_opts.h
index a19cc5d..39f6499 100644
--- a/src/common/config_opts.h
+++ b/src/common/config_opts.h
@@ -69,6 +69,8 @@ OPTION(mon_cluster_log_file_level, OPT_STR, "info")
 
 OPTION(enable_experimental_unrecoverable_data_corrupting_features, OPT_STR, "")
 
+OPTION(plugin_dir, OPT_STR, CEPH_PKGLIBDIR)
+
 OPTION(xio_trace_mempool, OPT_BOOL, false) // mempool allocation counters
 OPTION(xio_trace_msgcnt, OPT_BOOL, false) // incoming/outgoing msg counters
 OPTION(xio_trace_xcon, OPT_BOOL, false) // Xio message encode/decode trace
@@ -276,7 +278,8 @@ OPTION(mon_sync_debug_leader, OPT_INT, -1) // monitor to be used as the sync lea
 OPTION(mon_sync_debug_provider, OPT_INT, -1) // monitor to be used as the sync provider
 OPTION(mon_sync_debug_provider_fallback, OPT_INT, -1) // monitor to be used as fallback if sync provider fails
 OPTION(mon_inject_sync_get_chunk_delay, OPT_DOUBLE, 0)  // inject N second delay on each get_chunk request
-OPTION(mon_osd_min_down_reporters, OPT_INT, 2)   // number of OSDs who need to report a down OSD for it to count
+OPTION(mon_osd_min_down_reporters, OPT_INT, 2)   // number of OSDs from different subtrees who need to report a down OSD for it to count
+OPTION(mon_osd_reporter_subtree_level , OPT_STR, "host")   // in which level of parent bucket the reporters are counted
 OPTION(mon_osd_force_trim_to, OPT_INT, 0)   // force mon to trim maps to this point, regardless of min_last_epoch_clean (dangerous, use with care)
 OPTION(mon_mds_force_trim_to, OPT_INT, 0)   // force mon to trim mdsmaps to this point (dangerous, use with care)
 
@@ -992,6 +995,7 @@ OPTION(rbd_request_timed_out_seconds, OPT_INT, 30) // number of seconds before m
 OPTION(rbd_skip_partial_discard, OPT_BOOL, false) // when trying to discard a range inside an object, set to true to skip zeroing the range.
 OPTION(rbd_enable_alloc_hint, OPT_BOOL, true) // when writing a object, it will issue a hint to osd backend to indicate the expected size object need
 OPTION(rbd_tracing, OPT_BOOL, false) // true if LTTng-UST tracepoints should be enabled
+OPTION(rbd_validate_pool, OPT_BOOL, true) // true if empty pools should be validated for RBD compatibility
 
 /*
  * The following options change the behavior for librbd's image creation methods that
@@ -1019,6 +1023,17 @@ OPTION(rbd_default_features, OPT_INT, 3) // only applies to format 2 images
 
 OPTION(rbd_default_map_options, OPT_STR, "") // default rbd map -o / --options
 
+/**
+ * RBD journal options.
+ */
+OPTION(rbd_journal_order, OPT_U32, 24) // bits to shift to compute journal object max size, between 12 and 64
+OPTION(rbd_journal_splay_width, OPT_U32, 4) // number of active journal objects
+OPTION(rbd_journal_commit_age, OPT_DOUBLE, 5) // commit time interval, seconds
+OPTION(rbd_journal_object_flush_interval, OPT_INT, 0) // maximum number of pending commits per journal object
+OPTION(rbd_journal_object_flush_bytes, OPT_INT, 0) // maximum number of pending bytes per journal object
+OPTION(rbd_journal_object_flush_age, OPT_DOUBLE, 0) // maximum age (in seconds) for pending commits
+OPTION(rbd_journal_pool, OPT_STR, "") // pool for journal objects
+
 OPTION(nss_db_path, OPT_STR, "") // path to nss db
 
 
@@ -1153,6 +1168,8 @@ OPTION(rgw_user_default_quota_max_size, OPT_LONGLONG, -1) // Max size of object
 OPTION(rgw_multipart_min_part_size, OPT_INT, 5 * 1024 * 1024) // min size for each part (except for last one) in multipart upload
 OPTION(rgw_multipart_part_upload_limit, OPT_INT, 10000) // parts limit in multipart upload
 
+OPTION(rgw_max_slo_entries, OPT_INT, 1000) // default number of max entries in slo
+
 OPTION(rgw_olh_pending_timeout_sec, OPT_INT, 3600) // time until we retire a pending olh change
 OPTION(rgw_user_max_buckets, OPT_U32, 1000) // global option to set max buckets count for all user
 
diff --git a/src/common/dout.h b/src/common/dout.h
index f00e4f6..22befd6 100644
--- a/src/common/dout.h
+++ b/src/common/dout.h
@@ -48,7 +48,8 @@ inline std::ostream& operator<<(std::ostream& out, _bad_endl_use_dendl_t) {
     if (0) {								\
       char __array[((v >= -1) && (v <= 200)) ? 0 : -1] __attribute__((unused)); \
     }									\
-    ceph::log::Entry *_dout_e = cct->_log->create_entry(v, sub);	\
+    static size_t _log_exp_length=80; \
+    ceph::log::Entry *_dout_e = cct->_log->create_entry(v, sub, &_log_exp_length);	\
     ostream _dout_os(&_dout_e->m_streambuf);				\
     CephContext *_dout_cct = cct;					\
     std::ostream* _dout = &_dout_os;
diff --git a/src/common/entity_name.h b/src/common/entity_name.h
index 2949f1f..e932975 100644
--- a/src/common/entity_name.h
+++ b/src/common/entity_name.h
@@ -20,7 +20,7 @@
 #include <string>
 
 #include "include/encoding.h"
-#include "include/buffer.h"
+#include "include/buffer_fwd.h"
 #include "msg/msg_types.h"
 
 /* Represents a Ceph entity name.
diff --git a/src/common/event_socket.h b/src/common/event_socket.h
new file mode 100644
index 0000000..5c6b40b
--- /dev/null
+++ b/src/common/event_socket.h
@@ -0,0 +1,74 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 XSky <haomai at xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang at gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_COMMON_EVENT_SOCKET_H
+#define CEPH_COMMON_EVENT_SOCKET_H
+
+#include "include/event_type.h"
+
+// Wraps a caller-supplied fd (pipe or eventfd) that is written to in order to signal an event.
+class EventSocket {
+  int socket;  // fd to write to; -1 until init() succeeds
+  int type;    // one of the EVENT_SOCKET_TYPE_* values
+
+ public:
+  EventSocket(): socket(-1), type(EVENT_SOCKET_TYPE_NONE) {}
+  bool is_valid() const { return socket != -1; }
+  // Record fd and its type; returns 0, or -EINVAL if t is not a supported type.
+  int init(int fd, int t) {
+    switch (t) {
+      case EVENT_SOCKET_TYPE_PIPE:
+#ifdef HAVE_EVENTFD
+      case EVENT_SOCKET_TYPE_EVENTFD:
+#endif
+      {
+        socket = fd;
+        type = t;
+        return 0;
+      }
+    }
+    return -EINVAL;
+  }
+  // Write one wakeup token to the fd; returns 0, -errno if write() fails,
+  // or -1 if the socket was never initialised with a supported type.
+  int notify() {
+    int ret;
+    switch (type) {
+      case EVENT_SOCKET_TYPE_PIPE:
+      {
+        // a pipe is signalled with a single byte
+        char buf[1];
+        buf[0] = 'i';
+        ret = write(socket, buf, 1);
+        ret = (ret < 0) ? -errno : 0;
+        break;  // must not fall through into the eventfd case
+      }
+      case EVENT_SOCKET_TYPE_EVENTFD:
+      {
+        // an eventfd is signalled by writing a 64-bit value
+        uint64_t value = 1;
+        ret = write(socket, &value, sizeof (value));
+        ret = (ret < 0) ? -errno : 0;
+        break;  // must not fall through into the error default
+      }
+      default:
+        ret = -1;
+    }
+    return ret;
+  }
+};
+
+#endif
diff --git a/src/common/hobject.h b/src/common/hobject.h
index 4698756..601af40 100644
--- a/src/common/hobject.h
+++ b/src/common/hobject.h
@@ -176,6 +176,8 @@ public:
   }
 
   static uint32_t _reverse_bits(uint32_t v) {
+    if (v == 0)
+      return v;
     // reverse bits
     // swap odd and even bits
     v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1);
diff --git a/src/common/io_priority.cc b/src/common/io_priority.cc
index be4dc2a..579307f 100644
--- a/src/common/io_priority.cc
+++ b/src/common/io_priority.cc
@@ -14,7 +14,9 @@
 
 #include <sys/types.h>
 #include <unistd.h>
+#ifdef __linux__
 #include <sys/syscall.h>   /* For SYS_xxx definitions */
+#endif
 #include <algorithm>
 #include <errno.h>
 
diff --git a/src/common/lockdep.cc b/src/common/lockdep.cc
index 79fd56c..c39dbf0 100644
--- a/src/common/lockdep.cc
+++ b/src/common/lockdep.cc
@@ -53,7 +53,8 @@ static map<int, std::string> lock_names;
 static map<int, int> lock_refs;
 static list<int> free_ids;
 static ceph::unordered_map<pthread_t, map<int,BackTrace*> > held;
-static BackTrace *follows[MAX_LOCKS][MAX_LOCKS];       // follows[a][b] means b taken after a
+static bool follows[MAX_LOCKS][MAX_LOCKS]; // follows[a][b] means b taken after a
+static BackTrace *follows_bt[MAX_LOCKS][MAX_LOCKS];
 
 static bool lockdep_force_backtrace()
 {
@@ -88,9 +89,12 @@ void lockdep_unregister_ceph_context(CephContext *cct)
 
     // blow away all of our state, too, in case it starts up again.
     held.clear();
-    for (unsigned i = 0; i < MAX_LOCKS; ++i)
-      for (unsigned j = 0; j < MAX_LOCKS; ++j)
-	follows[i][j] = NULL;
+    for (unsigned i = 0; i < MAX_LOCKS; ++i) {
+      for (unsigned j = 0; j < MAX_LOCKS; ++j) {
+        follows[i][j] = false;
+        follows_bt[i][j] = NULL;
+      }
+    }
     lock_names.clear();
     lock_ids.clear();
     lock_refs.clear();
@@ -162,11 +166,13 @@ void lockdep_unregister(int id)
   if (--refs == 0) {
     // reset dependency ordering
     for (int i=0; i<MAX_LOCKS; ++i) {
-      delete follows[id][i];
-      follows[id][i] = NULL;
+      delete follows_bt[id][i];
+      follows_bt[id][i] = NULL;
+      follows[id][i] = false;
 
-      delete follows[i][id];
-      follows[i][id] = NULL;
+      delete follows_bt[i][id];
+      follows_bt[i][id] = NULL;
+      follows[i][id] = false;
     }
 
     lockdep_dout(10) << "unregistered '" << p->second << "' from " << id
@@ -191,7 +197,9 @@ static bool does_follow(int a, int b)
     *_dout << "------------------------------------" << "\n";
     *_dout << "existing dependency " << lock_names[a] << " (" << a << ") -> "
            << lock_names[b] << " (" << b << ") at:\n";
-    follows[a][b]->print(*_dout);
+    if (follows_bt[a][b]) {
+      follows_bt[a][b]->print(*_dout);
+    }
     *_dout << dendl;
     return true;
   }
@@ -201,7 +209,9 @@ static bool does_follow(int a, int b)
 	does_follow(i, b)) {
       lockdep_dout(0) << "existing intermediate dependency " << lock_names[a]
           << " (" << a << ") -> " << lock_names[i] << " (" << i << ") at:\n";
-      follows[a][i]->print(*_dout);
+      if (follows_bt[a][i]) {
+        follows_bt[a][i]->print(*_dout);
+      }
       *_dout << dendl;
       return true;
     }
@@ -271,7 +281,8 @@ int lockdep_will_lock(const char *name, int id, bool force_backtrace)
         if (force_backtrace || lockdep_force_backtrace()) {
           bt = new BackTrace(BACKTRACE_SKIP);
         }
-	follows[p->first][id] = bt;
+        follows[p->first][id] = true;
+        follows_bt[p->first][id] = bt;
 	lockdep_dout(10) << lock_names[p->first] << " -> " << name << " at" << dendl;
 	//bt->print(*_dout);
       }
diff --git a/src/common/perf_counters.h b/src/common/perf_counters.h
index 34f4067..74dfa7d 100644
--- a/src/common/perf_counters.h
+++ b/src/common/perf_counters.h
@@ -18,7 +18,6 @@
 
 #include "common/config_obs.h"
 #include "common/Mutex.h"
-#include "include/buffer.h"
 #include "include/utime.h"
 
 #include <stdint.h>
diff --git a/src/crush/CrushCompiler.cc b/src/crush/CrushCompiler.cc
index e16692d..b4c2e88 100644
--- a/src/crush/CrushCompiler.cc
+++ b/src/crush/CrushCompiler.cc
@@ -1,6 +1,10 @@
 
 #include "CrushCompiler.h"
 
+#if defined(_AIX)
+#define EBADE ECORRUPT
+#endif
+
 #ifndef EBADE
 #define EBADE EFTYPE
 #endif
diff --git a/src/crush/CrushTester.cc b/src/crush/CrushTester.cc
index 11b536c..d10aab9 100644
--- a/src/crush/CrushTester.cc
+++ b/src/crush/CrushTester.cc
@@ -359,7 +359,7 @@ int CrushTester::test_with_crushtool(const char *crushtool_cmd,
 				     int max_id, int timeout,
 				     int ruleset)
 {
-  SubProcessTimed crushtool(crushtool_cmd, true, false, true, timeout);
+  SubProcessTimed crushtool(crushtool_cmd, SubProcess::PIPE, SubProcess::CLOSE, SubProcess::PIPE, timeout);
   string opt_max_id = boost::lexical_cast<string>(max_id);
   crushtool.add_cmd_args(
     "-i", "-",
diff --git a/src/erasure-code/ErasureCode.cc b/src/erasure-code/ErasureCode.cc
index d8d5490..6d83d44 100644
--- a/src/erasure-code/ErasureCode.cc
+++ b/src/erasure-code/ErasureCode.cc
@@ -22,6 +22,7 @@
 
 #include "common/strtol.h"
 #include "ErasureCode.h"
+#include "include/buffer.h"
 
 const unsigned ErasureCode::SIMD_ALIGN = 32;
 
diff --git a/src/erasure-code/ErasureCodeInterface.h b/src/erasure-code/ErasureCodeInterface.h
index 5eb5571..3ac2b89 100644
--- a/src/erasure-code/ErasureCodeInterface.h
+++ b/src/erasure-code/ErasureCodeInterface.h
@@ -145,7 +145,7 @@
 #include <vector>
 #include <iostream>
 #include "include/memory.h"
-#include "include/buffer.h"
+#include "include/buffer_fwd.h"
 
 class CrushWrapper;
 
diff --git a/src/global/global_init.cc b/src/global/global_init.cc
index 609c7ea..c0df0ee 100644
--- a/src/global/global_init.cc
+++ b/src/global/global_init.cc
@@ -284,6 +284,7 @@ void global_init_daemonize(CephContext *cct)
   if (global_init_prefork(cct) < 0)
     return;
 
+#if !defined(_AIX)
   int ret = daemon(1, 1);
   if (ret) {
     ret = errno;
@@ -294,6 +295,9 @@ void global_init_daemonize(CephContext *cct)
 
   global_init_postfork_start(cct);
   global_init_postfork_finish(cct);
+#else
+# warning daemon not supported on aix
+#endif
 }
 
 void global_init_postfork_start(CephContext *cct)
diff --git a/src/global/signal_handler.cc b/src/global/signal_handler.cc
index 9e699ad..560f2fd 100644
--- a/src/global/signal_handler.cc
+++ b/src/global/signal_handler.cc
@@ -26,6 +26,11 @@
 #include <sys/stat.h>
 #include <sys/types.h>
 
+#if defined(_AIX)
+extern char *sys_siglist[]; 
+#endif 
+
+
 void install_sighandler(int signum, signal_handler_t handler, int flags)
 {
   int ret;
@@ -40,7 +45,7 @@ void install_sighandler(int signum, signal_handler_t handler, int flags)
   ret = sigaction(signum, &act, &oldact);
   if (ret != 0) {
     char buf[1024];
-#if defined(__sun) 
+#if defined(__sun)
     char message[SIG2STR_MAX];
     sig2str(signum,message);
     snprintf(buf, sizeof(buf), "install_sighandler: sigaction returned "
diff --git a/src/include/Makefile.am b/src/include/Makefile.am
index a364b29..887ac55 100644
--- a/src/include/Makefile.am
+++ b/src/include/Makefile.am
@@ -9,6 +9,7 @@ rados_include_DATA = \
 	$(srcdir)/include/rados/rados_types.hpp \
 	$(srcdir)/include/rados/librados.hpp \
 	$(srcdir)/include/buffer.h \
+	$(srcdir)/include/buffer_fwd.h \
 	$(srcdir)/include/page.h \
 	$(srcdir)/include/crc32c.h \
 	$(srcdir)/include/memory.h
@@ -52,6 +53,7 @@ noinst_HEADERS += \
 	include/bitmapper.h \
 	include/blobhash.h \
 	include/buffer.h \
+	include/buffer_fwd.h \
 	include/byteorder.h \
 	include/cephfs/libcephfs.h \
 	include/ceph_features.h \
@@ -102,6 +104,7 @@ noinst_HEADERS += \
 	include/rados/page.h \
 	include/rados/crc32c.h \
 	include/rados/buffer.h \
+	include/rados/buffer_fwd.h \
 	include/radosstriper/libradosstriper.h \
 	include/radosstriper/libradosstriper.hpp \
 	include/rbd/features.h \
@@ -115,4 +118,5 @@ noinst_HEADERS += \
 	include/rados/memory.h \
 	include/unordered_set.h \
 	include/unordered_map.h \
-	include/timegm.h
+	include/timegm.h \
+	include/event_type.h
diff --git a/src/include/assert.h b/src/include/assert.h
index 89e8753..e13ab9d 100644
--- a/src/include/assert.h
+++ b/src/include/assert.h
@@ -11,7 +11,7 @@
 #elif defined(__FreeBSD__)
 #include <sys/cdefs.h>
 #define	__GNUC_PREREQ(minor, major)	__GNUC_PREREQ__(minor, major)
-#elif defined(__sun)
+#elif defined(__sun) || defined(_AIX)
 #include "include/compat.h"
 #include <assert.h>
 #endif
diff --git a/src/include/buffer.h b/src/include/buffer.h
index 5f90d7b..2962357 100644
--- a/src/include/buffer.h
+++ b/src/include/buffer.h
@@ -47,6 +47,7 @@
 
 #include "page.h"
 #include "crc32c.h"
+#include "buffer_fwd.h"
 
 #ifdef __CEPH__
 # include "include/assert.h"
@@ -69,12 +70,11 @@ namespace ceph {
 
 const static int CEPH_BUFFER_APPEND_SIZE(4096);
 
-class CEPH_BUFFER_API buffer {
+namespace buffer CEPH_BUFFER_API {
   /*
    * exceptions
    */
 
-public:
   struct error : public std::exception{
     const char *what() const throw ();
   };
@@ -99,28 +99,28 @@ public:
 
 
   /// total bytes allocated
-  static int get_total_alloc();
+  int get_total_alloc();
+
+  /// history total bytes allocated
+  uint64_t get_history_alloc_bytes();
+
+  /// total num allocated
+  uint64_t get_history_alloc_num();
 
   /// enable/disable alloc tracking
-  static void track_alloc(bool b);
+  void track_alloc(bool b);
 
   /// count of cached crc hits (matching input)
-  static int get_cached_crc();
+  int get_cached_crc();
   /// count of cached crc hits (mismatching input, required adjustment)
-  static int get_cached_crc_adjusted();
+  int get_cached_crc_adjusted();
   /// enable/disable tracking of cached crcs
-  static void track_cached_crc(bool b);
+  void track_cached_crc(bool b);
 
   /// count of calls to buffer::ptr::c_str()
-  static int get_c_str_accesses();
+  int get_c_str_accesses();
   /// enable/disable tracking of buffer::ptr::c_str() calls
-  static void track_c_str(bool b);
-
-private:
- 
-  /* hack for memory utilization debugging. */
-  static void inc_total_alloc(unsigned len);
-  static void dec_total_alloc(unsigned len);
+  void track_c_str(bool b);
 
   /*
    * an abstract raw buffer.  with a reference count.
@@ -135,25 +135,23 @@ private:
   class raw_pipe;
   class raw_unshareable; // diagnostic, unshareable char buffer
 
-  friend std::ostream& operator<<(std::ostream& out, const raw &r);
 
-public:
   class xio_mempool;
   class xio_msg_buffer;
 
   /*
    * named constructors 
    */
-  static raw* copy(const char *c, unsigned len);
-  static raw* create(unsigned len);
-  static raw* claim_char(unsigned len, char *buf);
-  static raw* create_malloc(unsigned len);
-  static raw* claim_malloc(unsigned len, char *buf);
-  static raw* create_static(unsigned len, char *buf);
-  static raw* create_aligned(unsigned len, unsigned align);
-  static raw* create_page_aligned(unsigned len);
-  static raw* create_zero_copy(unsigned len, int fd, int64_t *offset);
-  static raw* create_unshareable(unsigned len);
+  raw* copy(const char *c, unsigned len);
+  raw* create(unsigned len);
+  raw* claim_char(unsigned len, char *buf);
+  raw* create_malloc(unsigned len);
+  raw* claim_malloc(unsigned len, char *buf);
+  raw* create_static(unsigned len, char *buf);
+  raw* create_aligned(unsigned len, unsigned align);
+  raw* create_page_aligned(unsigned len);
+  raw* create_zero_copy(unsigned len, int fd, int64_t *offset);
+  raw* create_unshareable(unsigned len);
 
 #if defined(HAVE_XIO)
   static raw* create_msg(unsigned len, char *buf, XioDispatchHook *m_hook);
@@ -247,7 +245,6 @@ public:
 
   };
 
-  friend std::ostream& operator<<(std::ostream& out, const buffer::ptr& bp);
 
   /*
    * list - the useful bit!
@@ -360,7 +357,7 @@ public:
       append_buffer = buffer::create(prealloc);
       append_buffer.set_length(0);   // unused, so far.
     }
-    ~list() {}
+
     list(const list& other) : _buffers(other._buffers), _len(other._len),
 			      _memcopy_count(other._memcopy_count), last_p(this) {
       make_shareable();
@@ -555,16 +552,6 @@ public:
       return crc;
     }
   };
-};
-
-#if defined(HAVE_XIO)
-xio_reg_mem* get_xio_mp(const buffer::ptr& bp);
-#endif
-
-typedef buffer::ptr bufferptr;
-typedef buffer::list bufferlist;
-typedef buffer::hash bufferhash;
-
 
 inline bool operator>(bufferlist& l, bufferlist& r) {
   for (unsigned p = 0; ; p++) {
@@ -603,6 +590,7 @@ inline bool operator<=(bufferlist& l, bufferlist& r) {
 
 std::ostream& operator<<(std::ostream& out, const buffer::ptr& bp);
 
+std::ostream& operator<<(std::ostream& out, const raw &r);
 
 std::ostream& operator<<(std::ostream& out, const buffer::list& bl);
 
@@ -614,4 +602,10 @@ inline bufferhash& operator<<(bufferhash& l, bufferlist &r) {
 }
 }
 
+#if defined(HAVE_XIO)
+xio_reg_mem* get_xio_mp(const buffer::ptr& bp);
+#endif
+
+}
+
 #endif
diff --git a/src/include/buffer_fwd.h b/src/include/buffer_fwd.h
new file mode 100644
index 0000000..1646eff
--- /dev/null
+++ b/src/include/buffer_fwd.h
@@ -0,0 +1,17 @@
+#ifndef BUFFER_FWD_H
+#define BUFFER_FWD_H
+
+namespace ceph {
+  namespace buffer {
+    class ptr;
+    class list;
+    class hash;
+  }
+
+  using bufferptr = buffer::ptr;
+  using bufferlist = buffer::list;
+  using bufferhash = buffer::hash;
+}
+
+#endif
+
diff --git a/src/include/byteorder.h b/src/include/byteorder.h
index d6ff8b6..e76d035 100644
--- a/src/include/byteorder.h
+++ b/src/include/byteorder.h
@@ -42,6 +42,10 @@
 # endif
 #endif
 
+#if defined(_AIX)
+# define CEPH_BIG_ENDIAN
+#endif
+
 
 
 
diff --git a/src/include/compat.h b/src/include/compat.h
index dcb5f6f..ec5905c 100644
--- a/src/include/compat.h
+++ b/src/include/compat.h
@@ -53,11 +53,15 @@
 #define lseek64(fd, offset, whence) lseek(fd, offset, whence)
 #endif
 
-#if defined(__sun)
+#if defined(__sun) || defined(_AIX)
 #define LOG_AUTHPRIV    (10<<3)
 #define LOG_FTP         (11<<3)
 #define __STRING(x)     "x"
 #define IFTODT(mode)   (((mode) & 0170000) >> 12)
 #endif
 
+#if defined(_AIX)
+#define MSG_DONTWAIT MSG_NONBLOCK
+#endif
+
 #endif /* !CEPH_COMPAT_H */
diff --git a/src/os/fs/XFS.h b/src/include/event_type.h
similarity index 55%
copy from src/os/fs/XFS.h
copy to src/include/event_type.h
index 1c3c3c4..aa6dded 100644
--- a/src/os/fs/XFS.h
+++ b/src/include/event_type.h
@@ -3,7 +3,9 @@
 /*
  * Ceph - scalable distributed file system
  *
- * Copyright (C) 2014 Red Hat
+ * Copyright (C) 2015 XSky <haomai at xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang at gmail.com>
  *
  * This is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
@@ -12,20 +14,11 @@
  *
  */
 
-#ifndef CEPH_OS_XFS_H
-#define CEPH_OS_XFS_H
-
-#include "FS.h"
-
-# ifndef XFS_SUPER_MAGIC
-static const __SWORD_TYPE XFS_SUPER_MAGIC(0x58465342);
-# endif
+#ifndef CEPH_COMMON_EVENT_TYPE_H
+#define CEPH_COMMON_EVENT_TYPE_H
 
-class XFS : public FS {
-  const char *get_name() {
-    return "xfs";
-  }
-  int set_alloc_hint(int fd, uint64_t hint);
-};
+#define EVENT_SOCKET_TYPE_NONE 0
+#define EVENT_SOCKET_TYPE_PIPE 1
+#define EVENT_SOCKET_TYPE_EVENTFD 2
 
 #endif
diff --git a/src/include/rados/buffer.h b/src/include/rados/buffer.h
index 5f90d7b..2962357 100644
--- a/src/include/rados/buffer.h
+++ b/src/include/rados/buffer.h
@@ -47,6 +47,7 @@
 
 #include "page.h"
 #include "crc32c.h"
+#include "buffer_fwd.h"
 
 #ifdef __CEPH__
 # include "include/assert.h"
@@ -69,12 +70,11 @@ namespace ceph {
 
 const static int CEPH_BUFFER_APPEND_SIZE(4096);
 
-class CEPH_BUFFER_API buffer {
+namespace buffer CEPH_BUFFER_API {
   /*
    * exceptions
    */
 
-public:
   struct error : public std::exception{
     const char *what() const throw ();
   };
@@ -99,28 +99,28 @@ public:
 
 
   /// total bytes allocated
-  static int get_total_alloc();
+  int get_total_alloc();
+
+  /// history total bytes allocated
+  uint64_t get_history_alloc_bytes();
+
+  /// total num allocated
+  uint64_t get_history_alloc_num();
 
   /// enable/disable alloc tracking
-  static void track_alloc(bool b);
+  void track_alloc(bool b);
 
   /// count of cached crc hits (matching input)
-  static int get_cached_crc();
+  int get_cached_crc();
   /// count of cached crc hits (mismatching input, required adjustment)
-  static int get_cached_crc_adjusted();
+  int get_cached_crc_adjusted();
   /// enable/disable tracking of cached crcs
-  static void track_cached_crc(bool b);
+  void track_cached_crc(bool b);
 
   /// count of calls to buffer::ptr::c_str()
-  static int get_c_str_accesses();
+  int get_c_str_accesses();
   /// enable/disable tracking of buffer::ptr::c_str() calls
-  static void track_c_str(bool b);
-
-private:
- 
-  /* hack for memory utilization debugging. */
-  static void inc_total_alloc(unsigned len);
-  static void dec_total_alloc(unsigned len);
+  void track_c_str(bool b);
 
   /*
    * an abstract raw buffer.  with a reference count.
@@ -135,25 +135,23 @@ private:
   class raw_pipe;
   class raw_unshareable; // diagnostic, unshareable char buffer
 
-  friend std::ostream& operator<<(std::ostream& out, const raw &r);
 
-public:
   class xio_mempool;
   class xio_msg_buffer;
 
   /*
    * named constructors 
    */
-  static raw* copy(const char *c, unsigned len);
-  static raw* create(unsigned len);
-  static raw* claim_char(unsigned len, char *buf);
-  static raw* create_malloc(unsigned len);
-  static raw* claim_malloc(unsigned len, char *buf);
-  static raw* create_static(unsigned len, char *buf);
-  static raw* create_aligned(unsigned len, unsigned align);
-  static raw* create_page_aligned(unsigned len);
-  static raw* create_zero_copy(unsigned len, int fd, int64_t *offset);
-  static raw* create_unshareable(unsigned len);
+  raw* copy(const char *c, unsigned len);
+  raw* create(unsigned len);
+  raw* claim_char(unsigned len, char *buf);
+  raw* create_malloc(unsigned len);
+  raw* claim_malloc(unsigned len, char *buf);
+  raw* create_static(unsigned len, char *buf);
+  raw* create_aligned(unsigned len, unsigned align);
+  raw* create_page_aligned(unsigned len);
+  raw* create_zero_copy(unsigned len, int fd, int64_t *offset);
+  raw* create_unshareable(unsigned len);
 
 #if defined(HAVE_XIO)
   static raw* create_msg(unsigned len, char *buf, XioDispatchHook *m_hook);
@@ -247,7 +245,6 @@ public:
 
   };
 
-  friend std::ostream& operator<<(std::ostream& out, const buffer::ptr& bp);
 
   /*
    * list - the useful bit!
@@ -360,7 +357,7 @@ public:
       append_buffer = buffer::create(prealloc);
       append_buffer.set_length(0);   // unused, so far.
     }
-    ~list() {}
+
     list(const list& other) : _buffers(other._buffers), _len(other._len),
 			      _memcopy_count(other._memcopy_count), last_p(this) {
       make_shareable();
@@ -555,16 +552,6 @@ public:
       return crc;
     }
   };
-};
-
-#if defined(HAVE_XIO)
-xio_reg_mem* get_xio_mp(const buffer::ptr& bp);
-#endif
-
-typedef buffer::ptr bufferptr;
-typedef buffer::list bufferlist;
-typedef buffer::hash bufferhash;
-
 
 inline bool operator>(bufferlist& l, bufferlist& r) {
   for (unsigned p = 0; ; p++) {
@@ -603,6 +590,7 @@ inline bool operator<=(bufferlist& l, bufferlist& r) {
 
 std::ostream& operator<<(std::ostream& out, const buffer::ptr& bp);
 
+std::ostream& operator<<(std::ostream& out, const raw &r);
 
 std::ostream& operator<<(std::ostream& out, const buffer::list& bl);
 
@@ -614,4 +602,10 @@ inline bufferhash& operator<<(bufferhash& l, bufferlist &r) {
 }
 }
 
+#if defined(HAVE_XIO)
+xio_reg_mem* get_xio_mp(const buffer::ptr& bp);
+#endif
+
+}
+
 #endif
diff --git a/src/include/rados/buffer_fwd.h b/src/include/rados/buffer_fwd.h
new file mode 100644
index 0000000..1646eff
--- /dev/null
+++ b/src/include/rados/buffer_fwd.h
@@ -0,0 +1,17 @@
+#ifndef BUFFER_FWD_H
+#define BUFFER_FWD_H
+
+namespace ceph {
+  namespace buffer {
+    class ptr;
+    class list;
+    class hash;
+  }
+
+  using bufferptr = buffer::ptr;
+  using bufferlist = buffer::list;
+  using bufferhash = buffer::hash;
+}
+
+#endif
+
diff --git a/src/include/rados/librados.h b/src/include/rados/librados.h
index f13737b..bda583a 100644
--- a/src/include/rados/librados.h
+++ b/src/include/rados/librados.h
@@ -145,7 +145,7 @@ enum {
 typedef void *rados_t;
 
 /**
- * @tyepdef rados_config_t
+ * @typedef rados_config_t
  *
  * A handle for the ceph configuration context for the rados_t cluster
  * instance.  This can be used to share configuration context/state
@@ -789,7 +789,7 @@ CEPH_RADOS_API int rados_pool_get_base_tier(rados_t cluster, int64_t pool,
 CEPH_RADOS_API int rados_pool_delete(rados_t cluster, const char *pool_name);
 
 /**
- * Attempt to change an io context's associated auid "owner."
+ * Attempt to change an io context's associated auid "owner"
  *
  * Requires that you have write permission on both the current and new
  * auid.
diff --git a/src/include/rbd/librbd.h b/src/include/rbd/librbd.h
index ba64219..e2d63f9 100644
--- a/src/include/rbd/librbd.h
+++ b/src/include/rbd/librbd.h
@@ -25,6 +25,7 @@ extern "C" {
 #elif defined(__FreeBSD__)
 #include <sys/types.h>
 #endif
+#include <stdbool.h>
 #include <string.h>
 #include "../rados/librados.h"
 #include "features.h"
@@ -65,6 +66,15 @@ typedef struct {
 #define RBD_MAX_IMAGE_NAME_SIZE 96
 #define RBD_MAX_BLOCK_NAME_SIZE 24
 
+/**
+ * These types are used in rbd_set_image_notification to indicate the type of
+ * event socket passed in.
+ */
+enum {
+  EVENT_TYPE_PIPE = 1,
+  EVENT_TYPE_EVENTFD = 2
+};
+
 typedef struct {
   uint64_t size;
   uint64_t obj_size;
@@ -75,6 +85,12 @@ typedef struct {
   char parent_name[RBD_MAX_IMAGE_NAME_SIZE];  /* deprecated */
 } rbd_image_info_t;
 
+typedef struct {
+  char *cluster_uuid;
+  char *cluster_name;
+  char *client_name;
+} rbd_mirror_peer_t;
+
 CEPH_RBD_API void rbd_version(int *major, int *minor, int *extra);
 
 /* image options */
@@ -84,6 +100,9 @@ enum {
   RBD_IMAGE_OPTION_ORDER = 2,
   RBD_IMAGE_OPTION_STRIPE_UNIT = 3,
   RBD_IMAGE_OPTION_STRIPE_COUNT = 4,
+  RBD_IMAGE_OPTION_JOURNAL_ORDER = 5,
+  RBD_IMAGE_OPTION_JOURNAL_SPLAY_WIDTH = 6,
+  RBD_IMAGE_OPTION_JOURNAL_POOL = 7,
 };
 
 CEPH_RBD_API void rbd_image_options_create(rbd_image_options_t* opts);
@@ -146,6 +165,26 @@ CEPH_RBD_API int rbd_remove_with_progress(rados_ioctx_t io, const char *name,
 CEPH_RBD_API int rbd_rename(rados_ioctx_t src_io_ctx, const char *srcname,
                             const char *destname);
 
+/* pool mirroring */
+CEPH_RBD_API int rbd_mirror_is_enabled(rados_ioctx_t io_ctx, bool *enabled);
+CEPH_RBD_API int rbd_mirror_set_enabled(rados_ioctx_t io_ctx, bool enabled);
+CEPH_RBD_API int rbd_mirror_peer_add(rados_ioctx_t io_ctx,
+                                     const char *cluster_uuid,
+                                     const char *cluster_name,
+                                     const char *client_name);
+CEPH_RBD_API int rbd_mirror_peer_remove(rados_ioctx_t io_ctx,
+                                        const char *cluster_name);
+CEPH_RBD_API int rbd_mirror_peer_list(rados_ioctx_t io_ctx,
+                                      rbd_mirror_peer_t *peers, int *max_peers);
+CEPH_RBD_API void rbd_mirror_peer_list_cleanup(rbd_mirror_peer_t *peers,
+                                               int max_peers);
+CEPH_RBD_API int rbd_mirror_peer_set_client(rados_ioctx_t io_ctx,
+                                            const char *cluster_uuid,
+                                            const char *client_name);
+CEPH_RBD_API int rbd_mirror_peer_set_cluster(rados_ioctx_t io_ctx,
+                                             const char *cluster_uuid,
+                                             const char *cluster_name);
+
 CEPH_RBD_API int rbd_open(rados_ioctx_t io, const char *name,
                           rbd_image_t *image, const char *snap_name);
 
@@ -191,6 +230,7 @@ CEPH_RBD_API int rbd_get_parent_info(rbd_image_t image,
 			             char *parent_snapname,
                                      size_t psnapnamelen);
 CEPH_RBD_API int rbd_get_flags(rbd_image_t image, uint64_t *flags);
+CEPH_RBD_API int rbd_set_image_notification(rbd_image_t image, int fd, int type);
 
 /* exclusive lock feature */
 CEPH_RBD_API int rbd_is_exclusive_lock_owner(rbd_image_t image, int *is_owner);
@@ -471,12 +511,14 @@ CEPH_RBD_API int rbd_aio_read2(rbd_image_t image, uint64_t off, size_t len,
                                char *buf, rbd_completion_t c, int op_flags);
 CEPH_RBD_API int rbd_aio_discard(rbd_image_t image, uint64_t off, uint64_t len,
                                  rbd_completion_t c);
+
 CEPH_RBD_API int rbd_aio_create_completion(void *cb_arg,
                                            rbd_callback_t complete_cb,
                                            rbd_completion_t *c);
 CEPH_RBD_API int rbd_aio_is_complete(rbd_completion_t c);
 CEPH_RBD_API int rbd_aio_wait_for_complete(rbd_completion_t c);
 CEPH_RBD_API ssize_t rbd_aio_get_return_value(rbd_completion_t c);
+CEPH_RBD_API void *rbd_aio_get_arg(rbd_completion_t c);
 CEPH_RBD_API void rbd_aio_release(rbd_completion_t c);
 CEPH_RBD_API int rbd_flush(rbd_image_t image);
 /**
@@ -497,6 +539,8 @@ CEPH_RBD_API int rbd_aio_flush(rbd_image_t image, rbd_completion_t c);
  */
 CEPH_RBD_API int rbd_invalidate_cache(rbd_image_t image);
 
+CEPH_RBD_API int rbd_poll_io_events(rbd_image_t image, rbd_completion_t *comps, int numcomp);
+
 CEPH_RBD_API int rbd_metadata_get(rbd_image_t image, const char *key, char *value, size_t *val_len);
 CEPH_RBD_API int rbd_metadata_set(rbd_image_t image, const char *key, const char *value);
 CEPH_RBD_API int rbd_metadata_remove(rbd_image_t image, const char *key);
diff --git a/src/include/rbd/librbd.hpp b/src/include/rbd/librbd.hpp
index a64ffa6..288e450 100644
--- a/src/include/rbd/librbd.hpp
+++ b/src/include/rbd/librbd.hpp
@@ -46,6 +46,12 @@ namespace librbd {
     std::string address;
   } locker_t;
 
+  typedef struct {
+    std::string cluster_uuid;
+    std::string cluster_name;
+    std::string client_name;
+  } mirror_peer_t;
+
   typedef rbd_image_info_t image_info_t;
 
   class CEPH_RBD_API ProgressContext
@@ -70,6 +76,7 @@ public:
     bool is_complete();
     int wait_for_complete();
     ssize_t get_return_value();
+    void *get_arg();
     void release();
   };
 
@@ -101,6 +108,19 @@ public:
   int remove_with_progress(IoCtx& io_ctx, const char *name, ProgressContext& pctx);
   int rename(IoCtx& src_io_ctx, const char *srcname, const char *destname);
 
+  // RBD pool mirroring support functions
+  int mirror_is_enabled(IoCtx& io_ctx, bool *enabled);
+  int mirror_set_enabled(IoCtx& io_ctx, bool enabled);
+  int mirror_peer_add(IoCtx& io_ctx, const std::string &cluster_uuid,
+                      const std::string &cluster_name,
+                      const std::string &client_name);
+  int mirror_peer_remove(IoCtx& io_ctx, const std::string &cluster_uuid);
+  int mirror_peer_list(IoCtx& io_ctx, std::vector<mirror_peer_t> *peers);
+  int mirror_peer_set_client(IoCtx& io_ctx, const std::string &cluster_uuid,
+                             const std::string &client_name);
+  int mirror_peer_set_cluster(IoCtx& io_ctx, const std::string &cluster_uuid,
+                              const std::string &cluster_name);
+
 private:
   /* We don't allow assignment or copying */
   RBD(const RBD& rhs);
@@ -147,6 +167,7 @@ public:
   int update_features(uint64_t features, bool enabled);
   int overlap(uint64_t *overlap);
   int get_flags(uint64_t *flags);
+  int set_image_notification(int fd, int type);
 
   /* exclusive lock feature */
   int is_exclusive_lock_owner(bool *is_owner);
@@ -283,6 +304,8 @@ public:
    */
   int invalidate_cache();
 
+  int poll_io_events(RBD::AioCompletion **comps, int numcomp);
+
   int metadata_get(const std::string &key, std::string *value);
   int metadata_set(const std::string &key, const std::string &value);
   int metadata_remove(const std::string &key);
diff --git a/src/include/rbd_types.h b/src/include/rbd_types.h
index ad1c1b9..2cee7ee 100644
--- a/src/include/rbd_types.h
+++ b/src/include/rbd_types.h
@@ -58,6 +58,13 @@
 #define RBD_CHILDREN		"rbd_children"
 #define RBD_LOCK_NAME		"rbd_lock"
 
+/**
+ * rbd_pool_settings object in each pool contains pool-specific settings
+ * for configuring features such as async image mirroring to other Ceph
+ * clusters.
+ */
+#define RBD_POOL_SETTINGS       "rbd_pool_settings"
+
 #define RBD_DEFAULT_OBJ_ORDER	22   /* 4MB */
 
 #define RBD_MAX_OBJ_NAME_SIZE	96
diff --git a/src/include/stringify.h b/src/include/stringify.h
index 0a4c4dc..d7b90ed 100644
--- a/src/include/stringify.h
+++ b/src/include/stringify.h
@@ -6,7 +6,12 @@
 
 template<typename T>
 inline std::string stringify(const T& a) {
+#if defined(__GNUC__) && !(defined(__clang__) || defined(__INTEL_COMPILER))
+  static __thread std::ostringstream ss;
+  ss.str(""); ss.clear();  // reused per-thread stream: drop old text AND any sticky error state
+#else
   std::ostringstream ss;
+#endif
   ss << a;
   return ss.str();
 }
diff --git a/src/include/types.h b/src/include/types.h
index bf369f3..aebdc52 100644
--- a/src/include/types.h
+++ b/src/include/types.h
@@ -83,7 +83,7 @@ typedef off_t loff_t;
 typedef off_t off64_t;
 #endif
 
-#ifdef __sun
+#if defined(__sun) || defined(_AIX)
 typedef off_t loff_t;
 #endif
 
@@ -520,7 +520,7 @@ WRITE_EQ_OPERATORS_1(shard_id_t, id)
 WRITE_CMP_OPERATORS_1(shard_id_t, id)
 ostream &operator<<(ostream &lhs, const shard_id_t &rhs);
 
-#if defined(__sun)
+#if defined(__sun) || defined(_AIX)
 __s32  ceph_to_host_errno(__s32 e);
 #else
 #define  ceph_to_host_errno(e) (e)
diff --git a/src/include/utime.h b/src/include/utime.h
index 30780d1..27241e0 100644
--- a/src/include/utime.h
+++ b/src/include/utime.h
@@ -239,12 +239,22 @@ public:
     time_t tt = sec();
     localtime_r(&tt, &bdt);
 
-    return snprintf(out, outlen,
+    return ::snprintf(out, outlen,
 		    "%04d-%02d-%02d %02d:%02d:%02d.%06ld",
 		    bdt.tm_year + 1900, bdt.tm_mon + 1, bdt.tm_mday,
 		    bdt.tm_hour, bdt.tm_min, bdt.tm_sec, usec());
   }
 
+  static int snprintf(char *out, int outlen, time_t tt) {
+    struct tm bdt;
+    localtime_r(&tt, &bdt);
+
+    return ::snprintf(out, outlen,
+        "%04d-%02d-%02d %02d:%02d:%02d",
+        bdt.tm_year + 1900, bdt.tm_mon + 1, bdt.tm_mday,
+        bdt.tm_hour, bdt.tm_min, bdt.tm_sec);
+  }
+
   static int parse_date(const string& date, uint64_t *epoch, uint64_t *nsec,
                         string *out_date=NULL, string *out_time=NULL) {
     struct tm tm;
diff --git a/src/init-ceph.in b/src/init-ceph.in
index a8f7a99..9d4fc71 100755
--- a/src/init-ceph.in
+++ b/src/init-ceph.in
@@ -142,6 +142,10 @@ dofsumount=0
 verbose=0
 use_default_conf=1
 
+## set variables like cluster or conf
+[ -e /etc/sysconfig/ceph ] && . /etc/sysconfig/ceph
+[ -e /etc/default/ceph ] && . /etc/default/ceph
+
 
 while echo $1 | grep -q '^-'; do     # FIXME: why not '^-'?
 case $1 in
@@ -313,7 +317,7 @@ for name in $what; do
 	lockfile=""
     fi
 
-    get_conf asok "$run_dir/ceph-$type.$id.asok" "admin socket"
+    get_conf asok "$run_dir/$cluster-$type.$id.asok" "admin socket"
 
     case "$command" in
 	start)
@@ -335,7 +339,8 @@ for name in $what; do
 	    [ -n "$max_open_files" ] && files="ulimit -n $max_open_files;"
 
 	    if [ -n "$SYSTEMD_RUN" ]; then
-		cmd="$SYSTEMD_RUN -r bash -c '$files $cmd --cluster $cluster --setuser ceph --setgroup ceph -f'"
+                time=`date +%s.%N` 
+		cmd="$SYSTEMD_RUN --unit=ceph-$name.$time -r bash -c '$files $cmd --cluster $cluster --setuser ceph --setgroup ceph -f'"
 	    else
 		cmd="$files $wrap $cmd --cluster $cluster --setuser ceph --setgroup ceph $runmode"
 	    fi
@@ -378,10 +383,10 @@ for name in $what; do
 		do_root_cmd_okfail "mkdir -p $fs_path"
 		if [ "$fs_type" = "btrfs" ]; then
 		    echo Mounting Btrfs on $host:$fs_path
-		    do_root_cmd_okfail "modprobe btrfs ; btrfs device scan || btrfsctl -a ; egrep -q '^[^ ]+ $fs_path ' /proc/mounts || mount -t btrfs $fs_opt $first_dev $fs_path"
+		    do_root_cmd_okfail "modprobe btrfs ; btrfs device scan || btrfsctl -a ; egrep -q '^[^ ]+ $fs_path ' /proc/mounts && umount $fs_path ; mount -t btrfs $fs_opt $first_dev $fs_path"
 		else
 		    echo Mounting $fs_type on $host:$fs_path
-		    do_root_cmd_okfail "modprobe $fs_type ; egrep -q '^[^ ]+ $fs_path ' /proc/mounts || mount -t $fs_type $fs_opt $first_dev $fs_path"
+		    do_root_cmd_okfail "modprobe $fs_type ; egrep -q '^[^ ]+ $fs_path ' /proc/mounts && umount $fs_path ; mount -t $fs_type $fs_opt $first_dev $fs_path"
 		fi
 		if [ "$ERR" != "0" ]; then
 		    EXIT_STATUS=$ERR
@@ -430,8 +435,8 @@ for name in $what; do
 		# these keys.  it's also true for legacy installs
 		# via mkcephfs, which is fine too; there is no harm
 		# in creating these keys.
-		get_conf mon_data "/var/lib/ceph/mon/ceph-$id" "mon data"
-		if [ "$mon_data" = "/var/lib/ceph/mon/ceph-$id" -a "$asok" = "/var/run/ceph/ceph-mon.$id.asok" ]; then
+		get_conf mon_data "/var/lib/ceph/mon/$cluster-$id" "mon data"
+		if [ "$mon_data" = "/var/lib/ceph/mon/$cluster-$id" -a "$asok" = "/var/run/ceph/$cluster-mon.$id.asok" ]; then
 		    echo Starting ceph-create-keys on $host...
 		    cmd2="$SBINDIR/ceph-create-keys --cluster $cluster -i $id 2> /dev/null &"
 		    do_cmd "$cmd2"
diff --git a/src/init-radosgw b/src/init-radosgw
index f18a761..0c37824 100644
--- a/src/init-radosgw
+++ b/src/init-radosgw
@@ -106,7 +106,7 @@ case "$1" in
             else
                 ulimit -n 32768
                 core_limit=`ceph-conf -n $name 'core file limit'`
-                if [ -z $core_limit ]
+                if [ -n "$core_limit" ]; then
                     DAEMON_COREFILE_LIMIT=$core_limit
                 fi
                 daemon --user="$user" "$RADOSGW -n $name"
diff --git a/src/java/Makefile.in b/src/java/Makefile.in
index 3a6408e..a8e9497 100644
--- a/src/java/Makefile.in
+++ b/src/java/Makefile.in
@@ -190,6 +190,7 @@ CXXCPP = @CXXCPP@
 CXXDEPMODE = @CXXDEPMODE@
 CXXFLAGS = @CXXFLAGS@
 CYGPATH_W = @CYGPATH_W@
+CYTHON_CHECK = @CYTHON_CHECK@
 DEFS = @DEFS@
 DEPDIR = @DEPDIR@
 DLLTOOL = @DLLTOOL@
@@ -276,6 +277,7 @@ PYTHON_PLATFORM = @PYTHON_PLATFORM@
 PYTHON_PREFIX = @PYTHON_PREFIX@
 PYTHON_VERSION = @PYTHON_VERSION@
 RANLIB = @RANLIB@
+RDYNAMIC_FLAG = @RDYNAMIC_FLAG@
 RESOLV_LIBS = @RESOLV_LIBS@
 RPM_RELEASE = @RPM_RELEASE@
 SED = @SED@
diff --git a/src/journal/FutureImpl.cc b/src/journal/FutureImpl.cc
index 8365733..0ccb46f 100644
--- a/src/journal/FutureImpl.cc
+++ b/src/journal/FutureImpl.cc
@@ -91,7 +91,7 @@ bool FutureImpl::attach(const FlushHandlerPtr &flush_handler) {
 }
 
 void FutureImpl::safe(int r) {
-  Mutex::Locker locker(m_lock);
+  m_lock.Lock();
   assert(!m_safe);
   m_safe = true;
   if (m_return_value == 0) {
@@ -100,12 +100,14 @@ void FutureImpl::safe(int r) {
 
   m_flush_handler.reset();
   if (m_consistent) {
-    finish();
+    finish_unlock();
+  } else {
+    m_lock.Unlock();
   }
 }
 
 void FutureImpl::consistent(int r) {
-  Mutex::Locker locker(m_lock);
+  m_lock.Lock();
   assert(!m_consistent);
   m_consistent = true;
   m_prev_future.reset();
@@ -114,11 +116,13 @@ void FutureImpl::consistent(int r) {
   }
 
   if (m_safe) {
-    finish();
+    finish_unlock();
+  } else {
+    m_lock.Unlock();
   }
 }
 
-void FutureImpl::finish() {
+void FutureImpl::finish_unlock() {
   assert(m_lock.is_locked());
   assert(m_safe && m_consistent);
 
@@ -130,7 +134,6 @@ void FutureImpl::finish() {
        it != contexts.end(); ++it) {
     (*it)->complete(m_return_value);
   }
-  m_lock.Lock();
 }
 
 std::ostream &operator<<(std::ostream &os, const FutureImpl &future) {
diff --git a/src/journal/FutureImpl.h b/src/journal/FutureImpl.h
index d936805..855c958 100644
--- a/src/journal/FutureImpl.h
+++ b/src/journal/FutureImpl.h
@@ -113,7 +113,7 @@ private:
   Contexts m_contexts;
 
   void consistent(int r);
-  void finish();
+  void finish_unlock();
 };
 
 void intrusive_ptr_add_ref(FutureImpl::FlushHandler *p);
diff --git a/src/journal/JournalMetadata.cc b/src/journal/JournalMetadata.cc
index 409b4b9..0cd935f 100644
--- a/src/journal/JournalMetadata.cc
+++ b/src/journal/JournalMetadata.cc
@@ -186,12 +186,12 @@ void JournalMetadata::set_active_set(uint64_t object_set) {
 
 void JournalMetadata::flush_commit_position() {
   {
+    Mutex::Locker timer_locker(m_timer_lock);
     Mutex::Locker locker(m_lock);
     if (m_commit_position_task_ctx == NULL) {
       return;
     }
 
-    Mutex::Locker timer_locker(m_timer_lock);
     m_timer->cancel_event(m_commit_position_task_ctx);
     m_commit_position_task_ctx = NULL;
   }
@@ -202,23 +202,28 @@ void JournalMetadata::set_commit_position(
     const ObjectSetPosition &commit_position, Context *on_safe) {
   assert(on_safe != NULL);
 
-  Mutex::Locker locker(m_lock);
-  ldout(m_cct, 20) << __func__ << ": current=" << m_client.commit_position
-                   << ", new=" << commit_position << dendl;
-  if (commit_position <= m_client.commit_position ||
-      commit_position <= m_commit_position) {
-    on_safe->complete(-ESTALE);
-    return;
-  }
+  Context *stale_ctx = nullptr;
+  {
+    Mutex::Locker timer_locker(m_timer_lock);
+    Mutex::Locker locker(m_lock);
+    ldout(m_cct, 20) << __func__ << ": current=" << m_client.commit_position
+                     << ", new=" << commit_position << dendl;
+    if (commit_position <= m_client.commit_position ||
+        commit_position <= m_commit_position) {
+      stale_ctx = on_safe;
+    } else {
+      stale_ctx = m_commit_position_ctx;
 
-  if (m_commit_position_ctx != NULL) {
-    m_commit_position_ctx->complete(-ESTALE);
+      m_client.commit_position = commit_position;
+      m_commit_position = commit_position;
+      m_commit_position_ctx = on_safe;
+      schedule_commit_task();
+    }
   }
 
-  m_client.commit_position = commit_position;
-  m_commit_position = commit_position;
-  m_commit_position_ctx = on_safe;
-  schedule_commit_task();
+  if (stale_ctx != nullptr) {
+    stale_ctx->complete(-ESTALE);
+  }
 }
 
 void JournalMetadata::reserve_tid(const std::string &tag, uint64_t tid) {
@@ -298,9 +303,9 @@ void JournalMetadata::handle_refresh_complete(C_Refresh *refresh, int r) {
 }
 
 void JournalMetadata::schedule_commit_task() {
+  assert(m_timer_lock.is_locked());
   assert(m_lock.is_locked());
 
-  Mutex::Locker timer_locker(m_timer_lock);
   if (m_commit_position_task_ctx == NULL) {
     m_commit_position_task_ctx = new C_CommitPositionTask(this);
     m_timer->add_event_after(m_commit_interval, m_commit_position_task_ctx);
@@ -357,8 +362,15 @@ void JournalMetadata::handle_watch_notify(uint64_t notify_id, uint64_t cookie) {
 
 void JournalMetadata::handle_watch_error(int err) {
   lderr(m_cct) << "journal watch error: " << cpp_strerror(err) << dendl;
-  Mutex::Locker locker(m_lock);
   Mutex::Locker timer_locker(m_timer_lock);
+  Mutex::Locker locker(m_lock);
+
+  // release old watch on error
+  if (m_watch_handle != 0) {
+    m_ioctx.unwatch2(m_watch_handle);
+    m_watch_handle = 0;
+  }
+
   if (m_initialized && err != -ENOENT) {
     schedule_watch_reset();
   }
diff --git a/src/journal/JournalPlayer.cc b/src/journal/JournalPlayer.cc
index 2f97158..b78f29c 100644
--- a/src/journal/JournalPlayer.cc
+++ b/src/journal/JournalPlayer.cc
@@ -53,9 +53,9 @@ JournalPlayer::JournalPlayer(librados::IoCtx &ioctx,
                              ReplayHandler *replay_handler)
   : m_cct(NULL), m_object_oid_prefix(object_oid_prefix),
     m_journal_metadata(journal_metadata), m_replay_handler(replay_handler),
-    m_process_state(this), m_lock("JournalPlayer::m_lock"), m_state(STATE_INIT),
-    m_splay_offset(0), m_watch_enabled(false), m_watch_scheduled(false),
-    m_watch_interval(0), m_commit_object(0) {
+    m_lock("JournalPlayer::m_lock"), m_state(STATE_INIT), m_splay_offset(0),
+    m_watch_enabled(false), m_watch_scheduled(false), m_watch_interval(0),
+    m_commit_object(0) {
   m_replay_handler->get();
   m_ioctx.dup(ioctx);
   m_cct = reinterpret_cast<CephContext *>(m_ioctx.cct());
@@ -83,25 +83,29 @@ JournalPlayer::~JournalPlayer() {
 }
 
 void JournalPlayer::prefetch() {
-  m_lock.Lock();
+  Mutex::Locker locker(m_lock);
   assert(m_state == STATE_INIT);
   m_state = STATE_PREFETCH;
 
   uint8_t splay_width = m_journal_metadata->get_splay_width();
-  ldout(m_cct, 10) << __func__ << ": prefetching " << (2 * splay_width) << " "
+  for (uint8_t splay_index = 0; splay_index < splay_width; ++splay_index) {
+    m_prefetch_splay_offsets.insert(splay_index);
+  }
+
+  uint64_t object_set = m_commit_object / splay_width;
+  uint64_t active_set = m_journal_metadata->get_active_set();
+
+  uint32_t object_count = splay_width *
+                          std::min<uint64_t>(2, active_set - object_set + 1);
+  ldout(m_cct, 10) << __func__ << ": prefetching " << object_count << " "
                    << "objects" << dendl;
 
   // prefetch starting from the last known commit set
-  C_PrefetchBatch *ctx = new C_PrefetchBatch(this);
-  uint64_t start_object = (m_commit_object / splay_width) * splay_width;
+  uint64_t start_object = object_set * splay_width;
   for (uint64_t object_number = start_object;
-       object_number < start_object + (2 * splay_width); ++object_number) {
-    ctx->add_fetch();
-    fetch(object_number, ctx);
+       object_number < start_object + object_count; ++object_number) {
+    fetch(object_number);
   }
-  m_lock.Unlock();
-
-  ctx->complete(0);
 }
 
 void JournalPlayer::prefetch_and_watch(double interval) {
@@ -126,6 +130,7 @@ void JournalPlayer::unwatch() {
 }
 
 bool JournalPlayer::try_pop_front(Entry *entry, uint64_t *commit_tid) {
+  ldout(m_cct, 20) << __func__ << dendl;
   Mutex::Locker locker(m_lock);
   if (m_state != STATE_PLAYBACK) {
     return false;
@@ -136,9 +141,12 @@ bool JournalPlayer::try_pop_front(Entry *entry, uint64_t *commit_tid) {
 
   if (object_player->empty()) {
     if (m_watch_enabled && !m_watch_scheduled) {
-      object_player->watch(&m_process_state, m_watch_interval);
+      object_player->watch(
+        new C_Watch(this, object_player->get_object_number()),
+        m_watch_interval);
       m_watch_scheduled = true;
     } else if (!m_watch_enabled && !object_player->is_fetch_in_progress()) {
+      ldout(m_cct, 10) << __func__ << ": replay complete" << dendl;
       m_journal_metadata->get_finisher().queue(new C_HandleComplete(
         m_replay_handler), 0);
     }
@@ -171,11 +179,7 @@ bool JournalPlayer::try_pop_front(Entry *entry, uint64_t *commit_tid) {
     }
   } else {
     advance_splay_object();
-
-    ObjectPlayerPtr next_set_object_player = get_next_set_object_player();
-    if (!next_set_object_player->empty()) {
-      remove_object_player(object_player, &m_process_state);
-    }
+    remove_empty_object_player(object_player);
   }
 
   m_journal_metadata->reserve_tid(entry->get_tag(), entry->get_tid());
@@ -184,18 +188,19 @@ bool JournalPlayer::try_pop_front(Entry *entry, uint64_t *commit_tid) {
   return true;
 }
 
-void JournalPlayer::process_state(int r) {
-  ldout(m_cct, 10) << __func__ << ": r=" << r << dendl;
+void JournalPlayer::process_state(uint64_t object_number, int r) {
+  ldout(m_cct, 10) << __func__ << ": object_num=" << object_number << ", "
+                   << "r=" << r << dendl;
   if (r >= 0) {
     Mutex::Locker locker(m_lock);
     switch (m_state) {
     case STATE_PREFETCH:
       ldout(m_cct, 10) << "PREFETCH" << dendl;
-      r = process_prefetch();
+      r = process_prefetch(object_number);
       break;
     case STATE_PLAYBACK:
       ldout(m_cct, 10) << "PLAYBACK" << dendl;
-      r = process_playback();
+      r = process_playback(object_number);
       break;
     case STATE_ERROR:
       ldout(m_cct, 10) << "ERROR" << dendl;
@@ -216,58 +221,77 @@ void JournalPlayer::process_state(int r) {
   }
 }
 
-int JournalPlayer::process_prefetch() {
-  ldout(m_cct, 10) << __func__ << dendl;
+int JournalPlayer::process_prefetch(uint64_t object_number) {
+  ldout(m_cct, 10) << __func__ << ": object_num=" << object_number << dendl;
   assert(m_lock.is_locked());
 
   uint8_t splay_width = m_journal_metadata->get_splay_width();
-  for (uint8_t splay_offset = 0; splay_offset < splay_width; ++splay_offset) {
-    assert(m_object_players.count(splay_offset) == 1);
+  uint8_t splay_offset = object_number % splay_width;
 
-    ObjectPlayers &object_players = m_object_players[splay_offset];
-    assert(object_players.size() == 2);
+  PrefetchSplayOffsets::iterator it = m_prefetch_splay_offsets.find(
+    splay_offset);
+  if (it == m_prefetch_splay_offsets.end()) {
+    return 0;
+  }
 
-    ObjectPlayerPtr object_player = object_players.begin()->second;
-    assert(!object_player->is_fetch_in_progress());
-
-    ldout(m_cct, 15) << "seeking known commit position in "
-                     << object_player->get_oid() << dendl;
-    Entry entry;
-    while (!m_commit_tids.empty() && !object_player->empty()) {
-      object_player->front(&entry);
-      if (entry.get_tid() > m_commit_tids[entry.get_tag()]) {
-        ldout(m_cct, 10) << "located next uncommitted entry: " << entry
-                         << dendl;
-        break;
-      }
+  bool prefetch_complete = false;
+  assert(m_object_players.count(splay_offset) == 1);
+  ObjectPlayers &object_players = m_object_players[splay_offset];
 
-      ldout(m_cct, 20) << "skipping committed entry: " << entry << dendl;
-      m_journal_metadata->reserve_tid(entry.get_tag(), entry.get_tid());
-      object_player->pop_front();
-    }
+  // prefetch in-order since a newer splay object could prefetch first
+  while (!object_players.begin()->second->is_fetch_in_progress()) {
+    ObjectPlayerPtr object_player = object_players.begin()->second;
 
-    // if this object contains the commit position, our read should start with
-    // the next consistent journal entry in the sequence
-    if (!m_commit_tids.empty() &&
-        object_player->get_object_number() == m_commit_object) {
-      if (object_player->empty()) {
-        advance_splay_object();
-      } else {
-        Entry entry;
+    // skip past known committed records
+    if (!m_commit_tids.empty() && !object_player->empty()) {
+      ldout(m_cct, 15) << "seeking known commit position in "
+                       << object_player->get_oid() << dendl;
+      Entry entry;
+      while (!m_commit_tids.empty() && !object_player->empty()) {
         object_player->front(&entry);
-        if (entry.get_tag() == m_commit_tag) {
+        if (entry.get_tid() > m_commit_tids[entry.get_tag()]) {
+          ldout(m_cct, 10) << "located next uncommitted entry: " << entry
+                           << dendl;
+          break;
+        }
+
+        ldout(m_cct, 20) << "skipping committed entry: " << entry << dendl;
+        m_journal_metadata->reserve_tid(entry.get_tag(), entry.get_tid());
+        object_player->pop_front();
+      }
+
+      // if this object contains the commit position, our read should start with
+      // the next consistent journal entry in the sequence
+      if (!m_commit_tids.empty() &&
+          object_player->get_object_number() == m_commit_object) {
+        if (object_player->empty()) {
           advance_splay_object();
+        } else {
+          Entry entry;
+          object_player->front(&entry);
+          if (entry.get_tag() == m_commit_tag) {
+            advance_splay_object();
+          }
         }
       }
     }
 
-    ObjectPlayerPtr next_set_object_player = get_next_set_object_player();
-    if (object_player->empty() && !next_set_object_player->empty()) {
-      ldout(m_cct, 15) << object_player->get_oid() << " empty" << dendl;
-      remove_object_player(object_player, &m_process_state);
+    // if the object is empty, pre-fetch the next splay object
+    if (!remove_empty_object_player(object_player)) {
+      prefetch_complete = true;
+      break;
     }
   }
 
+  if (!prefetch_complete) {
+    return 0;
+  }
+
+  m_prefetch_splay_offsets.erase(it);
+  if (!m_prefetch_splay_offsets.empty()) {
+    return 0;
+  }
+
   m_state = STATE_PLAYBACK;
   ObjectPlayerPtr object_player = get_object_player();
   if (!object_player->empty()) {
@@ -275,7 +299,9 @@ int JournalPlayer::process_prefetch() {
     m_journal_metadata->get_finisher().queue(new C_HandleEntriesAvailable(
       m_replay_handler), 0);
   } else if (m_watch_enabled) {
-    object_player->watch(&m_process_state, m_watch_interval);
+    object_player->watch(
+      new C_Watch(this, object_player->get_object_number()),
+      m_watch_interval);
     m_watch_scheduled = true;
   } else {
     ldout(m_cct, 10) << __func__ << ": no uncommitted entries available"
@@ -286,17 +312,26 @@ int JournalPlayer::process_prefetch() {
   return 0;
 }
 
-int JournalPlayer::process_playback() {
-  ldout(m_cct, 10) << __func__ << dendl;
+int JournalPlayer::process_playback(uint64_t object_number) {
+  ldout(m_cct, 10) << __func__ << ": object_num=" << object_number << dendl;
   assert(m_lock.is_locked());
 
   m_watch_scheduled = false;
 
   ObjectPlayerPtr object_player = get_object_player();
-  if (!object_player->empty()) {
-    ldout(m_cct, 10) << __func__ << ": entries available" << dendl;
-    m_journal_metadata->get_finisher().queue(new C_HandleEntriesAvailable(
-      m_replay_handler), 0);
+  if (object_player->get_object_number() == object_number) {
+    uint8_t splay_width = m_journal_metadata->get_splay_width();
+    uint64_t active_set = m_journal_metadata->get_active_set();
+    uint64_t object_set = object_player->get_object_number() / splay_width;
+    if (!object_player->empty()) {
+      ldout(m_cct, 10) << __func__ << ": entries available" << dendl;
+      m_journal_metadata->get_finisher().queue(new C_HandleEntriesAvailable(
+        m_replay_handler), 0);
+    } else if (object_set == active_set) {
+      ldout(m_cct, 10) << __func__ << ": replay complete" << dendl;
+      m_journal_metadata->get_finisher().queue(new C_HandleComplete(
+        m_replay_handler), 0);
+    }
   }
   return 0;
 }
@@ -304,14 +339,11 @@ int JournalPlayer::process_playback() {
 const JournalPlayer::ObjectPlayers &JournalPlayer::get_object_players() const {
   assert(m_lock.is_locked());
 
-  assert(m_object_players.count(m_splay_offset) == 1);
   SplayedObjectPlayers::const_iterator it = m_object_players.find(
     m_splay_offset);
   assert(it != m_object_players.end());
 
-  const ObjectPlayers &object_players = it->second;
-  assert(object_players.size() == 2);
-  return object_players;
+  return it->second;
 }
 
 ObjectPlayerPtr JournalPlayer::get_object_player() const {
@@ -336,27 +368,37 @@ void JournalPlayer::advance_splay_object() {
                    << static_cast<uint32_t>(m_splay_offset) << dendl;
 }
 
-void JournalPlayer::remove_object_player(const ObjectPlayerPtr &object_player,
-                                         Context *on_fetch) {
+bool JournalPlayer::remove_empty_object_player(const ObjectPlayerPtr &player) {
   assert(m_lock.is_locked());
 
   uint8_t splay_width = m_journal_metadata->get_splay_width();
+  uint64_t object_set = player->get_object_number() / splay_width;
+  uint64_t active_set = m_journal_metadata->get_active_set();
+  if (!player->empty() || object_set == active_set) {
+    return false;
+  }
+
+  ldout(m_cct, 15) << player->get_oid() << " empty" << dendl;
   ObjectPlayers &object_players = m_object_players[
-    object_player->get_object_number() % splay_width];
+    player->get_object_number() % splay_width];
   assert(!object_players.empty());
-  assert(object_players.begin()->second == object_player);
-  object_players.erase(object_players.begin());
 
-  fetch(object_player->get_object_number() + (2 * splay_width), on_fetch);
+  uint64_t next_object_num = object_players.rbegin()->first + splay_width;
+  uint64_t next_object_set = next_object_num / splay_width;
+  if (next_object_set <= active_set) {
+    fetch(next_object_num);
+  }
+  object_players.erase(player->get_object_number());
+  return true;
 }
 
-void JournalPlayer::fetch(uint64_t object_num, Context *ctx) {
+void JournalPlayer::fetch(uint64_t object_num) {
   assert(m_lock.is_locked());
 
   std::string oid = utils::get_object_name(m_object_oid_prefix, object_num);
 
   ldout(m_cct, 10) << __func__ << ": " << oid << dendl;
-  C_Fetch *fetch_ctx = new C_Fetch(this, object_num, ctx);
+  C_Fetch *fetch_ctx = new C_Fetch(this, object_num);
   ObjectPlayerPtr object_player(new ObjectPlayer(
     m_ioctx, m_object_oid_prefix, object_num, m_journal_metadata->get_timer(),
     m_journal_metadata->get_timer_lock(), m_journal_metadata->get_order()));
@@ -366,42 +408,33 @@ void JournalPlayer::fetch(uint64_t object_num, Context *ctx) {
   object_player->fetch(fetch_ctx);
 }
 
-int JournalPlayer::handle_fetched(int r, uint64_t object_num) {
-  std::string oid = utils::get_object_name(m_object_oid_prefix, object_num);
-
-  ldout(m_cct, 10) << __func__ << ": fetched "
+void JournalPlayer::handle_fetched(uint64_t object_num, int r) {
+  ldout(m_cct, 10) << __func__ << ": "
                    << utils::get_object_name(m_object_oid_prefix, object_num)
                    << ": r=" << r << dendl;
-  if (r < 0 && r != -ENOENT) {
-    return r;
+  if (r == -ENOENT) {
+    r = 0;
   }
-  return 0;
-}
+  if (r == 0) {
+    Mutex::Locker locker(m_lock);
+    uint8_t splay_width = m_journal_metadata->get_splay_width();
+    uint8_t splay_offset = object_num % splay_width;
+    assert(m_object_players.count(splay_offset) == 1);
+    ObjectPlayers &object_players = m_object_players[splay_offset];
 
-JournalPlayer::C_PrefetchBatch::C_PrefetchBatch(JournalPlayer *p)
-  : player(p), lock("JournalPlayer::C_PrefetchBatch::lock"), refs(1),
-    return_value(0) {
-  player->m_async_op_tracker.start_op();
-}
+    assert(object_players.count(object_num) == 1);
+    ObjectPlayerPtr object_player = object_players[object_num];
+    remove_empty_object_player(object_player);
+  }
 
-void JournalPlayer::C_PrefetchBatch::add_fetch() {
-  Mutex::Locker locker(lock);
-  ++refs;
+  process_state(object_num, r);
 }
 
-void JournalPlayer::C_PrefetchBatch::complete(int r) {
-  {
-    Mutex::Locker locker(lock);
-    if (r < 0 && return_value == 0) {
-      return_value = r;
-    }
-    --refs;
-  }
-
-  if (refs == 0) {
-    player->process_state(return_value);
-    delete this;
-  }
+void JournalPlayer::handle_watch(uint64_t object_num, int r) {
+  ldout(m_cct, 10) << __func__ << ": "
+                   << utils::get_object_name(m_object_oid_prefix, object_num)
+                   << ": r=" << r << dendl;
+  process_state(object_num, r);
 }
 
 } // namespace journal
diff --git a/src/journal/JournalPlayer.h b/src/journal/JournalPlayer.h
index 7d48559..49680ad 100644
--- a/src/journal/JournalPlayer.h
+++ b/src/journal/JournalPlayer.h
@@ -39,6 +39,7 @@ public:
   bool try_pop_front(Entry *entry, uint64_t *commit_tid);
 
 private:
+  typedef std::set<uint8_t> PrefetchSplayOffsets;
   typedef std::map<std::string, uint64_t> AllocatedTids;
   typedef std::map<uint64_t, ObjectPlayerPtr> ObjectPlayers;
   typedef std::map<uint8_t, ObjectPlayers> SplayedObjectPlayers;
@@ -50,44 +51,28 @@ private:
     STATE_ERROR
   };
 
-  struct C_ProcessState : public Context {
+  struct C_Watch : public Context {
     JournalPlayer *player;
-    C_ProcessState(JournalPlayer *p) : player(p) {}
-    virtual void complete(int r) {
-      player->process_state(r);
-    }
-    virtual void finish(int r) {}
-  };
-
-  struct C_PrefetchBatch : public Context {
-    JournalPlayer *player;
-    Mutex lock;
-    uint32_t refs;
-    int return_value;
+    uint64_t object_num;
 
-    C_PrefetchBatch(JournalPlayer *p);
-    virtual ~C_PrefetchBatch() {
-      player->m_async_op_tracker.finish_op();
+    C_Watch(JournalPlayer *p, uint64_t o) : player(p), object_num(o) {
+    }
+    virtual void finish(int r) {
+      player->handle_watch(object_num, r);
     }
-    void add_fetch();
-    virtual void complete(int r);
-    virtual void finish(int r) {}
   };
 
   struct C_Fetch : public Context {
     JournalPlayer *player;
     uint64_t object_num;
-    Context *on_fetch;
-    C_Fetch(JournalPlayer *p, uint64_t o, Context *c)
-      : player(p), object_num(o), on_fetch(c) {
+    C_Fetch(JournalPlayer *p, uint64_t o) : player(p), object_num(o) {
       player->m_async_op_tracker.start_op();
     }
     virtual ~C_Fetch() {
       player->m_async_op_tracker.finish_op();
     }
     virtual void finish(int r) {
-      r = player->handle_fetched(r, object_num);
-      on_fetch->complete(r);
+      player->handle_fetched(object_num, r);
     }
   };
 
@@ -98,8 +83,6 @@ private:
 
   ReplayHandler *m_replay_handler;
 
-  C_ProcessState m_process_state;
-
   AsyncOpTracker m_async_op_tracker;
 
   mutable Mutex m_lock;
@@ -110,6 +93,7 @@ private:
   bool m_watch_scheduled;
   double m_watch_interval;
 
+  PrefetchSplayOffsets m_prefetch_splay_offsets;
   SplayedObjectPlayers m_object_players;
   uint64_t m_commit_object;
   std::string m_commit_tag;
@@ -120,15 +104,15 @@ private:
   const ObjectPlayers &get_object_players() const;
   ObjectPlayerPtr get_object_player() const;
   ObjectPlayerPtr get_next_set_object_player() const;
-  void remove_object_player(const ObjectPlayerPtr &object_player,
-                            Context *on_fetch);
+  bool remove_empty_object_player(const ObjectPlayerPtr &object_player);
 
-  void process_state(int r);
-  int process_prefetch();
-  int process_playback();
+  void process_state(uint64_t object_number, int r);
+  int process_prefetch(uint64_t object_number);
+  int process_playback(uint64_t object_number);
 
-  void fetch(uint64_t object_num, Context *ctx);
-  int handle_fetched(int r, uint64_t object_num);
+  void fetch(uint64_t object_num);
+  void handle_fetched(uint64_t object_num, int r);
+  void handle_watch(uint64_t object_num, int r);
 };
 
 } // namespace journal
diff --git a/src/journal/JournalTrimmer.cc b/src/journal/JournalTrimmer.cc
index 33e3be2..9e781ce 100644
--- a/src/journal/JournalTrimmer.cc
+++ b/src/journal/JournalTrimmer.cc
@@ -29,22 +29,29 @@ JournalTrimmer::~JournalTrimmer() {
   m_async_op_tracker.wait_for_ops();
 }
 
-int JournalTrimmer::remove_objects() {
+int JournalTrimmer::remove_objects(bool force) {
   ldout(m_cct, 20) << __func__ << dendl;
   m_async_op_tracker.wait_for_ops();
 
   C_SaferCond ctx;
   {
     Mutex::Locker locker(m_lock);
-    JournalMetadata::RegisteredClients registered_clients;
-    m_journal_metadata->get_registered_clients(&registered_clients);
 
-    if (registered_clients.size() == 0) {
-      return -EINVAL;
-    } else if (registered_clients.size() > 1 || m_remove_set_pending) {
+    if (m_remove_set_pending) {
       return -EBUSY;
     }
 
+    if (!force) {
+      JournalMetadata::RegisteredClients registered_clients;
+      m_journal_metadata->get_registered_clients(&registered_clients);
+
+      if (registered_clients.size() == 0) {
+	return -EINVAL;
+      } else if (registered_clients.size() > 1) {
+	return -EBUSY;
+      }
+    }
+
     m_remove_set = std::numeric_limits<uint64_t>::max();
     m_remove_set_pending = true;
     m_remove_set_ctx = &ctx;
diff --git a/src/journal/JournalTrimmer.h b/src/journal/JournalTrimmer.h
index 9f557a7..46db1c5 100644
--- a/src/journal/JournalTrimmer.h
+++ b/src/journal/JournalTrimmer.h
@@ -22,7 +22,7 @@ public:
                  const JournalMetadataPtr &journal_metadata);
   ~JournalTrimmer();
 
-  int remove_objects();
+  int remove_objects(bool force);
   void committed(uint64_t commit_tid);
 
 private:
diff --git a/src/journal/Journaler.cc b/src/journal/Journaler.cc
index 83862fc..25a50b8 100644
--- a/src/journal/Journaler.cc
+++ b/src/journal/Journaler.cc
@@ -42,6 +42,15 @@ struct C_DeleteRecorder : public Context {
 
 using namespace cls::journal;
 
+std::string Journaler::header_oid(const std::string &journal_id) {
+  return JOURNAL_HEADER_PREFIX + journal_id;
+}
+
+std::string Journaler::object_oid_prefix(int pool_id,
+					 const std::string &journal_id) {
+  return JOURNAL_OBJECT_PREFIX + stringify(pool_id) + "." + journal_id + ".";
+}
+
 Journaler::Journaler(librados::IoCtx &header_ioctx,
 		     const std::string &journal_id,
 		     const std::string &client_id, double commit_interval)
@@ -51,8 +60,8 @@ Journaler::Journaler(librados::IoCtx &header_ioctx,
   m_header_ioctx.dup(header_ioctx);
   m_cct = reinterpret_cast<CephContext *>(m_header_ioctx.cct());
 
-  m_header_oid = JOURNAL_HEADER_PREFIX + journal_id;
-  m_object_oid_prefix = JOURNAL_OBJECT_PREFIX + journal_id + ".";
+  m_header_oid = header_oid(journal_id);
+  m_object_oid_prefix = object_oid_prefix(m_header_ioctx.get_id(), journal_id);
 
   m_metadata = new JournalMetadata(m_header_ioctx, m_header_oid, m_client_id,
                                    commit_interval);
@@ -69,6 +78,16 @@ Journaler::~Journaler() {
   assert(m_recorder == NULL);
 }
 
+int Journaler::exists(bool *header_exists) const {
+  int r = m_header_ioctx.stat(m_header_oid, NULL, NULL);
+  if (r < 0 && r != -ENOENT) {
+    return r;
+  }
+
+  *header_exists = (r == 0);
+  return 0;
+}
+
 void Journaler::init(Context *on_init) {
   m_metadata->init(new C_InitJournaler(this, on_init));
 }
@@ -97,6 +116,10 @@ int Journaler::init_complete() {
   return 0;
 }
 
+void Journaler::shutdown() {
+  m_metadata->shutdown();
+}
+
 int Journaler::create(uint8_t order, uint8_t splay_width, int64_t pool_id) {
   if (order > 64 || order < 12) {
     lderr(m_cct) << "order must be in the range [12, 64]" << dendl;
@@ -116,10 +139,11 @@ int Journaler::create(uint8_t order, uint8_t splay_width, int64_t pool_id) {
   return 0;
 }
 
-int Journaler::remove() {
+int Journaler::remove(bool force) {
   m_metadata->shutdown();
 
-  int r = m_trimmer->remove_objects();
+  ldout(m_cct, 5) << "removing journal: " << m_header_oid << dendl;
+  int r = m_trimmer->remove_objects(force);
   if (r < 0) {
     lderr(m_cct) << "failed to remove journal objects: " << cpp_strerror(r)
                  << dendl;
@@ -154,7 +178,8 @@ void Journaler::start_live_replay(ReplayHandler *replay_handler,
   m_player->prefetch_and_watch(interval);
 }
 
-bool Journaler::try_pop_front(ReplayEntry *replay_entry) {
+bool Journaler::try_pop_front(ReplayEntry *replay_entry,
+			      std::string* tag) {
   assert(m_player != NULL);
 
   Entry entry;
@@ -164,6 +189,9 @@ bool Journaler::try_pop_front(ReplayEntry *replay_entry) {
   }
 
   *replay_entry = ReplayEntry(entry.get_data(), commit_tid);
+  if (tag != NULL) {
+    *tag = entry.get_tag();
+  }
   return true;
 }
 
@@ -215,6 +243,15 @@ void Journaler::create_player(ReplayHandler *replay_handler) {
                                replay_handler);
 }
 
+void Journaler::get_metadata(uint8_t *order, uint8_t *splay_width,
+			     int64_t *pool_id) {
+  assert(m_metadata != NULL);
+
+  *order = m_metadata->get_order();
+  *splay_width = m_metadata->get_splay_width();
+  *pool_id = m_metadata->get_pool_id();
+}
+
 std::ostream &operator<<(std::ostream &os,
 			 const Journaler &journaler) {
   os << "[metadata=";
diff --git a/src/journal/Journaler.h b/src/journal/Journaler.h
index d358218..27f77c7 100644
--- a/src/journal/Journaler.h
+++ b/src/journal/Journaler.h
@@ -5,7 +5,7 @@
 #define CEPH_JOURNAL_JOURNALER_H
 
 #include "include/int_types.h"
-#include "include/buffer.h"
+#include "include/buffer_fwd.h"
 #include "include/Context.h"
 #include "include/rados/librados.hpp"
 #include "journal/Future.h"
@@ -26,21 +26,28 @@ class ReplayHandler;
 
 class Journaler {
 public:
+
+  static std::string header_oid(const std::string &journal_id);
+  static std::string object_oid_prefix(int pool_id,
+				       const std::string &journal_id);
+
   Journaler(librados::IoCtx &header_ioctx, const std::string &journal_id,
 	    const std::string &client_id, double commit_interval);
   ~Journaler();
 
+  int exists(bool *header_exists) const;
   int create(uint8_t order, uint8_t splay_width, int64_t pool_id);
-  int remove();
+  int remove(bool force);
 
   void init(Context *on_init);
+  void shutdown();
 
   int register_client(const std::string &description);
   int unregister_client();
 
   void start_replay(ReplayHandler *replay_handler);
   void start_live_replay(ReplayHandler *replay_handler, double interval);
-  bool try_pop_front(ReplayEntry *replay_entry);
+  bool try_pop_front(ReplayEntry *replay_entry, std::string* tag = NULL);
   void stop_replay();
 
   void start_append(int flush_interval, uint64_t flush_bytes, double flush_age);
@@ -51,6 +58,8 @@ public:
   void committed(const ReplayEntry &replay_entry);
   void committed(const Future &future);
 
+  void get_metadata(uint8_t *order, uint8_t *splay_width, int64_t *pool_id);
+
 private:
   struct C_InitJournaler : public Context {
     Journaler *journaler;
@@ -66,7 +75,7 @@ private:
     }
   };
 
-  librados::IoCtx m_header_ioctx;
+  mutable librados::IoCtx m_header_ioctx;
   librados::IoCtx m_data_ioctx;
   CephContext *m_cct;
   std::string m_client_id;
diff --git a/src/journal/ObjectPlayer.cc b/src/journal/ObjectPlayer.cc
index 939722e..9d58d8e 100644
--- a/src/journal/ObjectPlayer.cc
+++ b/src/journal/ObjectPlayer.cc
@@ -29,6 +29,7 @@ ObjectPlayer::ObjectPlayer(librados::IoCtx &ioctx,
 
 ObjectPlayer::~ObjectPlayer() {
   {
+    Mutex::Locker timer_locker(m_timer_lock);
     Mutex::Locker locker(m_lock);
     assert(!m_fetch_in_progress);
     assert(m_watch_ctx == NULL);
@@ -44,6 +45,7 @@ void ObjectPlayer::fetch(Context *on_finish) {
   C_Fetch *context = new C_Fetch(this, on_finish);
   librados::ObjectReadOperation op;
   op.read(m_read_off, 2 << m_order, &context->read_bl, NULL);
+  op.set_op_flags2(CEPH_OSD_OP_FLAG_FADVISE_DONTNEED);
 
   librados::AioCompletion *rados_completion =
     librados::Rados::aio_create_completion(context, utils::rados_ctx_callback,
@@ -59,7 +61,6 @@ void ObjectPlayer::watch(Context *on_fetch, double interval) {
   Mutex::Locker timer_locker(m_timer_lock);
   m_watch_interval = interval;
 
-  Mutex::Locker locker(m_lock);
   assert(m_watch_ctx == NULL);
   m_watch_ctx = on_fetch;
 
@@ -69,16 +70,12 @@ void ObjectPlayer::watch(Context *on_fetch, double interval) {
 void ObjectPlayer::unwatch() {
   ldout(m_cct, 20) << __func__ << ": " << m_oid << " unwatch" << dendl;
   Mutex::Locker timer_locker(m_timer_lock);
-  Mutex::Locker locker(m_lock);
-
   cancel_watch();
 
   m_watch_ctx = NULL;
-  m_timer_lock.Unlock();
   while (m_watch_in_progress) {
-    m_watch_in_progress_cond.Wait(m_lock);
+    m_watch_in_progress_cond.Wait(m_timer_lock);
   }
-  m_timer_lock.Lock();
 }
 
 void ObjectPlayer::front(Entry *entry) const {
@@ -170,7 +167,6 @@ int ObjectPlayer::handle_fetch_complete(int r, const bufferlist &bl) {
 
 void ObjectPlayer::schedule_watch() {
   assert(m_timer_lock.is_locked());
-  assert(m_lock.is_locked());
   if (m_watch_ctx == NULL) {
     return;
   }
@@ -194,13 +190,10 @@ void ObjectPlayer::handle_watch_task() {
   assert(m_timer_lock.is_locked());
 
   ldout(m_cct, 10) << __func__ << ": " << m_oid << " polling" << dendl;
-  {
-    Mutex::Locker locker(m_lock);
-    assert(m_watch_ctx != NULL);
+  assert(m_watch_ctx != NULL);
 
-    m_watch_in_progress = true;
-    m_watch_task = NULL;
-  }
+  m_watch_in_progress = true;
+  m_watch_task = NULL;
   fetch(new C_WatchFetch(this));
 }
 
@@ -211,7 +204,6 @@ void ObjectPlayer::handle_watch_fetched(int r) {
   Context *on_finish = NULL;
   {
     Mutex::Locker timer_locker(m_timer_lock);
-    Mutex::Locker locker(m_lock);
     assert(m_watch_in_progress);
     if (r == -ENOENT) {
       schedule_watch();
@@ -226,7 +218,7 @@ void ObjectPlayer::handle_watch_fetched(int r) {
   }
 
   {
-    Mutex::Locker locker(m_lock);
+    Mutex::Locker locker(m_timer_lock);
     m_watch_in_progress = false;
     m_watch_in_progress_cond.Signal();
   }
diff --git a/src/journal/ObjectRecorder.cc b/src/journal/ObjectRecorder.cc
index cf96b94..ba8ff4b 100644
--- a/src/journal/ObjectRecorder.cc
+++ b/src/journal/ObjectRecorder.cc
@@ -30,7 +30,7 @@ ObjectRecorder::ObjectRecorder(librados::IoCtx &ioctx, const std::string &oid,
     m_append_task(NULL),
     m_lock(utils::unique_lock_name("ObjectRecorder::m_lock", this)),
     m_append_tid(0), m_pending_bytes(0), m_size(0), m_overflowed(false),
-    m_object_closed(false) {
+    m_object_closed(false), m_in_flight_flushes(false) {
   m_ioctx.dup(ioctx);
   m_cct = reinterpret_cast<CephContext*>(m_ioctx.cct());
   assert(m_overflow_handler != NULL);
@@ -44,11 +44,12 @@ ObjectRecorder::~ObjectRecorder() {
 
 bool ObjectRecorder::append(const AppendBuffers &append_buffers) {
   FutureImplPtr last_flushed_future;
+  bool schedule_append = false;
   {
     Mutex::Locker locker(m_lock);
     for (AppendBuffers::const_iterator iter = append_buffers.begin();
          iter != append_buffers.end(); ++iter) {
-      if (append(*iter)) {
+      if (append(*iter, &schedule_append)) {
         last_flushed_future = iter->first;
       }
     }
@@ -56,6 +57,10 @@ bool ObjectRecorder::append(const AppendBuffers &append_buffers) {
 
   if (last_flushed_future) {
     flush(last_flushed_future);
+  } else if (schedule_append) {
+    schedule_append_task();
+  } else {
+    cancel_append_task();
   }
   return (m_size + m_pending_bytes >= m_soft_max_size);
 }
@@ -63,10 +68,18 @@ bool ObjectRecorder::append(const AppendBuffers &append_buffers) {
 void ObjectRecorder::flush(Context *on_safe) {
   ldout(m_cct, 20) << __func__ << ": " << m_oid << dendl;
 
+  cancel_append_task();
   Future future;
   {
     Mutex::Locker locker(m_lock);
 
+    // if currently handling flush notifications, wait so that
+    // we notify in the correct order (since lock is dropped on
+    // callback)
+    if (m_in_flight_flushes) {
+      m_in_flight_flushes_cond.Wait(m_lock);
+    }
+
     // attach the flush to the most recent append
     if (!m_append_buffers.empty()) {
       future = Future(m_append_buffers.rbegin()->first);
@@ -77,7 +90,6 @@ void ObjectRecorder::flush(Context *on_safe) {
       assert(!append_buffers.empty());
       future = Future(append_buffers.rbegin()->first);
     }
-    cancel_append_task();
   }
 
   if (future.is_valid()) {
@@ -130,11 +142,11 @@ void ObjectRecorder::claim_append_buffers(AppendBuffers *append_buffers) {
 bool ObjectRecorder::close_object() {
   ldout(m_cct, 20) << __func__ << ": " << m_oid << dendl;
 
+  cancel_append_task();
+
   Mutex::Locker locker(m_lock);
   m_object_closed = true;
-  if (flush_appends(true)) {
-    cancel_append_task();
-  }
+  flush_appends(true);
   return m_in_flight_appends.empty();
 }
 
@@ -162,17 +174,16 @@ void ObjectRecorder::schedule_append_task() {
   }
 }
 
-bool ObjectRecorder::append(const AppendBuffer &append_buffer) {
+bool ObjectRecorder::append(const AppendBuffer &append_buffer,
+                            bool *schedule_append) {
   assert(m_lock.is_locked());
 
   bool flush_requested = append_buffer.first->attach(&m_flush_handler);
   m_append_buffers.push_back(append_buffer);
   m_pending_bytes += append_buffer.second.length();
 
-  if (flush_appends(false)) {
-    cancel_append_task();
-  } else {
-    schedule_append_task();
+  if (!flush_appends(false)) {
+    *schedule_append = true;
   }
   return flush_requested;
 }
@@ -202,21 +213,31 @@ void ObjectRecorder::handle_append_flushed(uint64_t tid, int r) {
   ldout(m_cct, 10) << __func__ << ": " << m_oid << " tid=" << tid
                    << ", r=" << r << dendl;
 
-  Mutex::Locker locker(m_lock);
-  InFlightAppends::iterator iter = m_in_flight_appends.find(tid);
-  if (iter == m_in_flight_appends.end()) {
-    // must have seen an overflow on a previous append op
-    assert(m_overflowed);
-    return;
-  } else if (r == -EOVERFLOW) {
-    m_overflowed = true;
-    append_overflowed(tid);
-    return;
-  }
+  AppendBuffers append_buffers;
+  {
+    Mutex::Locker locker(m_lock);
+    InFlightAppends::iterator iter = m_in_flight_appends.find(tid);
+    if (iter == m_in_flight_appends.end()) {
+      // must have seen an overflow on a previous append op
+      assert(m_overflowed);
+      return;
+    } else if (r == -EOVERFLOW) {
+      m_overflowed = true;
+      append_overflowed(tid);
+      return;
+    }
 
-  assert(!m_overflowed || r != 0);
-  AppendBuffers &append_buffers = iter->second;
-  assert(!append_buffers.empty());
+    assert(!m_overflowed || r != 0);
+    append_buffers.swap(iter->second);
+    assert(!append_buffers.empty());
+
+    m_in_flight_appends.erase(iter);
+    if (m_in_flight_appends.empty() && m_object_closed) {
+      // all remaining unsent appends should be redirected to new object
+      notify_overflow();
+    }
+    m_in_flight_flushes = true;
+  }
 
   // Flag the associated futures as complete.
   for (AppendBuffers::iterator buf_it = append_buffers.begin();
@@ -225,12 +246,11 @@ void ObjectRecorder::handle_append_flushed(uint64_t tid, int r) {
                      << dendl;
     buf_it->first->safe(r);
   }
-  m_in_flight_appends.erase(iter);
 
-  if (m_in_flight_appends.empty() && m_object_closed) {
-    // all remaining unsent appends should be redirected to new object
-    notify_overflow();
-  }
+  // wake up any flush requests that raced with a RADOS callback
+  Mutex::Locker locker(m_lock);
+  m_in_flight_flushes = false;
+  m_in_flight_flushes_cond.Signal();
 }
 
 void ObjectRecorder::append_overflowed(uint64_t tid) {
@@ -279,6 +299,7 @@ void ObjectRecorder::send_appends(AppendBuffers *append_buffers) {
                      << dendl;
     it->first->set_flush_in_progress();
     op.append(it->second);
+    op.set_op_flags2(CEPH_OSD_OP_FLAG_FADVISE_DONTNEED);
     m_size += it->second.length();
   }
   m_in_flight_appends[append_tid].swap(*append_buffers);
diff --git a/src/journal/ObjectRecorder.h b/src/journal/ObjectRecorder.h
index 566c41f..2cc8541 100644
--- a/src/journal/ObjectRecorder.h
+++ b/src/journal/ObjectRecorder.h
@@ -6,6 +6,7 @@
 
 #include "include/Context.h"
 #include "include/rados/librados.hpp"
+#include "common/Cond.h"
 #include "common/Mutex.h"
 #include "common/RefCountedObj.h"
 #include "journal/FutureImpl.h"
@@ -131,11 +132,14 @@ private:
 
   bufferlist m_prefetch_bl;
 
+  bool m_in_flight_flushes;
+  Cond m_in_flight_flushes_cond;
+
   void handle_append_task();
   void cancel_append_task();
   void schedule_append_task();
 
-  bool append(const AppendBuffer &append_buffer);
+  bool append(const AppendBuffer &append_buffer, bool *schedule_append);
   bool flush_appends(bool force);
   void handle_append_flushed(uint64_t tid, int r);
   void append_overflowed(uint64_t tid);
diff --git a/src/kv/KineticStore.h b/src/kv/KineticStore.h
index 657dfeb..f275b89 100644
--- a/src/kv/KineticStore.h
+++ b/src/kv/KineticStore.h
@@ -4,7 +4,7 @@
 #define KINETIC_STORE_H
 
 #include "include/types.h"
-#include "include/buffer.h"
+#include "include/buffer_fwd.h"
 #include "KeyValueDB.h"
 #include <set>
 #include <map>
diff --git a/src/kv/LevelDBStore.h b/src/kv/LevelDBStore.h
index c269601..2ce421c 100644
--- a/src/kv/LevelDBStore.h
+++ b/src/kv/LevelDBStore.h
@@ -4,7 +4,7 @@
 #define LEVEL_DB_STORE_H
 
 #include "include/types.h"
-#include "include/buffer.h"
+#include "include/buffer_fwd.h"
 #include "KeyValueDB.h"
 #include <set>
 #include <map>
diff --git a/src/kv/RocksDBStore.h b/src/kv/RocksDBStore.h
index 90523c4..eb2f157 100644
--- a/src/kv/RocksDBStore.h
+++ b/src/kv/RocksDBStore.h
@@ -4,7 +4,7 @@
 #define ROCKS_DB_STORE_H
 
 #include "include/types.h"
-#include "include/buffer.h"
+#include "include/buffer_fwd.h"
 #include "KeyValueDB.h"
 #include <set>
 #include <map>
diff --git a/src/librados/librados.cc b/src/librados/librados.cc
index 403b5b0..ab3486d 100644
--- a/src/librados/librados.cc
+++ b/src/librados/librados.cc
@@ -3446,13 +3446,18 @@ extern "C" int rados_getxattrs_next(rados_xattrs_iter_t iter,
   *name = s.c_str();
   bufferlist &bl(it->i->second);
   size_t bl_len = bl.length();
-  it->val = (char*)malloc(bl_len);
-  if (!it->val) {
-    tracepoint(librados, rados_getxattrs_next_exit, -ENOMEM, *name, NULL, 0);
-    return -ENOMEM;
+  if (!bl_len) {
+    // malloc(0) is not guaranteed to return a valid pointer
+    *val = (char *)NULL;
+  } else {
+    it->val = (char*)malloc(bl_len);
+    if (!it->val) {
+      tracepoint(librados, rados_getxattrs_next_exit, -ENOMEM, *name, NULL, 0);
+      return -ENOMEM;
+    }
+    memcpy(it->val, bl.c_str(), bl_len);
+    *val = it->val;
   }
-  memcpy(it->val, bl.c_str(), bl_len);
-  *val = it->val;
   *len = bl_len;
   ++it->i;
   tracepoint(librados, rados_getxattrs_next_exit, 0, *name, *val, *len);
diff --git a/src/librbd/AioCompletion.cc b/src/librbd/AioCompletion.cc
index ac6754d..9ad318b 100644
--- a/src/librbd/AioCompletion.cc
+++ b/src/librbd/AioCompletion.cc
@@ -28,12 +28,6 @@
 
 namespace librbd {
 
-  void AioCompletion::finish_adding_requests(CephContext *cct)
-  {
-    ldout(cct, 20) << "AioCompletion::finish_adding_requests " << (void*)this << " pending " << pending_count << dendl;
-    unblock(cct);
-  }
-
   int AioCompletion::wait_for_complete() {
     tracepoint(librbd, aio_wait_for_complete_enter, this);
     lock.Lock();
@@ -46,8 +40,9 @@ namespace librbd {
 
   void AioCompletion::finalize(CephContext *cct, ssize_t rval)
   {
-    ldout(cct, 20) << "AioCompletion::finalize() " << (void*)this << " rval " << rval << " read_buf " << (void*)read_buf
-		   << " read_bl " << (void*)read_bl << dendl;
+    ldout(cct, 20) << this << " " << __func__ << ": r=" << rval << ", "
+                   << "read_buf=" << reinterpret_cast<void*>(read_buf) << ", "
+                   << "real_bl=" <<  reinterpret_cast<void*>(read_bl) << dendl;
     if (rval >= 0 && aio_type == AIO_TYPE_READ) {
       // FIXME: make the destriper write directly into a buffer so
       // that we avoid shuffling pointers and copying zeros around.
@@ -57,11 +52,11 @@ namespace librbd {
       if (read_buf) {
 	assert(bl.length() == read_buf_len);
 	bl.copy(0, read_buf_len, read_buf);
-	ldout(cct, 20) << "AioCompletion::finalize() copied resulting " << bl.length()
+	ldout(cct, 20) << "copied resulting " << bl.length()
 		       << " bytes to " << (void*)read_buf << dendl;
       }
       if (read_bl) {
-	ldout(cct, 20) << "AioCompletion::finalize() moving resulting " << bl.length()
+	ldout(cct, 20) << "moving resulting " << bl.length()
 		       << " bytes to bl " << (void*)read_bl << dendl;
 	read_bl->claim(bl);
       }
@@ -90,10 +85,11 @@ namespace librbd {
     // inform the journal that the op has successfully committed
     if (journal_tid != 0) {
       assert(ictx->journal != NULL);
-      ictx->journal->commit_event(journal_tid, rval);
+      ictx->journal->commit_io_event(journal_tid, rval);
     }
 
     // note: possible for image to be closed after op marked finished
+    done = true;
     if (async_op.started()) {
       async_op.finish_op();
     }
@@ -103,7 +99,13 @@ namespace librbd {
       complete_cb(rbd_comp, complete_arg);
       lock.Lock();
     }
-    done = true;
+
+    if (ictx && event_notify && ictx->event_socket.is_valid()) {
+      ictx->completed_reqs_lock.Lock();
+      ictx->completed_reqs.push_back(&m_xlist_item);
+      ictx->completed_reqs_lock.Unlock();
+      ictx->event_socket.notify();
+    }
     cond.Signal();
     tracepoint(librbd, aio_complete_exit);
   }
@@ -118,14 +120,16 @@ namespace librbd {
 
   void AioCompletion::start_op(ImageCtx *i, aio_type_t t) {
     init_time(i, t);
-    if (!async_op.started()) {
+
+    Mutex::Locker locker(lock);
+    if (!done && !async_op.started()) {
       async_op.start_op(*ictx);
     }
   }
 
   void AioCompletion::fail(CephContext *cct, int r)
   {
-    lderr(cct) << "AioCompletion::fail() " << this << ": " << cpp_strerror(r)
+    lderr(cct) << this << " " << __func__ << ": " << cpp_strerror(r)
                << dendl;
     lock.Lock();
     assert(pending_count == 0);
@@ -134,11 +138,19 @@ namespace librbd {
     put_unlock();
   }
 
+  void AioCompletion::set_request_count(CephContext *cct, uint32_t count) {
+    ldout(cct, 20) << this << " " << __func__ << ": pending=" << count << dendl;
+    lock.Lock();
+    assert(pending_count == 0);
+    pending_count = count;
+    lock.Unlock();
+
+    // if no pending requests, completion will fire now
+    unblock(cct);
+  }
+
   void AioCompletion::complete_request(CephContext *cct, ssize_t r)
   {
-    ldout(cct, 20) << "AioCompletion::complete_request() "
-		   << (void *)this << " complete_cb=" << (void *)complete_cb
-		   << " pending " << pending_count << dendl;
     lock.Lock();
     if (rval >= 0) {
       if (r < 0 && r != -EEXIST)
@@ -148,6 +160,9 @@ namespace librbd {
     }
     assert(pending_count);
     int count = --pending_count;
+
+    ldout(cct, 20) << this << " " << __func__ << ": cb=" << complete_cb << ", "
+                   << "pending=" << pending_count << dendl;
     if (!count && blockers == 0) {
       finalize(cct, rval);
       complete(cct);
diff --git a/src/librbd/AioCompletion.h b/src/librbd/AioCompletion.h
index 532f7e2..0d1e226 100644
--- a/src/librbd/AioCompletion.h
+++ b/src/librbd/AioCompletion.h
@@ -10,6 +10,7 @@
 #include "include/rbd/librbd.hpp"
 
 #include "librbd/AsyncOperation.h"
+#include "librbd/ImageCtx.h"
 
 #include "osdc/Striper.h"
 
@@ -34,10 +35,11 @@ namespace librbd {
    *
    * The retrying of individual requests is handled at a lower level,
    * so all AioCompletion cares about is the count of outstanding
-   * requests. Note that this starts at 1 to prevent the reference
-   * count from reaching 0 while more requests are being added. When
-   * all requests have been added, finish_adding_requests() releases
-   * this initial reference.
+   * requests. The number of expected individual requests should be
+   * set initially using set_request_count() prior to issuing the
+   * requests.  This ensures that the completion will not be completed
+   * within the caller's thread of execution (instead via a librados
+   * context or via a thread pool context for cache read hits).
    */
   struct AioCompletion {
     Mutex lock;
@@ -47,7 +49,7 @@ namespace librbd {
     callback_t complete_cb;
     void *complete_arg;
     rbd_completion_t rbd_comp;
-    int pending_count;   ///< number of requests
+    uint32_t pending_count;   ///< number of requests
     uint32_t blockers;
     int ref;
     bool released;
@@ -63,6 +65,32 @@ namespace librbd {
     AsyncOperation async_op;
 
     uint64_t journal_tid;
+    xlist<AioCompletion*>::item m_xlist_item;
+    bool event_notify;
+
+    template <typename T, void (T::*MF)(int)>
+    static void callback_adapter(completion_t cb, void *arg) {
+      AioCompletion *comp = reinterpret_cast<AioCompletion *>(cb);
+      T *t = reinterpret_cast<T *>(arg);
+      (t->*MF)(comp->get_return_value());
+      comp->release();
+    }
+
+    static AioCompletion *create(void *cb_arg, callback_t cb_complete,
+                                 rbd_completion_t rbd_comp) {
+      AioCompletion *comp = new AioCompletion();
+      comp->set_complete_cb(cb_arg, cb_complete);
+      comp->rbd_comp = (rbd_comp != nullptr ? rbd_comp : comp);
+      return comp;
+    }
+
+    template <typename T, void (T::*MF)(int) = &T::complete>
+    static AioCompletion *create(T *obj) {
+      AioCompletion *comp = new AioCompletion();
+      comp->set_complete_cb(obj, &callback_adapter<T, MF>);
+      comp->rbd_comp = comp;
+      return comp;
+    }
 
     AioCompletion() : lock("AioCompletion::lock", true, false),
 		      done(false), rval(0), complete_cb(NULL),
@@ -71,24 +99,16 @@ namespace librbd {
 		      ref(1), released(false), ictx(NULL),
 		      aio_type(AIO_TYPE_NONE),
 		      read_bl(NULL), read_buf(NULL), read_buf_len(0),
-                      journal_tid(0) {
+                      journal_tid(0),
+                      m_xlist_item(this), event_notify(false) {
     }
     ~AioCompletion() {
     }
 
     int wait_for_complete();
 
-    void add_request() {
-      lock.Lock();
-      pending_count++;
-      lock.Unlock();
-      get();
-    }
-
     void finalize(CephContext *cct, ssize_t rval);
 
-    void finish_adding_requests(CephContext *cct);
-
     void init_time(ImageCtx *i, aio_type_t t);
     void start_op(ImageCtx *i, aio_type_t t);
     void fail(CephContext *cct, int r);
@@ -100,6 +120,13 @@ namespace librbd {
       complete_arg = cb_arg;
     }
 
+    void set_request_count(CephContext *cct, uint32_t num);
+    void add_request() {
+      lock.Lock();
+      assert(pending_count > 0);
+      lock.Unlock();
+      get();
+    }
     void complete_request(CephContext *cct, ssize_t r);
 
     void associate_journal_event(uint64_t tid);
@@ -128,8 +155,14 @@ namespace librbd {
       assert(ref > 0);
       int n = --ref;
       lock.Unlock();
-      if (!n)
-	delete this;
+      if (!n) {
+        if (ictx && event_notify) {
+          ictx->completed_reqs_lock.Lock();
+          m_xlist_item.remove_myself();
+          ictx->completed_reqs_lock.Unlock();
+        }
+        delete this;
+      }
     }
 
     void block() {
@@ -145,6 +178,15 @@ namespace librbd {
         complete(cct);
       }
     }
+
+    void set_event_notify(bool s) {
+      Mutex::Locker l(lock);
+      event_notify = s;
+    }
+
+    void *get_arg() {
+      return complete_arg;
+    }
   };
 
   class C_AioRequest : public Context {
diff --git a/src/librbd/AioImageRequest.cc b/src/librbd/AioImageRequest.cc
index 49632b6..104148b 100644
--- a/src/librbd/AioImageRequest.cc
+++ b/src/librbd/AioImageRequest.cc
@@ -4,7 +4,9 @@
 #include "librbd/AioImageRequest.h"
 #include "librbd/AioCompletion.h"
 #include "librbd/AioObjectRequest.h"
+#include "librbd/ExclusiveLock.h"
 #include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
 #include "librbd/ImageWatcher.h"
 #include "librbd/internal.h"
 #include "librbd/Journal.h"
@@ -115,15 +117,14 @@ void AioImageRequest::send() {
                  << "completion=" << m_aio_comp <<  dendl;
 
   m_aio_comp->get();
-  int r = ictx_check(&m_image_ctx, m_image_ctx.owner_lock);
-  if (r < 0) {
-    m_aio_comp->fail(cct, r);
-    return;
-  }
-
   send_request();
 }
 
+void AioImageRequest::fail(int r) {
+  m_aio_comp->get();
+  m_aio_comp->fail(m_image_ctx.cct, r);
+}
+
 void AioImageRead::send_request() {
   CephContext *cct = m_image_ctx.cct;
 
@@ -168,33 +169,39 @@ void AioImageRead::send_request() {
   m_aio_comp->read_buf_len = buffer_ofs;
   m_aio_comp->read_bl = m_pbl;
 
-  for (map<object_t,vector<ObjectExtent> >::iterator p = object_extents.begin();
-       p != object_extents.end(); ++p) {
-    for (vector<ObjectExtent>::iterator q = p->second.begin();
-         q != p->second.end(); ++q) {
-      ldout(cct, 20) << " oid " << q->oid << " " << q->offset << "~"
-                     << q->length << " from " << q->buffer_extents
+  // pre-calculate the expected number of read requests
+  uint32_t request_count = 0;
+  for (auto &object_extent : object_extents) {
+    request_count += object_extent.second.size();
+  }
+  m_aio_comp->set_request_count(cct, request_count);
+
+  // issue the requests
+  for (auto &object_extent : object_extents) {
+    for (auto &extent : object_extent.second) {
+      ldout(cct, 20) << " oid " << extent.oid << " " << extent.offset << "~"
+                     << extent.length << " from " << extent.buffer_extents
                      << dendl;
 
       C_AioRead *req_comp = new C_AioRead(cct, m_aio_comp);
-      AioObjectRead *req = new AioObjectRead(&m_image_ctx, q->oid.name,
-                                             q->objectno, q->offset, q->length,
-                                             q->buffer_extents, snap_id, true,
-                                             req_comp, m_op_flags);
+      AioObjectRead *req = new AioObjectRead(&m_image_ctx, extent.oid.name,
+                                             extent.objectno, extent.offset,
+                                             extent.length,
+                                             extent.buffer_extents, snap_id,
+                                             true, req_comp, m_op_flags);
       req_comp->set_req(req);
 
       if (m_image_ctx.object_cacher) {
         C_CacheRead *cache_comp = new C_CacheRead(&m_image_ctx, req);
-        m_image_ctx.aio_read_from_cache(q->oid, q->objectno, &req->data(),
-                                        q->length, q->offset,
-                                        cache_comp, m_op_flags);
+        m_image_ctx.aio_read_from_cache(extent.oid, extent.objectno,
+                                        &req->data(), extent.length,
+                                        extent.offset, cache_comp, m_op_flags);
       } else {
         req->send();
       }
     }
   }
 
-  m_aio_comp->finish_adding_requests(cct);
   m_aio_comp->put();
 
   m_image_ctx.perfcounter->inc(l_librbd_rd);
@@ -241,24 +248,29 @@ void AbstractAioImageWrite::send_request() {
                   !m_image_ctx.journal->is_journal_replaying());
   }
 
-  assert(!m_image_ctx.image_watcher->is_lock_supported() ||
-          m_image_ctx.image_watcher->is_lock_owner());
+  if (!object_extents.empty()) {
+    m_aio_comp->set_request_count(
+      cct, object_extents.size() + get_cache_request_count(journaling));
 
-  AioObjectRequests requests;
-  send_object_requests(object_extents, snapc, (journaling ? &requests : NULL));
+    AioObjectRequests requests;
+    send_object_requests(object_extents, snapc,
+                         (journaling ? &requests : nullptr));
 
-  if (journaling) {
-    // in-flight ops are flushed prior to closing the journal
-    assert(m_image_ctx.journal != NULL);
-    journal_tid = append_journal_event(requests, m_synchronous);
-  }
+    if (journaling) {
+      // in-flight ops are flushed prior to closing the journal
+      assert(m_image_ctx.journal != NULL);
+      journal_tid = append_journal_event(requests, m_synchronous);
+    }
 
-  if (m_image_ctx.object_cacher != NULL) {
-    send_cache_requests(object_extents, journal_tid);
+    if (m_image_ctx.object_cacher != NULL) {
+      send_cache_requests(object_extents, journal_tid);
+    }
+  } else {
+    // no IO to perform -- fire completion
+    m_aio_comp->unblock(cct);
   }
-  update_stats(clip_len);
 
-  m_aio_comp->finish_adding_requests(cct);
+  update_stats(clip_len);
   m_aio_comp->put();
 }
 
@@ -299,9 +311,9 @@ uint64_t AioImageWrite::append_journal_event(
   bl.append(m_buf, m_len);
 
   journal::EventEntry event_entry(journal::AioWriteEvent(m_off, m_len, bl));
-  uint64_t tid = m_image_ctx.journal->append_event(m_aio_comp, event_entry,
-                                                   requests, m_off, m_len,
-                                                   synchronous);
+  uint64_t tid = m_image_ctx.journal->append_io_event(m_aio_comp, event_entry,
+                                                      requests, m_off, m_len,
+                                                      synchronous);
   if (m_image_ctx.object_cacher == NULL) {
     m_aio_comp->associate_journal_event(tid);
   }
@@ -359,13 +371,18 @@ void AioImageWrite::update_stats(size_t length) {
 uint64_t AioImageDiscard::append_journal_event(
     const AioObjectRequests &requests, bool synchronous) {
   journal::EventEntry event_entry(journal::AioDiscardEvent(m_off, m_len));
-  uint64_t tid = m_image_ctx.journal->append_event(m_aio_comp, event_entry,
-                                                   requests, m_off, m_len,
-                                                   synchronous);
+  uint64_t tid = m_image_ctx.journal->append_io_event(m_aio_comp, event_entry,
+                                                      requests, m_off, m_len,
+                                                      synchronous);
   m_aio_comp->associate_journal_event(tid);
   return tid;
 }
 
+uint32_t AioImageDiscard::get_cache_request_count(bool journaling) const {
+  // extra completion request is required for tracking journal commit
+  return (m_image_ctx.object_cacher != nullptr && journaling ? 1 : 0);
+}
+
 void AioImageDiscard::send_cache_requests(const ObjectExtents &object_extents,
                                           uint64_t journal_tid) {
   if (journal_tid == 0) {
@@ -415,28 +432,32 @@ void AioImageDiscard::update_stats(size_t length) {
 void AioImageFlush::send_request() {
   CephContext *cct = m_image_ctx.cct;
 
+  bool journaling = false;
   {
-    // journal the flush event
     RWLock::RLocker snap_locker(m_image_ctx.snap_lock);
-    if (m_image_ctx.journal != NULL &&
-        !m_image_ctx.journal->is_journal_replaying()) {
-      uint64_t journal_tid = m_image_ctx.journal->append_event(
-        m_aio_comp, journal::EventEntry(journal::AioFlushEvent()),
-        AioObjectRequests(), 0, 0, false);
-
-      C_FlushJournalCommit *ctx = new C_FlushJournalCommit(m_image_ctx,
-                                                           m_aio_comp,
-                                                           journal_tid);
-      m_image_ctx.journal->flush_event(journal_tid, ctx);
-      m_aio_comp->associate_journal_event(journal_tid);
-    }
+    journaling = (m_image_ctx.journal != NULL &&
+                  !m_image_ctx.journal->is_journal_replaying());
+  }
+
+  m_aio_comp->set_request_count(cct, journaling ? 2 : 1);
+
+  if (journaling) {
+    // in-flight ops are flushed prior to closing the journal
+    uint64_t journal_tid = m_image_ctx.journal->append_io_event(
+      m_aio_comp, journal::EventEntry(journal::AioFlushEvent()),
+      AioObjectRequests(), 0, 0, false);
+
+    C_FlushJournalCommit *ctx = new C_FlushJournalCommit(m_image_ctx,
+                                                         m_aio_comp,
+                                                         journal_tid);
+    m_image_ctx.journal->flush_event(journal_tid, ctx);
+    m_aio_comp->associate_journal_event(journal_tid);
   }
 
   C_AioRequest *req_comp = new C_AioRequest(cct, m_aio_comp);
   m_image_ctx.flush(req_comp);
 
   m_aio_comp->start_op(&m_image_ctx, AIO_TYPE_FLUSH);
-  m_aio_comp->finish_adding_requests(cct);
   m_aio_comp->put();
 
   m_image_ctx.perfcounter->inc(l_librbd_aio_flush);
diff --git a/src/librbd/AioImageRequest.h b/src/librbd/AioImageRequest.h
index c6037e6..6ee6d64 100644
--- a/src/librbd/AioImageRequest.h
+++ b/src/librbd/AioImageRequest.h
@@ -5,7 +5,7 @@
 #define CEPH_LIBRBD_AIO_IMAGE_REQUEST_H
 
 #include "include/int_types.h"
-#include "include/buffer.h"
+#include "include/buffer_fwd.h"
 #include "common/snap_types.h"
 #include "osd/osd_types.h"
 #include "librbd/AioCompletion.h"
@@ -40,6 +40,7 @@ public:
   }
 
   void send();
+  void fail(int r);
 
 protected:
   typedef std::list<AioObjectRequest *> AioObjectRequests;
@@ -108,6 +109,9 @@ protected:
 
   virtual void send_request();
 
+  virtual uint32_t get_cache_request_count(bool journaling) const {
+    return 0;
+  }
   virtual void send_cache_requests(const ObjectExtents &object_extents,
                                    uint64_t journal_tid) = 0;
 
@@ -177,6 +181,7 @@ protected:
     return "aio_discard";
   }
 
+  virtual uint32_t get_cache_request_count(bool journaling) const override;
   virtual void send_cache_requests(const ObjectExtents &object_extents,
                                    uint64_t journal_tid);
 
diff --git a/src/librbd/AioImageRequestWQ.cc b/src/librbd/AioImageRequestWQ.cc
index 7898653..9871bb0 100644
--- a/src/librbd/AioImageRequestWQ.cc
+++ b/src/librbd/AioImageRequestWQ.cc
@@ -2,10 +2,14 @@
 // vim: ts=8 sw=2 smarttab
 
 #include "librbd/AioImageRequestWQ.h"
+#include "common/errno.h"
 #include "librbd/AioCompletion.h"
 #include "librbd/AioImageRequest.h"
+#include "librbd/ExclusiveLock.h"
 #include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
 #include "librbd/internal.h"
+#include "librbd/Utils.h"
 
 #define dout_subsys ceph_subsys_rbd
 #undef dout_prefix
@@ -16,15 +20,16 @@ namespace librbd {
 AioImageRequestWQ::AioImageRequestWQ(ImageCtx *image_ctx, const string &name,
                                      time_t ti, ThreadPool *tp)
   : ThreadPool::PointerWQ<AioImageRequest>(name, ti, 0, tp),
-    m_image_ctx(*image_ctx), m_lock("AioImageRequestWQ::m_lock"),
+    m_image_ctx(*image_ctx),
+    m_lock(util::unique_lock_name("AioImageRequestWQ::m_lock", this)),
     m_write_blockers(0), m_in_progress_writes(0), m_queued_writes(0),
-    m_lock_listener(this), m_blocking_writes(false) {
-
+    m_in_flight_ops(0), m_refresh_in_progress(false),
+    m_shutdown(false), m_on_shutdown(nullptr) {
   CephContext *cct = m_image_ctx.cct;
   ldout(cct, 5) << this << " " << ": ictx=" << image_ctx << dendl;
 }
 
-ssize_t AioImageRequestWQ::read(uint64_t off, size_t len, char *buf,
+ssize_t AioImageRequestWQ::read(uint64_t off, uint64_t len, char *buf,
                                 int op_flags) {
   CephContext *cct = m_image_ctx.cct;
   ldout(cct, 20) << "read: ictx=" << &m_image_ctx << ", off=" << off << ", "
@@ -34,12 +39,12 @@ ssize_t AioImageRequestWQ::read(uint64_t off, size_t len, char *buf,
   image_extents.push_back(make_pair(off, len));
 
   C_SaferCond cond;
-  AioCompletion *c = aio_create_completion_internal(&cond, rbd_ctx_cb);
-  aio_read(c, off, len, buf, NULL, op_flags);
+  AioCompletion *c = AioCompletion::create(&cond);
+  aio_read(c, off, len, buf, NULL, op_flags, false);
   return cond.wait();
 }
 
-ssize_t AioImageRequestWQ::write(uint64_t off, size_t len, const char *buf,
+ssize_t AioImageRequestWQ::write(uint64_t off, uint64_t len, const char *buf,
                                  int op_flags) {
   CephContext *cct = m_image_ctx.cct;
   ldout(cct, 20) << "write: ictx=" << &m_image_ctx << ", off=" << off << ", "
@@ -53,8 +58,8 @@ ssize_t AioImageRequestWQ::write(uint64_t off, size_t len, const char *buf,
   }
 
   C_SaferCond cond;
-  AioCompletion *c = aio_create_completion_internal(&cond, rbd_ctx_cb);
-  aio_write(c, off, len, buf, op_flags);
+  AioCompletion *c = AioCompletion::create(&cond);
+  aio_write(c, off, len, buf, op_flags, false);
 
   r = cond.wait();
   if (r < 0) {
@@ -76,8 +81,8 @@ int AioImageRequestWQ::discard(uint64_t off, uint64_t len) {
   }
 
   C_SaferCond cond;
-  AioCompletion *c = aio_create_completion_internal(&cond, rbd_ctx_cb);
-  aio_discard(c, off, len);
+  AioCompletion *c = AioCompletion::create(&cond);
+  aio_discard(c, off, len, false);
 
   r = cond.wait();
   if (r < 0) {
@@ -86,83 +91,153 @@ int AioImageRequestWQ::discard(uint64_t off, uint64_t len) {
   return len;
 }
 
-void AioImageRequestWQ::aio_read(AioCompletion *c, uint64_t off, size_t len,
-                                 char *buf, bufferlist *pbl, int op_flags) {
+void AioImageRequestWQ::aio_read(AioCompletion *c, uint64_t off, uint64_t len,
+                                 char *buf, bufferlist *pbl, int op_flags,
+                                 bool native_async) {
   c->init_time(&m_image_ctx, librbd::AIO_TYPE_READ);
   CephContext *cct = m_image_ctx.cct;
   ldout(cct, 20) << "aio_read: ictx=" << &m_image_ctx << ", "
                  << "completion=" << c << ", off=" << off << ", "
                  << "len=" << len << ", " << "flags=" << op_flags << dendl;
 
+  if (native_async && m_image_ctx.event_socket.is_valid()) {
+    c->set_event_notify(true);
+  }
+
+  if (!start_in_flight_op(c)) {
+    return;
+  }
+
   RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
-  if (m_image_ctx.non_blocking_aio) {
+  if (m_image_ctx.non_blocking_aio || writes_blocked() || !writes_empty()) {
     queue(new AioImageRead(m_image_ctx, c, off, len, buf, pbl, op_flags));
   } else {
     AioImageRequest::aio_read(&m_image_ctx, c, off, len, buf, pbl, op_flags);
+    finish_in_flight_op();
   }
 }
 
-void AioImageRequestWQ::aio_write(AioCompletion *c, uint64_t off, size_t len,
-                                  const char *buf, int op_flags) {
+void AioImageRequestWQ::aio_write(AioCompletion *c, uint64_t off, uint64_t len,
+                                  const char *buf, int op_flags,
+                                  bool native_async) {
   c->init_time(&m_image_ctx, librbd::AIO_TYPE_WRITE);
   CephContext *cct = m_image_ctx.cct;
   ldout(cct, 20) << "aio_write: ictx=" << &m_image_ctx << ", "
                  << "completion=" << c << ", off=" << off << ", "
                  << "len=" << len << ", flags=" << op_flags << dendl;
 
+  if (native_async && m_image_ctx.event_socket.is_valid()) {
+    c->set_event_notify(true);
+  }
+
+  if (!start_in_flight_op(c)) {
+    return;
+  }
+
   RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
   if (m_image_ctx.non_blocking_aio || is_journal_required() ||
       writes_blocked()) {
     queue(new AioImageWrite(m_image_ctx, c, off, len, buf, op_flags));
   } else {
     AioImageRequest::aio_write(&m_image_ctx, c, off, len, buf, op_flags);
+    finish_in_flight_op();
   }
 }
 
 void AioImageRequestWQ::aio_discard(AioCompletion *c, uint64_t off,
-                                    uint64_t len) {
+                                    uint64_t len, bool native_async) {
   c->init_time(&m_image_ctx, librbd::AIO_TYPE_DISCARD);
   CephContext *cct = m_image_ctx.cct;
   ldout(cct, 20) << "aio_discard: ictx=" << &m_image_ctx << ", "
                  << "completion=" << c << ", off=" << off << ", len=" << len
                  << dendl;
 
+  if (native_async && m_image_ctx.event_socket.is_valid()) {
+    c->set_event_notify(true);
+  }
+
+  if (!start_in_flight_op(c)) {
+    return;
+  }
+
   RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
   if (m_image_ctx.non_blocking_aio || is_journal_required() ||
       writes_blocked()) {
     queue(new AioImageDiscard(m_image_ctx, c, off, len));
   } else {
     AioImageRequest::aio_discard(&m_image_ctx, c, off, len);
+    finish_in_flight_op();
   }
 }
 
-void AioImageRequestWQ::aio_flush(AioCompletion *c) {
+void AioImageRequestWQ::aio_flush(AioCompletion *c, bool native_async) {
   c->init_time(&m_image_ctx, librbd::AIO_TYPE_FLUSH);
   CephContext *cct = m_image_ctx.cct;
   ldout(cct, 20) << "aio_flush: ictx=" << &m_image_ctx << ", "
                  << "completion=" << c << dendl;
 
+  if (native_async && m_image_ctx.event_socket.is_valid()) {
+    c->set_event_notify(true);
+  }
+
+  if (!start_in_flight_op(c)) {
+    return;
+  }
+
   RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
   if (m_image_ctx.non_blocking_aio || is_journal_required() ||
       writes_blocked() || !writes_empty()) {
     queue(new AioImageFlush(m_image_ctx, c));
   } else {
     AioImageRequest::aio_flush(&m_image_ctx, c);
+    finish_in_flight_op();
+  }
+}
+
+void AioImageRequestWQ::shut_down(Context *on_shutdown) {
+  assert(m_image_ctx.owner_lock.is_locked());
+
+  {
+    RWLock::WLocker locker(m_lock);
+    assert(!m_shutdown);
+    m_shutdown = true;
+
+    CephContext *cct = m_image_ctx.cct;
+    ldout(cct, 5) << __func__ << ": in_flight=" << m_in_flight_ops.read()
+                  << dendl;
+    if (m_in_flight_ops.read() > 0) {
+      m_on_shutdown = on_shutdown;
+      return;
+    }
   }
+
+  // ensure that all in-flight IO is flushed
+  m_image_ctx.flush(on_shutdown);
 }
 
 void AioImageRequestWQ::block_writes() {
+  C_SaferCond cond_ctx;
+  block_writes(&cond_ctx);
+  cond_ctx.wait();
+}
+
+void AioImageRequestWQ::block_writes(Context *on_blocked) {
+  assert(m_image_ctx.owner_lock.is_locked());
   CephContext *cct = m_image_ctx.cct;
 
-  Mutex::Locker locker(m_lock);
-  ++m_write_blockers;
-  ldout(cct, 5) << __func__ << ": " << &m_image_ctx << ", "
-                << "num=" << m_write_blockers << dendl;
-  if (m_write_blockers == 1) {
-    while (m_in_progress_writes > 0) {
-      m_cond.Wait(m_lock);
+  {
+    RWLock::WLocker locker(m_lock);
+    ++m_write_blockers;
+    ldout(cct, 5) << __func__ << ": " << &m_image_ctx << ", "
+                  << "num=" << m_write_blockers << dendl;
+    if (!m_write_blocker_contexts.empty() || m_in_progress_writes.read() > 0) {
+      m_write_blocker_contexts.push_back(on_blocked);
+      return;
     }
   }
+
+  // ensure that all in-flight IO is flushed
+  m_image_ctx.flush(on_blocked);
 }
 
 void AioImageRequestWQ::unblock_writes() {
@@ -170,7 +245,7 @@ void AioImageRequestWQ::unblock_writes() {
 
   bool wake_up = false;
   {
-    Mutex::Locker locker(m_lock);
+    RWLock::WLocker locker(m_lock);
     assert(m_write_blockers > 0);
     --m_write_blockers;
 
@@ -186,29 +261,34 @@ void AioImageRequestWQ::unblock_writes() {
   }
 }
 
-void AioImageRequestWQ::register_lock_listener() {
-  m_image_ctx.image_watcher->register_listener(&m_lock_listener);
-}
-
 void *AioImageRequestWQ::_void_dequeue() {
   AioImageRequest *peek_item = front();
-  if (peek_item == NULL) {
+  if (peek_item == NULL || m_refresh_in_progress) {
     return NULL;
   }
 
-  {
-    if (peek_item->is_write_op()) {
-      Mutex::Locker locker(m_lock);
-      if (m_write_blockers > 0) {
-        return NULL;
-      }
-      ++m_in_progress_writes;
+  if (peek_item->is_write_op()) {
+    RWLock::RLocker locker(m_lock);
+    if (m_write_blockers > 0) {
+      return NULL;
     }
+    m_in_progress_writes.inc();
   }
 
   AioImageRequest *item = reinterpret_cast<AioImageRequest *>(
     ThreadPool::PointerWQ<AioImageRequest>::_void_dequeue());
   assert(peek_item == item);
+
+  if (m_image_ctx.state->is_refresh_required()) {
+    ldout(m_image_ctx.cct, 15) << "image refresh required: delaying IO " << item
+                               << dendl;
+    m_refresh_in_progress = true;
+
+    get_pool_lock().Unlock();
+    m_image_ctx.state->refresh(new C_RefreshFinish(this, item));
+    get_pool_lock().Lock();
+    return NULL;
+  }
   return item;
 }
 
@@ -222,34 +302,75 @@ void AioImageRequestWQ::process(AioImageRequest *req) {
     req->send();
   }
 
+  bool writes_blocked = false;
   {
-    Mutex::Locker locker(m_lock);
+    RWLock::RLocker locker(m_lock);
     if (req->is_write_op()) {
-      assert(m_queued_writes > 0);
-      --m_queued_writes;
+      assert(m_queued_writes.read() > 0);
+      m_queued_writes.dec();
 
-      assert(m_in_progress_writes > 0);
-      if (--m_in_progress_writes == 0) {
-        m_cond.Signal();
+      assert(m_in_progress_writes.read() > 0);
+      if (m_in_progress_writes.dec() == 0 &&
+          !m_write_blocker_contexts.empty()) {
+        writes_blocked = true;
       }
     }
   }
+
+  if (writes_blocked) {
+    RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
+    m_image_ctx.flush(new C_BlockedWrites(this));
+  }
   delete req;
+
+  finish_in_flight_op();
+}
+
+int AioImageRequestWQ::start_in_flight_op(AioCompletion *c) {
+  RWLock::RLocker locker(m_lock);
+
+  if (m_shutdown) {
+    CephContext *cct = m_image_ctx.cct;
+    lderr(cct) << "IO received on closed image" << dendl;
+
+    c->get();
+    c->fail(cct, -ESHUTDOWN);
+    return false;
+  }
+
+  m_in_flight_ops.inc();
+  return true;
+}
+
+void AioImageRequestWQ::finish_in_flight_op() {
+  {
+    RWLock::RLocker locker(m_lock);
+    if (m_in_flight_ops.dec() > 0 || !m_shutdown) {
+      return;
+    }
+  }
+
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 5) << __func__ << ": completing shut down" << dendl;
+
+  RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
+  assert(m_on_shutdown != nullptr);
+  m_image_ctx.flush(m_on_shutdown);
 }
 
 bool AioImageRequestWQ::is_journal_required() const {
+  // TODO eliminate once journal startup state is integrated
   RWLock::RLocker snap_locker(m_image_ctx.snap_lock);
   return (m_image_ctx.journal != NULL);
 }
 
 bool AioImageRequestWQ::is_lock_required() const {
   assert(m_image_ctx.owner_lock.is_locked());
-  if (m_image_ctx.image_watcher == NULL) {
+  if (m_image_ctx.exclusive_lock == NULL) {
     return false;
   }
 
-  return (m_image_ctx.image_watcher->is_lock_supported() &&
-          !m_image_ctx.image_watcher->is_lock_owner());
+  return (!m_image_ctx.exclusive_lock->is_lock_owner());
 }
 
 void AioImageRequestWQ::queue(AioImageRequest *req) {
@@ -258,45 +379,42 @@ void AioImageRequestWQ::queue(AioImageRequest *req) {
                  << "req=" << req << dendl;
 
   assert(m_image_ctx.owner_lock.is_locked());
-
-  bool first_write_op = false;
-  {
-    Mutex::Locker locker(m_lock);
-    if (req->is_write_op()) {
-      if (++m_queued_writes == 1) {
-        first_write_op = true;
-      }
-    }
+  bool write_op = req->is_write_op();
+  if (write_op) {
+    m_queued_writes.inc();
   }
+
   ThreadPool::PointerWQ<AioImageRequest>::queue(req);
 
-  if (is_lock_required() && first_write_op) {
-    m_image_ctx.image_watcher->request_lock();
+  if (write_op && is_lock_required()) {
+    m_image_ctx.exclusive_lock->request_lock(nullptr);
   }
 }
 
-void AioImageRequestWQ::handle_lock_updated(
-    ImageWatcher::LockUpdateState state) {
-  assert(m_image_ctx.owner_lock.is_locked());
-
+void AioImageRequestWQ::handle_refreshed(int r, AioImageRequest *req) {
   CephContext *cct = m_image_ctx.cct;
-  ldout(cct, 20) << __func__ << ": ictx=" << &m_image_ctx << ", "
-                 << "state=" << state << dendl;
-
-  if ((state == ImageWatcher::LOCK_UPDATE_STATE_NOT_SUPPORTED ||
-       state == ImageWatcher::LOCK_UPDATE_STATE_LOCKED) && m_blocking_writes) {
-    m_blocking_writes = false;
-    unblock_writes();
-  } else if (state == ImageWatcher::LOCK_UPDATE_STATE_RELEASING &&
-             !m_blocking_writes) {
-    m_blocking_writes = true;
-    block_writes();
-  } else if (state == ImageWatcher::LOCK_UPDATE_STATE_UNLOCKED) {
-    assert(m_blocking_writes);
-    assert(writes_blocked());
-  } else if (state == ImageWatcher::LOCK_UPDATE_STATE_NOTIFICATION &&
-             !writes_empty()) {
-    m_image_ctx.image_watcher->request_lock();
+  ldout(cct, 15) << "resuming IO after image refresh: r=" << r << ", "
+                 << "req=" << req << dendl;
+  if (r < 0) {
+    req->fail(r);
+  } else {
+    process(req);
+    process_finish();
+
+    m_refresh_in_progress = false;
+    signal();
+  }
+}
+
+void AioImageRequestWQ::handle_blocked_writes(int r) {
+  Contexts contexts;
+  {
+    RWLock::WLocker locker(m_lock);
+    contexts.swap(m_write_blocker_contexts);
+  }
+
+  for (auto ctx : contexts) {
+    ctx->complete(0);
   }
 }
 
diff --git a/src/librbd/AioImageRequestWQ.h b/src/librbd/AioImageRequestWQ.h
index 20169f5..c845489 100644
--- a/src/librbd/AioImageRequestWQ.h
+++ b/src/librbd/AioImageRequestWQ.h
@@ -5,9 +5,11 @@
 #define CEPH_LIBRBD_AIO_IMAGE_REQUEST_WQ_H
 
 #include "include/Context.h"
+#include "include/atomic.h"
+#include "common/Cond.h"
+#include "common/RWLock.h"
 #include "common/WorkQueue.h"
-#include "common/Mutex.h"
-#include "librbd/ImageWatcher.h"
+#include <list>
 
 namespace librbd {
 
@@ -20,69 +22,89 @@ public:
   AioImageRequestWQ(ImageCtx *image_ctx, const string &name, time_t ti,
                     ThreadPool *tp);
 
-  ssize_t read(uint64_t off, size_t len, char *buf, int op_flags);
-  ssize_t write(uint64_t off, size_t len, const char *buf, int op_flags);
+  ssize_t read(uint64_t off, uint64_t len, char *buf, int op_flags);
+  ssize_t write(uint64_t off, uint64_t len, const char *buf, int op_flags);
   int discard(uint64_t off, uint64_t len);
 
-  void aio_read(AioCompletion *c, uint64_t off, size_t len, char *buf,
-                bufferlist *pbl, int op_flags);
-  void aio_write(AioCompletion *c, uint64_t off, size_t len, const char *buf,
-                 int op_flags);
-  void aio_discard(AioCompletion *c, uint64_t off, uint64_t len);
-  void aio_flush(AioCompletion *c);
+  void aio_read(AioCompletion *c, uint64_t off, uint64_t len, char *buf,
+                bufferlist *pbl, int op_flags, bool native_async=true);
+  void aio_write(AioCompletion *c, uint64_t off, uint64_t len, const char *buf,
+                 int op_flags, bool native_async=true);
+  void aio_discard(AioCompletion *c, uint64_t off, uint64_t len, bool native_async=true);
+  void aio_flush(AioCompletion *c, bool native_async=true);
 
   using ThreadPool::PointerWQ<AioImageRequest>::drain;
   using ThreadPool::PointerWQ<AioImageRequest>::empty;
 
   inline bool writes_empty() const {
-    Mutex::Locker locker(m_lock);
-    return (m_queued_writes == 0);
+    RWLock::RLocker locker(m_lock);
+    return (m_queued_writes.read() == 0);
   }
 
   inline bool writes_blocked() const {
-    Mutex::Locker locker(m_lock);
+    RWLock::RLocker locker(m_lock);
     return (m_write_blockers > 0);
   }
 
+  void shut_down(Context *on_shutdown);
+
   void block_writes();
+  void block_writes(Context *on_blocked);
   void unblock_writes();
 
-  void register_lock_listener();
-
 protected:
   virtual void *_void_dequeue();
   virtual void process(AioImageRequest *req);
 
 private:
-  struct LockListener : public ImageWatcher::Listener {
+  typedef std::list<Context *> Contexts;
+
+  struct C_RefreshFinish : public Context {
     AioImageRequestWQ *aio_work_queue;
-    LockListener(AioImageRequestWQ *_aio_work_queue)
-      : aio_work_queue(_aio_work_queue) {
+    AioImageRequest *aio_image_request;
+
+    C_RefreshFinish(AioImageRequestWQ *aio_work_queue,
+                    AioImageRequest *aio_image_request)
+      : aio_work_queue(aio_work_queue), aio_image_request(aio_image_request) {
+    }
+    virtual void finish(int r) override {
+      aio_work_queue->handle_refreshed(r, aio_image_request);
     }
+  };
 
-    virtual bool handle_requested_lock() {
-      return true;
+  struct C_BlockedWrites : public Context {
+    AioImageRequestWQ *aio_work_queue;
+    C_BlockedWrites(AioImageRequestWQ *_aio_work_queue)
+      : aio_work_queue(_aio_work_queue) {
     }
-    virtual void handle_lock_updated(ImageWatcher::LockUpdateState state) {
-      aio_work_queue->handle_lock_updated(state);
+
+    virtual void finish(int r) {
+      aio_work_queue->handle_blocked_writes(r);
     }
   };
 
   ImageCtx &m_image_ctx;
-  mutable Mutex m_lock;
-  Cond m_cond;
+  mutable RWLock m_lock;
+  Contexts m_write_blocker_contexts;
   uint32_t m_write_blockers;
-  uint32_t m_in_progress_writes;
-  uint32_t m_queued_writes;
+  atomic_t m_in_progress_writes;
+  atomic_t m_queued_writes;
+  atomic_t m_in_flight_ops;
+
+  bool m_refresh_in_progress;
+
+  bool m_shutdown;
+  Context *m_on_shutdown;
 
-  LockListener m_lock_listener;
-  bool m_blocking_writes;
+  int start_in_flight_op(AioCompletion *c);
+  void finish_in_flight_op();
 
   bool is_journal_required() const;
   bool is_lock_required() const;
   void queue(AioImageRequest *req);
 
-  void handle_lock_updated(ImageWatcher::LockUpdateState state);
+  void handle_refreshed(int r, AioImageRequest *req);
+  void handle_blocked_writes(int r);
 };
 
 } // namespace librbd
diff --git a/src/librbd/AioObjectRequest.cc b/src/librbd/AioObjectRequest.cc
index 48e9837..34a1c11 100644
--- a/src/librbd/AioObjectRequest.cc
+++ b/src/librbd/AioObjectRequest.cc
@@ -6,15 +6,18 @@
 #include "common/errno.h"
 #include "common/Mutex.h"
 #include "common/RWLock.h"
+#include "common/WorkQueue.h"
 
+#include "librbd/AioObjectRequest.h"
 #include "librbd/AioCompletion.h"
 #include "librbd/AioImageRequest.h"
+#include "librbd/CopyupRequest.h"
+#include "librbd/ExclusiveLock.h"
 #include "librbd/ImageCtx.h"
 #include "librbd/ImageWatcher.h"
 #include "librbd/internal.h"
-
-#include "librbd/AioObjectRequest.h"
-#include "librbd/CopyupRequest.h"
+#include "librbd/ObjectMap.h"
+#include "librbd/Utils.h"
 
 #include <boost/bind.hpp>
 #include <boost/optional.hpp>
@@ -81,9 +84,12 @@ namespace librbd {
   }
 
   static inline bool is_copy_on_read(ImageCtx *ictx, librados::snap_t snap_id) {
+    assert(ictx->owner_lock.is_locked());
     assert(ictx->snap_lock.is_locked());
-    return (ictx->clone_copy_on_read) &&
-           (!ictx->read_only) && (snap_id == CEPH_NOSNAP);
+    return (ictx->clone_copy_on_read &&
+            !ictx->read_only && snap_id == CEPH_NOSNAP &&
+            (ictx->exclusive_lock == nullptr ||
+             ictx->exclusive_lock->is_lock_owner()));
   }
 
   /** read **/
@@ -102,14 +108,6 @@ namespace librbd {
     guard_read();
   }
 
-  AioObjectRead::~AioObjectRead()
-  {
-    if (m_parent_completion) {
-      m_parent_completion->release();
-      m_parent_completion = NULL;
-    }
-  }
-
   void AioObjectRead::guard_read()
   {
     RWLock::RLocker snap_locker(m_ictx->snap_lock);
@@ -137,6 +135,7 @@ namespace librbd {
       // This is the step to read from parent
       if (!m_tried_parent && r == -ENOENT) {
         {
+          RWLock::RLocker owner_locker(m_ictx->owner_lock);
           RWLock::RLocker snap_locker(m_ictx->snap_lock);
           RWLock::RLocker parent_locker(m_ictx->parent_lock);
           if (m_ictx->parent == NULL) {
@@ -211,15 +210,18 @@ namespace librbd {
     ldout(m_ictx->cct, 20) << "send " << this << " " << m_oid << " "
                            << m_object_off << "~" << m_object_len << dendl;
 
-    // send read request to parent if the object doesn't exist locally
-    if (!m_ictx->object_map.object_may_exist(m_object_no)) {
-      complete(-ENOENT);
-      return;
+    {
+      RWLock::RLocker snap_locker(m_ictx->snap_lock);
+
+      // send read request to parent if the object doesn't exist locally
+      if (m_ictx->object_map != nullptr &&
+          !m_ictx->object_map->object_may_exist(m_object_no)) {
+        m_ictx->op_work_queue->queue(util::create_context_callback<
+          AioObjectRequest>(this), -ENOENT);
+        return;
+      }
     }
 
-    librados::AioCompletion *rados_completion =
-      librados::Rados::aio_create_completion(this, rados_req_cb, NULL);
-    int r;
     librados::ObjectReadOperation op;
     int flags = m_ictx->get_read_flags(m_snap_id);
     if (m_sparse) {
@@ -230,7 +232,10 @@ namespace librbd {
     }
     op.set_op_flags2(m_op_flags);
 
-    r = m_ictx->data_ctx.aio_operate(m_oid, rados_completion, &op, flags, NULL);
+    librados::AioCompletion *rados_completion =
+      util::create_rados_ack_callback(this);
+    int r = m_ictx->data_ctx.aio_operate(m_oid, rados_completion, &op, flags,
+                                         NULL);
     assert(r == 0);
 
     rados_completion->release();
@@ -239,9 +244,12 @@ namespace librbd {
   void AioObjectRead::send_copyup()
   {
     {
+      RWLock::RLocker owner_locker(m_ictx->owner_lock);
       RWLock::RLocker snap_locker(m_ictx->snap_lock);
       RWLock::RLocker parent_locker(m_ictx->parent_lock);
-      if (!compute_parent_extents()) {
+      if (!compute_parent_extents() ||
+          (m_ictx->exclusive_lock != nullptr &&
+           !m_ictx->exclusive_lock->is_lock_owner())) {
         return;
       }
     }
@@ -254,14 +262,14 @@ namespace librbd {
       CopyupRequest *new_req = new CopyupRequest(m_ictx, m_oid, m_object_no,
     					         m_parent_extents);
       m_ictx->copyup_list[m_object_no] = new_req;
-      new_req->queue_send();
+      new_req->send();
     }
   }
 
   void AioObjectRead::read_from_parent(const vector<pair<uint64_t,uint64_t> >& parent_extents)
   {
     assert(!m_parent_completion);
-    m_parent_completion = aio_create_completion_internal(this, rbd_req_cb);
+    m_parent_completion = AioCompletion::create<AioObjectRequest>(this);
 
     // prevent the parent image from being deleted while this
     // request is still in-progress
@@ -386,15 +394,17 @@ namespace librbd {
   void AbstractAioObjectWrite::send_pre() {
     assert(m_ictx->owner_lock.is_locked());
 
-    m_object_exist = m_ictx->object_map.object_may_exist(m_object_no);
     bool write = false;
     {
       RWLock::RLocker snap_lock(m_ictx->snap_lock);
-      if (!m_ictx->object_map.enabled()) {
+      if (m_ictx->object_map == nullptr) {
+        m_object_exist = true;
         write = true;
       } else {
         // should have been flushed prior to releasing lock
-        assert(m_ictx->image_watcher->is_lock_owner());
+        assert(m_ictx->exclusive_lock->is_lock_owner());
+
+        m_object_exist = m_ictx->object_map->object_may_exist(m_object_no);
 
         ldout(m_ictx->cct, 20) << "send_pre " << this << " " << m_oid << " "
           		       << m_object_off << "~" << m_object_len << dendl;
@@ -405,11 +415,10 @@ namespace librbd {
         pre_object_map_update(&new_state);
 
         RWLock::WLocker object_map_locker(m_ictx->object_map_lock);
-        if (m_ictx->object_map[m_object_no] != new_state) {
-          FunctionContext *ctx = new FunctionContext(
-            boost::bind(&AioObjectRequest::complete, this, _1));
-          bool updated = m_ictx->object_map.aio_update(m_object_no, new_state,
-                                                       current_state, ctx);
+        if ((*m_ictx->object_map)[m_object_no] != new_state) {
+          Context *ctx = util::create_context_callback<AioObjectRequest>(this);
+          bool updated = m_ictx->object_map->aio_update(m_object_no, new_state,
+                                                        current_state, ctx);
           assert(updated);
         } else {
           write = true;
@@ -427,29 +436,28 @@ namespace librbd {
   bool AbstractAioObjectWrite::send_post() {
     RWLock::RLocker owner_locker(m_ictx->owner_lock);
     RWLock::RLocker snap_locker(m_ictx->snap_lock);
-    if (!m_ictx->object_map.enabled() || !post_object_map_update()) {
+    if (m_ictx->object_map == nullptr || !post_object_map_update()) {
       return true;
     }
 
     // should have been flushed prior to releasing lock
-    assert(m_ictx->image_watcher->is_lock_owner());
+    assert(m_ictx->exclusive_lock->is_lock_owner());
 
     ldout(m_ictx->cct, 20) << "send_post " << this << " " << m_oid << " "
 			   << m_object_off << "~" << m_object_len << dendl;
     m_state = LIBRBD_AIO_WRITE_POST;
 
     RWLock::WLocker object_map_locker(m_ictx->object_map_lock);
-    uint8_t current_state = m_ictx->object_map[m_object_no];
+    uint8_t current_state = (*m_ictx->object_map)[m_object_no];
     if (current_state != OBJECT_PENDING ||
         current_state == OBJECT_NONEXISTENT) {
       return true;
     }
 
-    FunctionContext *ctx = new FunctionContext(
-      boost::bind(&AioObjectRequest::complete, this, _1));
-    bool updated = m_ictx->object_map.aio_update(m_object_no,
-                                                 OBJECT_NONEXISTENT,
-				                 OBJECT_PENDING, ctx);
+    Context *ctx = util::create_context_callback<AioObjectRequest>(this);
+    bool updated = m_ictx->object_map->aio_update(m_object_no,
+                                                  OBJECT_NONEXISTENT,
+				                  OBJECT_PENDING, ctx);
     assert(updated);
     return false;
   }
@@ -501,7 +509,7 @@ namespace librbd {
     assert(m_write.size() != 0);
 
     librados::AioCompletion *rados_completion =
-      librados::Rados::aio_create_completion(this, NULL, rados_req_cb);
+      util::create_rados_safe_callback(this);
     int r = m_ictx->data_ctx.aio_operate(m_oid, rados_completion, &m_write,
 					 m_snap_seq, m_snaps);
     assert(r == 0);
@@ -527,8 +535,13 @@ namespace librbd {
   }
 
   void AioObjectWrite::add_write_ops(librados::ObjectWriteOperation *wr) {
-    if (m_ictx->enable_alloc_hint && !m_ictx->object_map.object_may_exist(m_object_no))
+    RWLock::RLocker snap_locker(m_ictx->snap_lock);
+    if (m_ictx->enable_alloc_hint &&
+        (m_ictx->object_map == nullptr ||
+         !m_ictx->object_map->object_may_exist(m_object_no))) {
       wr->set_alloc_hint(m_ictx->get_object_size(), m_ictx->get_object_size());
+    }
+
     if (m_object_off == 0 && m_object_len == m_ictx->get_object_size()) {
       wr->write_full(m_write_data);
     } else {
@@ -543,7 +556,7 @@ namespace librbd {
 			   << m_object_off << "~" << m_object_len
                            << " object exist " << m_object_exist
 			   << " write_full " << write_full << dendl;
-    if (write_full) {
+    if (write_full && !has_parent()) {
       send_write_op(false);
     } else {
       AbstractAioObjectWrite::send_write();
diff --git a/src/librbd/AioObjectRequest.h b/src/librbd/AioObjectRequest.h
index bcbaf6b..7aa06c1 100644
--- a/src/librbd/AioObjectRequest.h
+++ b/src/librbd/AioObjectRequest.h
@@ -11,6 +11,7 @@
 #include "include/buffer.h"
 #include "include/Context.h"
 #include "include/rados/librados.hpp"
+#include "include/rbd/librbd.hpp"
 #include "librbd/ObjectMap.h"
 
 namespace librbd {
@@ -63,7 +64,6 @@ namespace librbd {
 	          vector<pair<uint64_t,uint64_t> >& be,
 	          librados::snap_t snap_id, bool sparse,
 	          Context *completion, int op_flags);
-    virtual ~AioObjectRead();
 
     virtual bool should_complete(int r);
     virtual void send();
@@ -110,6 +110,7 @@ namespace librbd {
     read_state_d m_state;
 
     void send_copyup();
+
     void read_from_parent(const vector<pair<uint64_t,uint64_t> >& image_extents);
   };
 
diff --git a/src/librbd/AsyncObjectThrottle.cc b/src/librbd/AsyncObjectThrottle.cc
index 59b3a1f..3bf195c 100644
--- a/src/librbd/AsyncObjectThrottle.cc
+++ b/src/librbd/AsyncObjectThrottle.cc
@@ -7,6 +7,7 @@
 #include "librbd/AsyncRequest.h"
 #include "librbd/ImageCtx.h"
 #include "librbd/internal.h"
+#include "librbd/Utils.h"
 
 namespace librbd
 {
@@ -16,7 +17,7 @@ AsyncObjectThrottle<T>::AsyncObjectThrottle(
     const AsyncRequest<T>* async_request, T &image_ctx,
     const ContextFactory& context_factory, Context *ctx,
     ProgressContext *prog_ctx, uint64_t object_no, uint64_t end_object_no)
-  : m_lock(unique_lock_name("librbd::AsyncThrottle::m_lock", this)),
+  : m_lock(util::unique_lock_name("librbd::AsyncThrottle::m_lock", this)),
     m_async_request(async_request), m_image_ctx(image_ctx),
     m_context_factory(context_factory), m_ctx(ctx), m_prog_ctx(prog_ctx),
     m_object_no(object_no), m_end_object_no(end_object_no), m_current_ops(0),
@@ -39,16 +40,17 @@ void AsyncObjectThrottle<T>::start_ops(uint64_t max_concurrent) {
     complete = (m_current_ops == 0);
   }
   if (complete) {
-    m_ctx->complete(m_ret);
+    // avoid re-entrant callback
+    m_image_ctx.op_work_queue->queue(m_ctx, m_ret);
     delete this;
   }
 }
 
 template <typename T>
 void AsyncObjectThrottle<T>::finish_op(int r) {
-  assert(m_image_ctx.owner_lock.is_locked());
   bool complete;
   {
+    RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
     Mutex::Locker locker(m_lock);
     --m_current_ops;
     if (r < 0 && r != -ENOENT && m_ret == 0) {
diff --git a/src/librbd/AsyncObjectThrottle.h b/src/librbd/AsyncObjectThrottle.h
index a831051..08f0199 100644
--- a/src/librbd/AsyncObjectThrottle.h
+++ b/src/librbd/AsyncObjectThrottle.h
@@ -36,7 +36,6 @@ protected:
   ImageCtxT &m_image_ctx;
 
   virtual void finish(int r) {
-    RWLock::RLocker locker(m_image_ctx.owner_lock);
     m_finisher.finish_op(r);
   }
 
diff --git a/src/librbd/AsyncOperation.cc b/src/librbd/AsyncOperation.cc
index 7cfc2d7..fd315ad 100644
--- a/src/librbd/AsyncOperation.cc
+++ b/src/librbd/AsyncOperation.cc
@@ -46,6 +46,7 @@ void AsyncOperation::start_op(ImageCtx &image_ctx) {
 
 void AsyncOperation::finish_op() {
   ldout(m_image_ctx->cct, 20) << this << " " << __func__ << dendl;
+
   {
     Mutex::Locker l(m_image_ctx->async_ops_lock);
     xlist<AsyncOperation *>::iterator iter(&m_xlist_item);
@@ -63,9 +64,11 @@ void AsyncOperation::finish_op() {
     }
   }
 
-  C_CompleteFlushes *ctx = new C_CompleteFlushes(m_image_ctx,
-                                                 std::move(m_flush_contexts));
-  m_image_ctx->op_work_queue->queue(ctx);
+  if (!m_flush_contexts.empty()) {
+    C_CompleteFlushes *ctx = new C_CompleteFlushes(m_image_ctx,
+                                                   std::move(m_flush_contexts));
+    m_image_ctx->op_work_queue->queue(ctx);
+  }
 }
 
 void AsyncOperation::add_flush_context(Context *on_finish) {
diff --git a/src/librbd/AsyncRequest.cc b/src/librbd/AsyncRequest.cc
index b6e41eb..a93eb50 100644
--- a/src/librbd/AsyncRequest.cc
+++ b/src/librbd/AsyncRequest.cc
@@ -3,6 +3,7 @@
 #include "librbd/AsyncRequest.h"
 #include "librbd/ImageCtx.h"
 #include "librbd/internal.h"
+#include "librbd/Utils.h"
 #include "common/WorkQueue.h"
 #include <boost/bind.hpp>
 
@@ -14,15 +15,11 @@ AsyncRequest<T>::AsyncRequest(T &image_ctx, Context *on_finish)
   : m_image_ctx(image_ctx), m_on_finish(on_finish), m_canceled(false),
     m_xlist_item(this) {
   assert(m_on_finish != NULL);
-  Mutex::Locker l(m_image_ctx.async_ops_lock);
-  m_image_ctx.async_requests.push_back(&m_xlist_item);
+  start_request();
 }
 
 template <typename T>
 AsyncRequest<T>::~AsyncRequest() {
-  Mutex::Locker l(m_image_ctx.async_ops_lock);
-  assert(m_xlist_item.remove_myself());
-  m_image_ctx.async_requests_cond.Signal();
 }
 
 template <typename T>
@@ -32,19 +29,41 @@ void AsyncRequest<T>::async_complete(int r) {
 
 template <typename T>
 librados::AioCompletion *AsyncRequest<T>::create_callback_completion() {
-  return librados::Rados::aio_create_completion(create_callback_context(),
-						NULL, rados_ctx_cb);
+  return util::create_rados_safe_callback(this);
 }
 
 template <typename T>
 Context *AsyncRequest<T>::create_callback_context() {
-  return new FunctionContext(boost::bind(&AsyncRequest<T>::complete, this, _1));
+  return util::create_context_callback(this);
 }
 
 template <typename T>
 Context *AsyncRequest<T>::create_async_callback_context() {
-  return new FunctionContext(boost::bind(&AsyncRequest<T>::async_complete, this,
-                                         _1));;
+  return util::create_context_callback<AsyncRequest<T>,
+                                       &AsyncRequest<T>::async_complete>(this);
+}
+
+template <typename T>
+void AsyncRequest<T>::start_request() {
+  Mutex::Locker async_ops_locker(m_image_ctx.async_ops_lock);
+  m_image_ctx.async_requests.push_back(&m_xlist_item);
+}
+
+template <typename T>
+void AsyncRequest<T>::finish_request() {
+  decltype(m_image_ctx.async_requests_waiters) waiters;
+  {
+    Mutex::Locker async_ops_locker(m_image_ctx.async_ops_lock);
+    assert(m_xlist_item.remove_myself());
+
+    if (m_image_ctx.async_requests.empty()) {
+      waiters = std::move(m_image_ctx.async_requests_waiters);
+    }
+  }
+
+  for (auto ctx : waiters) {
+    ctx->complete(0);
+  }
 }
 
 } // namespace librbd
diff --git a/src/librbd/AsyncRequest.h b/src/librbd/AsyncRequest.h
index 241c90e..96802f8 100644
--- a/src/librbd/AsyncRequest.h
+++ b/src/librbd/AsyncRequest.h
@@ -21,11 +21,11 @@ public:
   virtual ~AsyncRequest();
 
   void complete(int r) {
-    if (m_canceled && safely_cancel(r)) {
-      m_on_finish->complete(-ERESTART);
-      delete this;
-    } else if (should_complete(r)) {
-      m_on_finish->complete(filter_return_code(r));
+    if (should_complete(r)) {
+      r = filter_return_code(r);
+      finish(r);
+      finish_request();
+      m_on_finish->complete(r);
       delete this;
     }
   }
@@ -49,16 +49,19 @@ protected:
 
   void async_complete(int r);
 
-  virtual bool safely_cancel(int r) {
-    return true;
-  }
   virtual bool should_complete(int r) = 0;
-  virtual int filter_return_code(int r) {
+  virtual int filter_return_code(int r) const {
     return r;
   }
+
+  virtual void finish(int r) {
+  }
 private:
   bool m_canceled;
   typename xlist<AsyncRequest<ImageCtxT> *>::item m_xlist_item;
+
+  void start_request();
+  void finish_request();
 };
 
 } // namespace librbd
diff --git a/src/librbd/AsyncResizeRequest.cc b/src/librbd/AsyncResizeRequest.cc
deleted file mode 100644
index 9982492..0000000
--- a/src/librbd/AsyncResizeRequest.cc
+++ /dev/null
@@ -1,284 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-#include "librbd/AsyncResizeRequest.h"
-#include "librbd/AsyncTrimRequest.h"
-#include "librbd/ImageCtx.h"
-#include "librbd/ImageWatcher.h"
-#include "librbd/internal.h"
-#include "librbd/ObjectMap.h"
-#include "common/dout.h"
-#include "common/errno.h"
-
-#define dout_subsys ceph_subsys_rbd
-#undef dout_prefix
-#define dout_prefix *_dout << "librbd::AsyncResizeRequest: "
-
-namespace librbd
-{
-
-AsyncResizeRequest::AsyncResizeRequest(ImageCtx &image_ctx, Context *on_finish,
-                                       uint64_t new_size,
-                                       ProgressContext &prog_ctx)
-  : AsyncRequest(image_ctx, on_finish),
-    m_original_size(0), m_new_size(new_size),
-    m_prog_ctx(prog_ctx), m_new_parent_overlap(0),
-    m_xlist_item(this)
-{
-}
-
-AsyncResizeRequest::~AsyncResizeRequest() {
-  AsyncResizeRequest *next_req = NULL;
-  {
-    RWLock::WLocker snap_locker(m_image_ctx.snap_lock);
-    assert(m_xlist_item.remove_myself());
-    if (!m_image_ctx.async_resize_reqs.empty()) {
-      next_req = m_image_ctx.async_resize_reqs.front();
-    }
-  }
-
-  if (next_req != NULL) {
-    RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
-    next_req->send();
-  }
-}
-
-bool AsyncResizeRequest::safely_cancel(int r) {
-  CephContext *cct = m_image_ctx.cct;
-  ldout(cct, 5) << this << " safely_cancel: " << " r=" << r << dendl;
-
-  // avoid interrupting the object map / header updates
-  switch (m_state) {
-  case STATE_GROW_OBJECT_MAP:
-  case STATE_UPDATE_HEADER:
-  case STATE_SHRINK_OBJECT_MAP:
-    ldout(cct, 5) << "delaying cancel request" << dendl;
-    return false;
-  default:
-    break;
-  }
-  return true;
-}
-
-bool AsyncResizeRequest::should_complete(int r) {
-  CephContext *cct = m_image_ctx.cct;
-  ldout(cct, 5) << this << " should_complete: " << " r=" << r << dendl;
-
-  if (r < 0) {
-    lderr(cct) << "resize encountered an error: " << cpp_strerror(r) << dendl;
-    return true;
-  }
-  if (m_state == STATE_FINISHED) {
-    ldout(cct, 5) << "FINISHED" << dendl;
-    return true;
-  }
-
-  RWLock::RLocker owner_lock(m_image_ctx.owner_lock);
-  switch (m_state) {
-  case STATE_FLUSH:
-    ldout(cct, 5) << "FLUSH" << dendl;
-    send_invalidate_cache();
-    break;
-
-  case STATE_INVALIDATE_CACHE:
-    ldout(cct, 5) << "INVALIDATE_CACHE" << dendl;
-    send_trim_image();
-    break;
-
-  case STATE_TRIM_IMAGE:
-    ldout(cct, 5) << "TRIM_IMAGE" << dendl;
-    send_update_header();
-    break;
-
-  case STATE_GROW_OBJECT_MAP:
-    ldout(cct, 5) << "GROW_OBJECT_MAP" << dendl;
-    send_update_header();
-    break;
-
-  case STATE_UPDATE_HEADER:
-    ldout(cct, 5) << "UPDATE_HEADER" << dendl;
-    if (send_shrink_object_map()) {
-      update_size_and_overlap();
-      return true;
-    }
-    break;
-
-  case STATE_SHRINK_OBJECT_MAP:
-    ldout(cct, 5) << "SHRINK_OBJECT_MAP" << dendl;
-    update_size_and_overlap();
-    return true;
-
-  default:
-    lderr(cct) << "invalid state: " << m_state << dendl;
-    assert(false);
-    break;
-  }
-  return false;
-}
-
-void AsyncResizeRequest::send() {
-  assert(m_image_ctx.owner_lock.is_locked());
-
-  {
-    RWLock::WLocker snap_locker(m_image_ctx.snap_lock);
-    if (!m_xlist_item.is_on_list()) {
-      m_image_ctx.async_resize_reqs.push_back(&m_xlist_item);
-      if (m_image_ctx.async_resize_reqs.front() != this) {
-        return;
-      }
-    }
-
-    assert(m_image_ctx.async_resize_reqs.front() == this);
-    m_original_size = m_image_ctx.size;
-    compute_parent_overlap();
-  }
-
-  CephContext *cct = m_image_ctx.cct;
-  if (is_canceled()) {
-    complete(-ERESTART);
-  } else if (m_original_size == m_new_size) {
-    ldout(cct, 2) << this << " no change in size (" << m_original_size
-		  << " -> " << m_new_size << ")" << dendl;
-    m_state = STATE_FINISHED;
-    complete(0);
-  } else if (m_new_size > m_original_size) {
-    ldout(cct, 2) << this << " expanding image (" << m_original_size
-		  << " -> " << m_new_size << ")" << dendl;
-    send_grow_object_map();
-  } else {
-    ldout(cct, 2) << this << " shrinking image (" << m_original_size
-		  << " -> " << m_new_size << ")" << dendl;
-    send_flush();
-  }
-}
-
-void AsyncResizeRequest::send_flush() {
-  ldout(m_image_ctx.cct, 5) << this << " send_flush: "
-                            << " original_size=" << m_original_size
-                            << " new_size=" << m_new_size << dendl;
-  m_state = STATE_FLUSH;
-
-  // with clipping adjusted, ensure that write / copy-on-read operations won't
-  // (re-)create objects that we just removed. need async callback to ensure
-  // we don't have cache_lock already held
-  m_image_ctx.flush_async_operations(create_async_callback_context());
-}
-
-void AsyncResizeRequest::send_invalidate_cache() {
-  assert(m_image_ctx.owner_lock.is_locked());
-  ldout(m_image_ctx.cct, 5) << this << " send_invalidate_cache: "
-                            << " original_size=" << m_original_size
-                            << " new_size=" << m_new_size << dendl;
-  m_state = STATE_INVALIDATE_CACHE;
-
-  // need to invalidate since we're deleting objects, and
-  // ObjectCacher doesn't track non-existent objects
-  m_image_ctx.invalidate_cache(create_callback_context());
-}
-
-void AsyncResizeRequest::send_trim_image() {
-  assert(m_image_ctx.owner_lock.is_locked());
-  ldout(m_image_ctx.cct, 5) << this << " send_trim_image: "
-                            << " original_size=" << m_original_size
-                            << " new_size=" << m_new_size << dendl;
-  m_state = STATE_TRIM_IMAGE;
-
-  AsyncTrimRequest *req = new AsyncTrimRequest(m_image_ctx,
-					       create_callback_context(),
-					       m_original_size, m_new_size,
-					       m_prog_ctx);
-  req->send();
-}
-
-void AsyncResizeRequest::send_grow_object_map() {
-  assert(m_image_ctx.owner_lock.is_locked());
-  if (!m_image_ctx.object_map.enabled()) {
-    send_update_header();
-    return;
-  }
-
-  ldout(m_image_ctx.cct, 5) << this << " send_grow_object_map: "
-                            << " original_size=" << m_original_size
-                            << " new_size=" << m_new_size << dendl;
-  m_state = STATE_GROW_OBJECT_MAP;
-
-  // should have been canceled prior to releasing lock
-  assert(!m_image_ctx.image_watcher->is_lock_supported() ||
-         m_image_ctx.image_watcher->is_lock_owner());
-
-  m_image_ctx.object_map.aio_resize(m_new_size, OBJECT_NONEXISTENT,
-				    create_callback_context());
-}
-
-bool AsyncResizeRequest::send_shrink_object_map() {
-  assert(m_image_ctx.owner_lock.is_locked());
-  if (!m_image_ctx.object_map.enabled() || m_new_size > m_original_size) {
-    return true;
-  }
-
-  ldout(m_image_ctx.cct, 5) << this << " send_shrink_object_map: "
-		            << " original_size=" << m_original_size
-			    << " new_size=" << m_new_size << dendl;
-  m_state = STATE_SHRINK_OBJECT_MAP;
-
-  // should have been canceled prior to releasing lock
-  assert(!m_image_ctx.image_watcher->is_lock_supported() ||
-         m_image_ctx.image_watcher->is_lock_owner());
-
-  m_image_ctx.object_map.aio_resize(m_new_size, OBJECT_NONEXISTENT,
-				    create_callback_context());
-  return false;
-}
-
-void AsyncResizeRequest::send_update_header() {
-  assert(m_image_ctx.owner_lock.is_locked());
-
-  ldout(m_image_ctx.cct, 5) << this << " send_update_header: "
-                            << " original_size=" << m_original_size
-                            << " new_size=" << m_new_size << dendl;
-  m_state = STATE_UPDATE_HEADER;
-
-  // should have been canceled prior to releasing lock
-  assert(!m_image_ctx.image_watcher->is_lock_supported() ||
-         m_image_ctx.image_watcher->is_lock_owner());
-
-  librados::ObjectWriteOperation op;
-  if (m_image_ctx.old_format) {
-    // rewrite only the size field of the header
-    // NOTE: format 1 image headers are not stored in fixed endian format
-    bufferlist bl;
-    bl.append(reinterpret_cast<const char*>(&m_new_size), sizeof(m_new_size));
-    op.write(offsetof(rbd_obj_header_ondisk, image_size), bl);
-  } else {
-    if (m_image_ctx.image_watcher->is_lock_supported()) {
-      m_image_ctx.image_watcher->assert_header_locked(&op);
-    }
-    cls_client::set_size(&op, m_new_size);
-  }
-
-  librados::AioCompletion *rados_completion = create_callback_completion();
-  int r = m_image_ctx.md_ctx.aio_operate(m_image_ctx.header_oid,
-    				     rados_completion, &op);
-  assert(r == 0);
-  rados_completion->release();
-}
-
-void AsyncResizeRequest::compute_parent_overlap() {
-  RWLock::RLocker l2(m_image_ctx.parent_lock);
-  if (m_image_ctx.parent == NULL) {
-    m_new_parent_overlap = 0;
-  } else {
-    m_new_parent_overlap = MIN(m_new_size, m_image_ctx.parent_md.overlap);
-  }
-}
-
-void AsyncResizeRequest::update_size_and_overlap() {
-  RWLock::WLocker snap_locker(m_image_ctx.snap_lock);
-  m_image_ctx.size = m_new_size;
-
-  RWLock::WLocker parent_locker(m_image_ctx.parent_lock);
-  if (m_image_ctx.parent != NULL && m_new_size < m_original_size) {
-    m_image_ctx.parent_md.overlap = m_new_parent_overlap;
-  }
-}
-
-} // namespace librbd
diff --git a/src/librbd/AsyncTrimRequest.cc b/src/librbd/AsyncTrimRequest.cc
deleted file mode 100644
index 6159ef5..0000000
--- a/src/librbd/AsyncTrimRequest.cc
+++ /dev/null
@@ -1,361 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-#include "librbd/AsyncTrimRequest.h"
-#include "librbd/AsyncObjectThrottle.h"
-#include "librbd/AioObjectRequest.h"
-#include "librbd/ImageCtx.h"
-#include "librbd/ImageWatcher.h"
-#include "librbd/internal.h"
-#include "librbd/ObjectMap.h"
-#include "common/ContextCompletion.h"
-#include "common/dout.h"
-#include "common/errno.h"
-#include "osdc/Striper.h"
-
-#include <boost/bind.hpp>
-#include <boost/lambda/bind.hpp>
-#include <boost/lambda/construct.hpp>
-#include <boost/scope_exit.hpp>
-
-#define dout_subsys ceph_subsys_rbd
-#undef dout_prefix
-#define dout_prefix *_dout << "librbd::AsyncTrimRequest: "
-
-namespace librbd
-{
-
-class C_CopyupObject : public C_AsyncObjectThrottle<> {
-public:
-  C_CopyupObject(AsyncObjectThrottle<> &throttle, ImageCtx *image_ctx,
-                 ::SnapContext snapc, uint64_t object_no)
-    : C_AsyncObjectThrottle(throttle, *image_ctx), m_snapc(snapc),
-      m_object_no(object_no)
-  {
-  }
-
-  virtual int send() {
-    assert(m_image_ctx.owner_lock.is_locked());
-    assert(!m_image_ctx.image_watcher->is_lock_supported() ||
-           m_image_ctx.image_watcher->is_lock_owner());
-
-    string oid = m_image_ctx.get_object_name(m_object_no);
-    ldout(m_image_ctx.cct, 10) << "removing (with copyup) " << oid << dendl;
-
-    AioObjectRequest *req = new AioObjectTrim(&m_image_ctx, oid, m_object_no,
-                                              m_snapc, this);
-    req->send();
-    return 0;
-  }
-private:
-  ::SnapContext m_snapc;
-  uint64_t m_object_no;
-};
-
-class C_RemoveObject : public C_AsyncObjectThrottle<> {
-public:
-  C_RemoveObject(AsyncObjectThrottle<> &throttle, ImageCtx *image_ctx,
-                 uint64_t object_no)
-    : C_AsyncObjectThrottle(throttle, *image_ctx), m_object_no(object_no)
-  {
-  }
-
-  virtual int send() {
-    assert(m_image_ctx.owner_lock.is_locked());
-    assert(!m_image_ctx.image_watcher->is_lock_supported() ||
-           m_image_ctx.image_watcher->is_lock_owner());
-    if (!m_image_ctx.object_map.object_may_exist(m_object_no)) {
-      return 1;
-    }
-
-    string oid = m_image_ctx.get_object_name(m_object_no);
-    ldout(m_image_ctx.cct, 10) << "removing " << oid << dendl;
-
-    librados::AioCompletion *rados_completion =
-      librados::Rados::aio_create_completion(this, NULL, rados_ctx_cb);
-    int r = m_image_ctx.data_ctx.aio_remove(oid, rados_completion);
-    assert(r == 0);
-    rados_completion->release();
-    return 0;
-  }
-
-private:
-  uint64_t m_object_no;
-};
-
-AsyncTrimRequest::AsyncTrimRequest(ImageCtx &image_ctx, Context *on_finish,
-				   uint64_t original_size, uint64_t new_size,
-				   ProgressContext &prog_ctx)
-  : AsyncRequest(image_ctx, on_finish), m_new_size(new_size),
-    m_prog_ctx(prog_ctx)
-{
-  uint64_t period = m_image_ctx.get_stripe_period();
-  uint64_t new_num_periods = ((m_new_size + period - 1) / period);
-  m_delete_off = MIN(new_num_periods * period, original_size);
-  // first object we can delete free and clear
-  m_delete_start = new_num_periods * m_image_ctx.get_stripe_count();
-  m_num_objects = Striper::get_num_objects(m_image_ctx.layout, original_size);
-
-  CephContext *cct = m_image_ctx.cct;
-  ldout(cct, 10) << this << " trim image " << original_size << " -> "
-		 << m_new_size << " periods " << new_num_periods
-                 << " discard to offset " << m_delete_off
-                 << " delete objects " << m_delete_start
-                 << " to " << m_num_objects << dendl;
-}
-
-
-bool AsyncTrimRequest::should_complete(int r)
-{
-  CephContext *cct = m_image_ctx.cct;
-  ldout(cct, 5) << this << " should_complete: r=" << r << dendl;
-  if (r < 0) {
-    lderr(cct) << "trim encountered an error: " << cpp_strerror(r) << dendl;
-    return true;
-  }
-
-  switch (m_state) {
-  case STATE_COPYUP_OBJECTS:
-    ldout(cct, 5) << " COPYUP_OBJECTS" << dendl;
-    send_pre_remove();
-    break;
-
-  case STATE_PRE_REMOVE:
-    ldout(cct, 5) << " PRE_REMOVE" << dendl;
-    {
-      RWLock::RLocker owner_lock(m_image_ctx.owner_lock);
-      send_remove_objects();
-    }
-    break;
-
-  case STATE_REMOVE_OBJECTS:
-    ldout(cct, 5) << " REMOVE_OBJECTS" << dendl;
-    send_post_remove();
-    break;
-
-  case STATE_POST_REMOVE:
-    ldout(cct, 5) << " POST_OBJECTS" << dendl;
-    {
-      RWLock::RLocker owner_lock(m_image_ctx.owner_lock);
-      send_clean_boundary();
-    }
-    break;
-
-  case STATE_CLEAN_BOUNDARY:
-    ldout(cct, 5) << "CLEAN_BOUNDARY" << dendl;
-    finish(0);
-    break;
-
-  case STATE_FINISHED:
-    ldout(cct, 5) << "FINISHED" << dendl;
-    return true;
-
-  default:
-    lderr(cct) << "invalid state: " << m_state << dendl;
-    assert(false);
-    break;
-  }
-  return false;
-}
-
-void AsyncTrimRequest::send() {
-  send_copyup_objects();
-}
-
-void AsyncTrimRequest::send_copyup_objects() {
-  assert(m_image_ctx.owner_lock.is_locked());
-  assert(!m_image_ctx.image_watcher->is_lock_supported() ||
-         m_image_ctx.image_watcher->is_lock_owner());
-
-  if (m_delete_start >= m_num_objects) {
-    send_clean_boundary();
-    return;
-  }
-
-  ::SnapContext snapc;
-  bool has_snapshots;
-  uint64_t parent_overlap;
-  {
-    RWLock::RLocker snap_locker(m_image_ctx.snap_lock);
-    RWLock::RLocker parent_locker(m_image_ctx.parent_lock);
-
-    snapc = m_image_ctx.snapc;
-    has_snapshots = !m_image_ctx.snaps.empty();
-    int r = m_image_ctx.get_parent_overlap(m_image_ctx.get_copyup_snap_id(),
-                                           &parent_overlap);
-    assert(r == 0);
-  }
-
-  // copyup is only required for portion of image that overlaps parent
-  uint64_t copyup_end = Striper::get_num_objects(m_image_ctx.layout,
-                                                 parent_overlap);
-  // TODO: protect against concurrent shrink and snap create?
-  if (copyup_end <= m_delete_start || !has_snapshots) {
-    send_pre_remove();
-    return;
-  }
-
-  uint64_t copyup_start = m_delete_start;
-  m_delete_start = copyup_end;
-
-  ldout(m_image_ctx.cct, 5) << this << " send_copyup_objects: "
-			    << " start object=" << copyup_start << ", "
-			    << " end object=" << copyup_end << dendl;
-  m_state = STATE_COPYUP_OBJECTS;
-
-  Context *ctx = create_callback_context();
-  AsyncObjectThrottle<>::ContextFactory context_factory(
-    boost::lambda::bind(boost::lambda::new_ptr<C_CopyupObject>(),
-      boost::lambda::_1, &m_image_ctx, snapc, boost::lambda::_2));
-  AsyncObjectThrottle<> *throttle = new AsyncObjectThrottle<>(
-    this, m_image_ctx, context_factory, ctx, &m_prog_ctx, copyup_start,
-    copyup_end);
-  throttle->start_ops(m_image_ctx.concurrent_management_ops);
-}
-
-void AsyncTrimRequest::send_remove_objects() {
-  assert(m_image_ctx.owner_lock.is_locked());
-
-  ldout(m_image_ctx.cct, 5) << this << " send_remove_objects: "
-			    << " delete_start=" << m_delete_start
-			    << " num_objects=" << m_num_objects << dendl;
-  m_state = STATE_REMOVE_OBJECTS;
-
-  Context *ctx = create_callback_context();
-  AsyncObjectThrottle<>::ContextFactory context_factory(
-    boost::lambda::bind(boost::lambda::new_ptr<C_RemoveObject>(),
-      boost::lambda::_1, &m_image_ctx, boost::lambda::_2));
-  AsyncObjectThrottle<> *throttle = new AsyncObjectThrottle<>(
-    this, m_image_ctx, context_factory, ctx, &m_prog_ctx, m_delete_start,
-    m_num_objects);
-  throttle->start_ops(m_image_ctx.concurrent_management_ops);
-}
-
-void AsyncTrimRequest::send_pre_remove() {
-  assert(m_image_ctx.owner_lock.is_locked());
-  if (m_delete_start >= m_num_objects) {
-    send_clean_boundary();
-    return;
-  }
-
-  bool remove_objects = false;
-  {
-    RWLock::RLocker snap_locker(m_image_ctx.snap_lock);
-    if (!m_image_ctx.object_map.enabled()) {
-      remove_objects = true;
-    } else {
-      ldout(m_image_ctx.cct, 5) << this << " send_pre_remove: "
-				<< " delete_start=" << m_delete_start
-				<< " num_objects=" << m_num_objects << dendl;
-      m_state = STATE_PRE_REMOVE;
-
-      assert(m_image_ctx.image_watcher->is_lock_owner());
-
-      // flag the objects as pending deletion
-      Context *ctx = create_callback_context();
-      RWLock::WLocker object_map_locker(m_image_ctx.object_map_lock);
-      if (!m_image_ctx.object_map.aio_update(m_delete_start, m_num_objects,
-					     OBJECT_PENDING, OBJECT_EXISTS,
-                                             ctx)) {
-        delete ctx;
-        remove_objects = true;
-      }
-    }
-  }
-
-  // avoid possible recursive lock attempts
-  if (remove_objects) {
-    // no object map update required
-    send_remove_objects();
-  }
-}
-
-void AsyncTrimRequest::send_post_remove() {
-  assert(m_image_ctx.owner_lock.is_locked());
-
-  bool clean_boundary = false;
-  {
-    RWLock::RLocker snap_locker(m_image_ctx.snap_lock);
-    if (!m_image_ctx.object_map.enabled()) {
-      clean_boundary = true;
-    } else {
-      ldout(m_image_ctx.cct, 5) << this << " send_post_remove: "
-          		        << " delete_start=" << m_delete_start
-          		        << " num_objects=" << m_num_objects << dendl;
-      m_state = STATE_POST_REMOVE;
-
-      assert(m_image_ctx.image_watcher->is_lock_owner());
-
-      // flag the pending objects as removed
-      Context *ctx = create_callback_context();
-      RWLock::WLocker object_map_locker(m_image_ctx.object_map_lock);
-      if (!m_image_ctx.object_map.aio_update(m_delete_start, m_num_objects,
-					     OBJECT_NONEXISTENT,
-					     OBJECT_PENDING, ctx)) {
-        delete ctx;
-	clean_boundary = true;
-      }
-    }
-  }
-
-  // avoid possible recursive lock attempts
-  if (clean_boundary) {
-    // no object map update required
-    send_clean_boundary();
-  }
-}
-
-void AsyncTrimRequest::send_clean_boundary() {
-  assert(m_image_ctx.owner_lock.is_locked());
-  CephContext *cct = m_image_ctx.cct;
-  if (m_delete_off <= m_new_size) {
-    finish(0);
-    return;
-  }
-
-  // should have been canceled prior to releasing lock
-  assert(!m_image_ctx.image_watcher->is_lock_supported() ||
-         m_image_ctx.image_watcher->is_lock_owner());
-  uint64_t delete_len = m_delete_off - m_new_size;
-  ldout(m_image_ctx.cct, 5) << this << " send_clean_boundary: "
-			    << " delete_off=" << m_delete_off
-			    << " length=" << delete_len << dendl;
-  m_state = STATE_CLEAN_BOUNDARY;
-
-  ::SnapContext snapc;
-  {
-    RWLock::RLocker snap_locker(m_image_ctx.snap_lock);
-    snapc = m_image_ctx.snapc;
-  }
-
-  // discard the weird boundary
-  std::vector<ObjectExtent> extents;
-  Striper::file_to_extents(cct, m_image_ctx.format_string,
-			   &m_image_ctx.layout, m_new_size, delete_len, 0,
-                           extents);
-
-  ContextCompletion *completion =
-    new ContextCompletion(create_callback_context(), true);
-  for (vector<ObjectExtent>::iterator p = extents.begin();
-       p != extents.end(); ++p) {
-    ldout(cct, 20) << " ex " << *p << dendl;
-    Context *req_comp = new C_ContextCompletion(*completion);
-
-    AioObjectRequest *req;
-    if (p->offset == 0) {
-      req = new AioObjectTrim(&m_image_ctx, p->oid.name, p->objectno, snapc,
-                              req_comp);
-    } else {
-      req = new AioObjectTruncate(&m_image_ctx, p->oid.name, p->objectno,
-                                  p->offset, snapc, req_comp);
-    }
-    req->send();
-  }
-  completion->finish_adding_requests();
-}
-
-void AsyncTrimRequest::finish(int r) {
-  m_state = STATE_FINISHED;
-  async_complete(r);
-}
-
-} // namespace librbd
diff --git a/src/librbd/CopyupRequest.cc b/src/librbd/CopyupRequest.cc
index 5c3973a..d7713ad 100644
--- a/src/librbd/CopyupRequest.cc
+++ b/src/librbd/CopyupRequest.cc
@@ -11,10 +11,12 @@
 #include "librbd/AioObjectRequest.h"
 #include "librbd/AsyncObjectThrottle.h"
 #include "librbd/CopyupRequest.h"
+#include "librbd/ExclusiveLock.h"
 #include "librbd/ImageCtx.h"
 #include "librbd/ImageWatcher.h"
 #include "librbd/internal.h"
 #include "librbd/ObjectMap.h"
+#include "librbd/Utils.h"
 
 #include <boost/bind.hpp>
 #include <boost/lambda/bind.hpp>
@@ -44,10 +46,11 @@ public:
     if (snap_id == CEPH_NOSNAP) {
       RWLock::RLocker snap_locker(m_image_ctx.snap_lock);
       RWLock::WLocker object_map_locker(m_image_ctx.object_map_lock);
-      assert(m_image_ctx.image_watcher->is_lock_owner());
-      bool sent = m_image_ctx.object_map.aio_update(m_object_no, OBJECT_EXISTS,
-                                                    boost::optional<uint8_t>(),
-                                                    this);
+      assert(m_image_ctx.exclusive_lock->is_lock_owner());
+      assert(m_image_ctx.object_map != nullptr);
+      bool sent = m_image_ctx.object_map->aio_update(m_object_no, OBJECT_EXISTS,
+                                                     boost::optional<uint8_t>(),
+                                                     this);
       return (sent ? 0 : 1);
     }
 
@@ -57,9 +60,14 @@ public:
       state = OBJECT_EXISTS_CLEAN;
     }
 
+    RWLock::RLocker snap_locker(m_image_ctx.snap_lock);
     RWLock::RLocker object_map_locker(m_image_ctx.object_map_lock);
-    m_image_ctx.object_map.aio_update(snap_id, m_object_no, m_object_no + 1,
-                                      state, boost::optional<uint8_t>(), this);
+    if (m_image_ctx.object_map == nullptr) {
+      return 1;
+    }
+
+    m_image_ctx.object_map->aio_update(snap_id, m_object_no, m_object_no + 1,
+                                       state, boost::optional<uint8_t>(), this);
     return 0;
   }
 
@@ -141,9 +149,7 @@ private:
 
       ldout(m_ictx->cct, 20) << __func__ << " " << this << " copyup with "
                              << "empty snapshot context" << dendl;
-      librados::AioCompletion *comp =
-        librados::Rados::aio_create_completion(create_callback_context(), NULL,
-                                               rados_ctx_cb);
+      librados::AioCompletion *comp = util::create_rados_safe_callback(this);
       r = m_ictx->md_ctx.aio_operate(m_oid, comp, &copyup_op, 0, snaps);
       assert(r == 0);
       comp->release();
@@ -166,9 +172,7 @@ private:
       assert(write_op.size() != 0);
 
       snaps.insert(snaps.end(), snapc.snaps.begin(), snapc.snaps.end());
-      librados::AioCompletion *comp =
-        librados::Rados::aio_create_completion(create_callback_context(), NULL,
-                                               rados_ctx_cb);
+      librados::AioCompletion *comp = util::create_rados_safe_callback(this);
       r = m_ictx->data_ctx.aio_operate(m_oid, comp, &write_op);
       assert(r == 0);
       comp->release();
@@ -179,8 +183,7 @@ private:
   void CopyupRequest::send()
   {
     m_state = STATE_READ_FROM_PARENT;
-    AioCompletion *comp = aio_create_completion_internal(
-      create_callback_context(), rbd_ctx_cb);
+    AioCompletion *comp = AioCompletion::create(this);
 
     ldout(m_ictx->cct, 20) << __func__ << " " << this
                            << ": completion " << comp
@@ -192,18 +195,6 @@ private:
                               &m_copyup_data, 0);
   }
 
-  void CopyupRequest::queue_send()
-  {
-    // TODO: once the ObjectCacher allows reentrant read requests, the finisher
-    // should be eliminated
-    ldout(m_ictx->cct, 20) << __func__ << " " << this
-			   << ": oid " << m_oid << " "
-			   << ", extents " << m_image_extents << dendl;
-    FunctionContext *ctx = new FunctionContext(
-      boost::bind(&CopyupRequest::send, this));
-    m_ictx->copyup_finisher->queue(ctx);
-  }
-
   void CopyupRequest::complete(int r)
   {
     if (should_complete(r)) {
@@ -272,17 +263,13 @@ private:
     {
       RWLock::RLocker owner_locker(m_ictx->owner_lock);
       RWLock::RLocker snap_locker(m_ictx->snap_lock);
-      if (m_ictx->object_map.enabled()) {
+      if (m_ictx->object_map != nullptr) {
         bool copy_on_read = m_pending_requests.empty();
-        if (!m_ictx->image_watcher->is_lock_owner()) {
-          ldout(m_ictx->cct, 20) << "exclusive lock not held for copyup request"
-                                 << dendl;
-          assert(copy_on_read);
-          return true;
-        }
+        assert(m_ictx->exclusive_lock->is_lock_owner());
 
         RWLock::WLocker object_map_locker(m_ictx->object_map_lock);
-        if (copy_on_read && m_ictx->object_map[m_object_no] != OBJECT_EXISTS) {
+        if (copy_on_read &&
+            (*m_ictx->object_map)[m_object_no] != OBJECT_EXISTS) {
           // CoW already updates the HEAD object map
           m_snap_ids.push_back(CEPH_NOSNAP);
         }
@@ -310,15 +297,10 @@ private:
         boost::lambda::_1, m_ictx, m_object_no, &m_snap_ids,
         boost::lambda::_2));
       AsyncObjectThrottle<> *throttle = new AsyncObjectThrottle<>(
-        NULL, *m_ictx, context_factory, create_callback_context(), NULL, 0,
-        m_snap_ids.size());
+        NULL, *m_ictx, context_factory, util::create_context_callback(this),
+        NULL, 0, m_snap_ids.size());
       throttle->start_ops(m_ictx->concurrent_management_ops);
     }
     return false;
   }
-
-  Context *CopyupRequest::create_callback_context()
-  {
-    return new FunctionContext(boost::bind(&CopyupRequest::complete, this, _1));
-  }
 }
diff --git a/src/librbd/CopyupRequest.h b/src/librbd/CopyupRequest.h
index e3a7cdb..4d971d8 100644
--- a/src/librbd/CopyupRequest.h
+++ b/src/librbd/CopyupRequest.h
@@ -23,7 +23,8 @@ namespace librbd {
     void append_request(AioObjectRequest *req);
 
     void send();
-    void queue_send();
+
+    void complete(int r);
 
   private:
     /**
@@ -74,15 +75,12 @@ namespace librbd {
 
     void complete_requests(int r);
 
-    void complete(int r);
     bool should_complete(int r);
 
     void remove_from_list();
 
     bool send_object_map();
     bool send_copyup();
-
-    Context *create_callback_context();
   };
 }
 
diff --git a/src/librbd/DiffIterate.cc b/src/librbd/DiffIterate.cc
index ae52db3..9e4bc17 100644
--- a/src/librbd/DiffIterate.cc
+++ b/src/librbd/DiffIterate.cc
@@ -4,6 +4,8 @@
 #include "librbd/DiffIterate.h"
 #include "librbd/ImageCtx.h"
 #include "librbd/internal.h"
+#include "librbd/ObjectMap.h"
+#include "librbd/Utils.h"
 #include "include/rados/librados.hpp"
 #include "include/interval_set.h"
 #include "common/errno.h"
@@ -61,7 +63,7 @@ public:
   void send() {
     C_OrderedThrottle *ctx = m_diff_context.throttle.start_op(this);
     librados::AioCompletion *rados_completion =
-      librados::Rados::aio_create_completion(ctx, NULL, rados_ctx_cb);
+      util::create_rados_safe_callback(ctx);
 
     librados::ObjectReadOperation op;
     op.list_snaps(&m_snap_set, &m_snap_ret);
diff --git a/src/librbd/ExclusiveLock.cc b/src/librbd/ExclusiveLock.cc
new file mode 100644
index 0000000..fd362ef
--- /dev/null
+++ b/src/librbd/ExclusiveLock.cc
@@ -0,0 +1,531 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/ExclusiveLock.h"
+#include "cls/lock/cls_lock_client.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "librbd/AioImageRequestWQ.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageWatcher.h"
+#include "librbd/Utils.h"
+#include "librbd/exclusive_lock/AcquireRequest.h"
+#include "librbd/exclusive_lock/ReleaseRequest.h"
+#include <sstream>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::ExclusiveLock: "
+
+namespace librbd {
+
+using namespace exclusive_lock;
+
+namespace {
+
+const std::string WATCHER_LOCK_COOKIE_PREFIX = "auto";
+
+template <typename I>  // Context that sends a ReleaseRequest when it is completed.
+struct C_SendReleaseRequest : public Context {
+  ReleaseRequest<I>* request;  // request whose send() is invoked from finish()
+  C_SendReleaseRequest(ReleaseRequest<I>* request) : request(request) {
+  }
+  virtual void finish(int r) override {
+    request->send();  // r is ignored
+  }
+};
+
+} // anonymous namespace
+
+template <typename I>
+const std::string ExclusiveLock<I>::WATCHER_LOCK_TAG("internal");
+
+template <typename I>  // Starts in STATE_UNINITIALIZED with no watch handle recorded.
+ExclusiveLock<I>::ExclusiveLock(I &image_ctx)
+  : m_image_ctx(image_ctx),
+    m_lock(util::unique_lock_name("librbd::ExclusiveLock::m_lock", this)),
+    m_state(STATE_UNINITIALIZED), m_watch_handle(0) {
+}
+
+template <typename I>  // Must only be destroyed while UNINITIALIZED or after reaching SHUTDOWN.
+ExclusiveLock<I>::~ExclusiveLock() {
+  assert(m_state == STATE_UNINITIALIZED || m_state == STATE_SHUTDOWN);
+}
+
+template <typename I>  // Reports whether m_state is one of the lock-owning states.
+bool ExclusiveLock<I>::is_lock_owner() const {
+  Mutex::Locker locker(m_lock);
+
+  bool lock_owner;
+  switch (m_state) {
+  case STATE_LOCKED:
+  case STATE_POST_ACQUIRING:  // set by handle_acquiring_lock() once the lock is owned
+  case STATE_PRE_RELEASING:  // release requested; handle_releasing_lock() has not run yet
+    lock_owner = true;
+    break;
+  default:
+    lock_owner = false;
+    break;
+  }
+
+  ldout(m_image_ctx.cct, 20) << this << " " << __func__ << "=" << lock_owner
+                             << dendl;
+  return lock_owner;
+}
+
+template <typename I>  // True only when LOCKED and no shut down is queued or finished.
+bool ExclusiveLock<I>::accept_requests() const {
+  Mutex::Locker locker(m_lock);  // is_shutdown() asserts that m_lock is held
+
+  bool accept_requests = (!is_shutdown() && m_state == STATE_LOCKED);
+  ldout(m_image_ctx.cct, 20) << this << " " << __func__ << "="
+                             << accept_requests << dendl;
+  return accept_requests;
+}
+
+template <typename I>  // Enters INITIALIZING and hands a C_InitComplete to block_writes().
+void ExclusiveLock<I>::init(Context *on_init) {
+  assert(m_image_ctx.owner_lock.is_locked());  // caller must hold owner_lock
+  ldout(m_image_ctx.cct, 10) << this << " " << __func__ << dendl;
+
+  {
+    Mutex::Locker locker(m_lock);
+    assert(m_state == STATE_UNINITIALIZED);  // must not have been initialized already
+    m_state = STATE_INITIALIZING;
+  }
+
+  m_image_ctx.aio_work_queue->block_writes(new C_InitComplete(this, on_init));
+}
+
+template <typename I>  // Queues ACTION_SHUT_DOWN; on_shut_down is fired by complete_shutdown().
+void ExclusiveLock<I>::shut_down(Context *on_shut_down) {
+  ldout(m_image_ctx.cct, 10) << this << " " << __func__ << dendl;
+
+  Mutex::Locker locker(m_lock);
+  assert(!is_shutdown());  // a shut down must not already be queued or finished
+  execute_action(ACTION_SHUT_DOWN, on_shut_down);
+}
+
+template <typename I>  // Queues ACTION_TRY_LOCK unless already LOCKED with nothing pending.
+void ExclusiveLock<I>::try_lock(Context *on_tried_lock) {
+  {
+    Mutex::Locker locker(m_lock);
+    assert(m_image_ctx.owner_lock.is_wlocked());  // caller must hold owner_lock for write
+    assert(!is_shutdown());
+
+    if (m_state != STATE_LOCKED || !m_actions_contexts.empty()) {
+      ldout(m_image_ctx.cct, 10) << this << " " << __func__ << dendl;
+      execute_action(ACTION_TRY_LOCK, on_tried_lock);
+      return;
+    }
+  }
+
+  on_tried_lock->complete(0);  // already locked: complete immediately, outside m_lock
+}
+
+template <typename I>  // Queues ACTION_REQUEST_LOCK unless already LOCKED with nothing pending.
+void ExclusiveLock<I>::request_lock(Context *on_locked) {
+  {
+    Mutex::Locker locker(m_lock);
+    assert(m_image_ctx.owner_lock.is_locked());  // caller must hold owner_lock
+    assert(!is_shutdown());
+    if (m_state != STATE_LOCKED || !m_actions_contexts.empty()) {
+      ldout(m_image_ctx.cct, 10) << this << " " << __func__ << dendl;
+      execute_action(ACTION_REQUEST_LOCK, on_locked);
+      return;
+    }
+  }
+
+  if (on_locked != nullptr) {  // handle_release_lock() passes nullptr
+    on_locked->complete(0);
+  }
+}
+
+template <typename I>  // Queues ACTION_RELEASE_LOCK unless already UNLOCKED with nothing pending.
+void ExclusiveLock<I>::release_lock(Context *on_released) {
+  {
+    Mutex::Locker locker(m_lock);
+    assert(m_image_ctx.owner_lock.is_locked());  // caller must hold owner_lock
+    assert(!is_shutdown());
+
+    if (m_state != STATE_UNLOCKED || !m_actions_contexts.empty()) {
+      ldout(m_image_ctx.cct, 10) << this << " " << __func__ << dendl;
+      execute_action(ACTION_RELEASE_LOCK, on_released);
+      return;
+    }
+  }
+
+  on_released->complete(0);  // NOTE(review): unlike request_lock(), no nullptr check here
+}
+
+template <typename I>  // Re-runs the pending lock request; no-op unless WAITING_FOR_PEER.
+void ExclusiveLock<I>::handle_lock_released() {
+  Mutex::Locker locker(m_lock);
+  if (m_state != STATE_WAITING_FOR_PEER) {
+    return;
+  }
+
+  ldout(m_image_ctx.cct, 10) << this << " " << __func__ << dendl;
+  assert(get_active_action() == ACTION_REQUEST_LOCK);  // only request_lock enters this state
+  execute_next_action();  // dispatches to send_acquire_lock()
+}
+
+template <typename I>  // Adds a cls_lock assert_locked step for our cookie and tag to op.
+void ExclusiveLock<I>::assert_header_locked(librados::ObjectWriteOperation *op) {
+  Mutex::Locker locker(m_lock);  // encode_lock_cookie() asserts that m_lock is held
+  rados::cls::lock::assert_locked(op, RBD_LOCK_NAME, LOCK_EXCLUSIVE,
+                                  encode_lock_cookie(), WATCHER_LOCK_TAG);
+}
+
+template <typename I>  // Builds the cookie string "<prefix> <watch handle>".
+std::string ExclusiveLock<I>::encode_lock_cookie() const {
+  assert(m_lock.is_locked());
+
+  assert(m_watch_handle != 0);  // a watch handle must have been recorded first
+  std::ostringstream ss;
+  ss << WATCHER_LOCK_COOKIE_PREFIX << " " << m_watch_handle;
+  return ss.str();
+}
+
+template <typename I>  // Parses "<prefix> <handle>" into *handle; inverse of encode_lock_cookie().
+bool ExclusiveLock<I>::decode_lock_cookie(const std::string &tag,
+                                          uint64_t *handle) {
+  std::string prefix;
+  std::istringstream ss(tag);  // NOTE(review): parameter is named 'cookie' in the header
+  if (!(ss >> prefix >> *handle) || prefix != WATCHER_LOCK_COOKIE_PREFIX) {
+    return false;  // extraction failed or the prefix does not match
+  }
+  return true;
+}
+
+template <typename I>  // True for the intermediate states in which no new action is started.
+bool ExclusiveLock<I>::is_transition_state() const {
+  switch (m_state) {
+  case STATE_INITIALIZING:
+  case STATE_ACQUIRING:
+  case STATE_WAITING_FOR_PEER:
+  case STATE_POST_ACQUIRING:
+  case STATE_PRE_RELEASING:
+  case STATE_RELEASING:
+  case STATE_SHUTTING_DOWN:
+    return true;
+  case STATE_UNINITIALIZED:
+  case STATE_UNLOCKED:
+  case STATE_LOCKED:
+  case STATE_SHUTDOWN:
+    break;  // every enumerator is listed and there is no default case
+  }
+  return false;
+}
+
+template <typename I>  // Attaches ctx to the queued entry for action, creating one if needed.
+void ExclusiveLock<I>::append_context(Action action, Context *ctx) {
+  assert(m_lock.is_locked());
+
+  for (auto &action_ctxs : m_actions_contexts) {
+    if (action == action_ctxs.first) {  // same action already queued: share its entry
+      if (ctx != nullptr) {
+        action_ctxs.second.push_back(ctx);
+      }
+      return;
+    }
+  }
+
+  Contexts contexts;
+  if (ctx != nullptr) {  // a nullptr ctx still queues the action, with no callback
+    contexts.push_back(ctx);
+  }
+  m_actions_contexts.push_back({action, std::move(contexts)});
+}
+
+template <typename I>  // Queues the action and runs the queue front unless mid-transition.
+void ExclusiveLock<I>::execute_action(Action action, Context *ctx) {
+  assert(m_lock.is_locked());
+
+  append_context(action, ctx);
+  if (!is_transition_state()) {  // otherwise complete_active_action() picks it up later
+    execute_next_action();
+  }
+}
+
+template <typename I>  // Dispatches the front queued action to its send_* handler.
+void ExclusiveLock<I>::execute_next_action() {
+  assert(m_lock.is_locked());
+  assert(!m_actions_contexts.empty());
+  switch (get_active_action()) {
+  case ACTION_TRY_LOCK:
+  case ACTION_REQUEST_LOCK:  // both lock actions start with the same acquire request
+    send_acquire_lock();
+    break;
+  case ACTION_RELEASE_LOCK:
+    send_release_lock();
+    break;
+  case ACTION_SHUT_DOWN:
+    send_shutdown();
+    break;
+  default:
+    assert(false);  // all Action enumerators are handled above
+    break;
+  }
+}
+
+template <typename I>  // Returns the action at the front of the queue.
+typename ExclusiveLock<I>::Action ExclusiveLock<I>::get_active_action() const {
+  assert(m_lock.is_locked());
+  assert(!m_actions_contexts.empty());  // callers must ensure an action is queued
+  return m_actions_contexts.front().first;
+}
+
+template <typename I>  // Pops the front action, sets m_state and fires its contexts with r.
+void ExclusiveLock<I>::complete_active_action(State next_state, int r) {
+  assert(m_lock.is_locked());
+  assert(!m_actions_contexts.empty());
+
+  ActionContexts action_contexts(std::move(m_actions_contexts.front()));
+  m_actions_contexts.pop_front();
+  m_state = next_state;
+
+  m_lock.Unlock();  // callbacks are invoked without m_lock held
+  for (auto ctx : action_contexts.second) {
+    ctx->complete(r);
+  }
+  m_lock.Lock();  // returns to the caller with m_lock held again
+
+  if (!is_transition_state() && !m_actions_contexts.empty()) {
+    execute_next_action();  // start whatever action is queued next
+  }
+}
+
+template <typename I>  // True once SHUTDOWN, or when the last queued action is a shut down.
+bool ExclusiveLock<I>::is_shutdown() const {
+  assert(m_lock.is_locked());
+
+  return ((m_state == STATE_SHUTDOWN) ||
+          (!m_actions_contexts.empty() &&
+           m_actions_contexts.back().first == ACTION_SHUT_DOWN));
+}
+
+template <typename I>  // Called by C_InitComplete when r == 0; moves to STATE_UNLOCKED.
+void ExclusiveLock<I>::handle_init_complete() {
+  ldout(m_image_ctx.cct, 10) << this << " " << __func__ << dendl;
+
+  Mutex::Locker locker(m_lock);
+  m_state = STATE_UNLOCKED;
+}
+
+template <typename I>  // Starts an AcquireRequest, or completes at once if already LOCKED.
+void ExclusiveLock<I>::send_acquire_lock() {
+  assert(m_lock.is_locked());
+  if (m_state == STATE_LOCKED) {
+    complete_active_action(STATE_LOCKED, 0);
+    return;
+  }
+
+  ldout(m_image_ctx.cct, 10) << this << " " << __func__ << dendl;
+  m_state = STATE_ACQUIRING;
+
+  m_watch_handle = m_image_ctx.image_watcher->get_watch_handle();  // used in the cookie
+
+  using el = ExclusiveLock<I>;
+  AcquireRequest<I>* req = AcquireRequest<I>::create(
+    m_image_ctx, encode_lock_cookie(),
+    util::create_context_callback<el, &el::handle_acquiring_lock>(this),
+    util::create_context_callback<el, &el::handle_acquire_lock>(this));
+
+  m_lock.Unlock();  // the request is sent without m_lock held
+  req->send();
+  m_lock.Lock();  // callers expect m_lock to still be held on return
+}
+
+template <typename I>  // AcquireRequest callback: ACQUIRING -> POST_ACQUIRING.
+void ExclusiveLock<I>::handle_acquiring_lock(int r) {
+  Mutex::Locker locker(m_lock);  // m_state is read under m_lock by is_lock_owner()
+  ldout(m_image_ctx.cct, 10) << this << " " << __func__ << dendl;
+
+  assert(r == 0);
+  assert(m_state == STATE_ACQUIRING);
+
+  // lock is owned at this point
+  m_state = STATE_POST_ACQUIRING;
+}
+
+template <typename I>  // AcquireRequest completion: settles the state and finishes the action.
+void ExclusiveLock<I>::handle_acquire_lock(int r) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 10) << this << " " << __func__ << ": r=" << r << dendl;
+
+  if (r == -EBUSY || r == -EAGAIN) {
+    ldout(cct, 5) << "unable to acquire exclusive lock" << dendl;
+  } else if (r < 0) {
+    lderr(cct) << "failed to acquire exclusive lock: " << cpp_strerror(r)
+               << dendl;
+  } else {
+    ldout(cct, 5) << "successfully acquired exclusive lock" << dendl;
+  }
+
+  {
+    m_lock.Lock();  // locked manually: both paths below must unlock before calling out
+    assert(m_state == STATE_ACQUIRING ||
+           m_state == STATE_POST_ACQUIRING);
+
+    Action action = get_active_action();
+    assert(action == ACTION_TRY_LOCK || action == ACTION_REQUEST_LOCK);
+    if (action == ACTION_REQUEST_LOCK && r < 0 && r != -EBLACKLISTED) {
+      m_state = STATE_WAITING_FOR_PEER;  // action stays queued for handle_lock_released()
+      m_lock.Unlock();
+
+      // request the lock from a peer
+      m_image_ctx.image_watcher->notify_request_lock();
+      return;
+    }
+    m_lock.Unlock();
+  }
+
+  State next_state = (r < 0 ? STATE_UNLOCKED : STATE_LOCKED);
+  if (r == -EAGAIN) {
+    r = 0;  // contexts see success although next_state remains UNLOCKED
+  }
+
+  if (next_state == STATE_LOCKED) {
+    m_image_ctx.image_watcher->notify_acquired_lock();
+    m_image_ctx.aio_work_queue->unblock_writes();
+  }
+
+  Mutex::Locker locker(m_lock);  // complete_active_action() requires m_lock
+  complete_active_action(next_state, r);
+}
+
+template <typename I>  // Starts a ReleaseRequest, or completes at once if already UNLOCKED.
+void ExclusiveLock<I>::send_release_lock() {
+  assert(m_lock.is_locked());
+  if (m_state == STATE_UNLOCKED) {
+    complete_active_action(STATE_UNLOCKED, 0);
+    return;
+  }
+
+  ldout(m_image_ctx.cct, 10) << this << " " << __func__ << dendl;
+  m_state = STATE_PRE_RELEASING;
+
+  using el = ExclusiveLock<I>;
+  ReleaseRequest<I>* req = ReleaseRequest<I>::create(
+    m_image_ctx, encode_lock_cookie(),
+    util::create_context_callback<el, &el::handle_releasing_lock>(this),
+    util::create_context_callback<el, &el::handle_release_lock>(this));
+
+  // send in alternate thread context to avoid re-entrant locking
+  m_image_ctx.op_work_queue->queue(new C_SendReleaseRequest<I>(req), 0);
+}
+
+template <typename I>  // ReleaseRequest callback: PRE_RELEASING -> RELEASING.
+void ExclusiveLock<I>::handle_releasing_lock(int r) {
+  Mutex::Locker locker(m_lock);
+  ldout(m_image_ctx.cct, 10) << this << " " << __func__ << dendl;
+
+  assert(r == 0);
+  assert(m_state == STATE_PRE_RELEASING);
+
+  // all IO and ops should be blocked/canceled by this point
+  m_state = STATE_RELEASING;  // is_lock_owner() reports false from here on
+}
+
+template <typename I>  // ReleaseRequest completion: finishes the action, may re-request the lock.
+void ExclusiveLock<I>::handle_release_lock(int r) {
+  bool pending_writes = false;
+  {
+    Mutex::Locker locker(m_lock);
+    ldout(m_image_ctx.cct, 10) << this << " " << __func__ << ": r=" << r
+                               << dendl;
+
+    assert(m_state == STATE_PRE_RELEASING ||
+           m_state == STATE_RELEASING);
+    if (r >= 0) {
+      m_lock.Unlock();  // notify and queue check run without m_lock held
+      m_image_ctx.image_watcher->notify_released_lock();
+      pending_writes = !m_image_ctx.aio_work_queue->writes_empty();
+      m_lock.Lock();
+
+      m_watch_handle = 0;  // encode_lock_cookie() asserts on a zero handle
+    }
+    complete_active_action(r < 0 ? STATE_LOCKED : STATE_UNLOCKED, r);  // failure => LOCKED
+  }
+
+  if (r >= 0 && pending_writes) {
+    // if we have pending writes -- re-request the lock
+    RWLock::RLocker owner_locker(m_image_ctx.owner_lock);  // request_lock() asserts on it
+    request_lock(nullptr);
+  }
+}
+
+template <typename I>  // Enters SHUTTING_DOWN; releases the lock first if it is held.
+void ExclusiveLock<I>::send_shutdown() {
+  assert(m_lock.is_locked());
+  if (m_state == STATE_UNLOCKED) {  // nothing to release: finish via op_work_queue
+    m_state = STATE_SHUTTING_DOWN;
+    m_image_ctx.aio_work_queue->unblock_writes();
+    m_image_ctx.op_work_queue->queue(util::create_context_callback<
+      ExclusiveLock<I>, &ExclusiveLock<I>::complete_shutdown>(this), 0);
+    return;
+  }
+
+  ldout(m_image_ctx.cct, 10) << this << " " << __func__ << dendl;
+  assert(m_state == STATE_LOCKED);  // the only other state this is expected to run from
+  m_state = STATE_SHUTTING_DOWN;
+
+  m_lock.Unlock();
+  m_image_ctx.op_work_queue->queue(new C_ShutDownRelease(this), 0);  // -> send_shutdown_release()
+  m_lock.Lock();
+}
+
+template <typename I>  // Sends the ReleaseRequest used during shut down.
+void ExclusiveLock<I>::send_shutdown_release() {
+  std::string cookie;
+  {
+    Mutex::Locker locker(m_lock);  // encode_lock_cookie() asserts that m_lock is held
+    cookie = encode_lock_cookie();
+  }
+
+  using el = ExclusiveLock<I>;
+  ReleaseRequest<I>* req = ReleaseRequest<I>::create(
+    m_image_ctx, cookie, nullptr,  // nullptr: no intermediate "releasing" callback
+    util::create_context_callback<el, &el::handle_shutdown>(this));
+  req->send();
+}
+
+template <typename I>  // Shut-down ReleaseRequest completion; always ends in complete_shutdown().
+void ExclusiveLock<I>::handle_shutdown(int r) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 10) << this << " " << __func__ << ": r=" << r << dendl;
+
+  if (r < 0) {  // on failure writes are not unblocked, but shut down still completes
+    lderr(cct) << "failed to shut down exclusive lock: " << cpp_strerror(r)
+               << dendl;
+  } else {
+    m_image_ctx.aio_work_queue->unblock_writes();
+  }
+
+  m_image_ctx.image_watcher->notify_released_lock();  // sent regardless of r
+  complete_shutdown(r);
+}
+
+template <typename I>  // Pops the last action, enters SHUTDOWN and fires its contexts with r.
+void ExclusiveLock<I>::complete_shutdown(int r) {
+  ActionContexts action_contexts;
+  {
+    Mutex::Locker locker(m_lock);
+    assert(m_lock.is_locked());  // redundant: the locker above already holds m_lock
+    assert(m_actions_contexts.size() == 1);  // only the shut down action may remain
+
+    action_contexts = std::move(m_actions_contexts.front());
+    m_actions_contexts.pop_front();
+    m_state = STATE_SHUTDOWN;
+  }
+
+  // expect to be destroyed after firing callback
+  for (auto ctx : action_contexts.second) {
+    ctx->complete(r);  // no member is touched after this loop
+  }
+}
+
+} // namespace librbd
+
+template class librbd::ExclusiveLock<librbd::ImageCtx>;
diff --git a/src/librbd/ExclusiveLock.h b/src/librbd/ExclusiveLock.h
new file mode 100644
index 0000000..f29ae59
--- /dev/null
+++ b/src/librbd/ExclusiveLock.h
@@ -0,0 +1,162 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_EXCLUSIVE_LOCK_H
+#define CEPH_LIBRBD_EXCLUSIVE_LOCK_H
+
+#include "include/int_types.h"
+#include "include/Context.h"
+#include "include/rados/librados.hpp"
+#include "common/Mutex.h"
+#include "common/RWLock.h"
+#include <list>
+#include <string>
+#include <utility>
+
+namespace librbd {
+
+class ImageCtx;
+
+template <typename ImageCtxT = ImageCtx>  // state machine for the image's exclusive lock
+class ExclusiveLock {
+public:
+  static const std::string WATCHER_LOCK_TAG;  // defined as "internal" in ExclusiveLock.cc
+
+  static ExclusiveLock *create(ImageCtxT &image_ctx) {
+    return new ExclusiveLock<ImageCtxT>(image_ctx);  // caller owns the returned object
+  }
+
+  ExclusiveLock(ImageCtxT &image_ctx);
+  ~ExclusiveLock();  // asserts the state is UNINITIALIZED or SHUTDOWN
+
+  bool is_lock_owner() const;  // true in LOCKED, POST_ACQUIRING and PRE_RELEASING
+  bool accept_requests() const;  // true only in LOCKED with no shut down queued
+
+  void init(Context *on_init);  // caller must hold owner_lock
+  void shut_down(Context *on_shutdown);  // queues ACTION_SHUT_DOWN
+
+  void try_lock(Context *on_tried_lock);  // caller must hold owner_lock for write
+  void request_lock(Context *on_locked);  // on_locked may be nullptr
+  void release_lock(Context *on_released);  // caller must hold owner_lock
+
+  void handle_lock_released();  // retries a request that is in WAITING_FOR_PEER
+
+  void assert_header_locked(librados::ObjectWriteOperation *op);
+
+  static bool decode_lock_cookie(const std::string &cookie, uint64_t *handle);
+
+private:
+
+  /**
+   * <start>                               WAITING_FOR_PEER -----------------\
+   *    |                                     ^                              |
+   *    |                                     *  (request_lock busy)         |
+   *    |                                     * * * * * * * * * * * *        |
+   *    |                                                           *        |
+   *    v            (init)            (try_lock/request_lock)      *        |
+   * UNINITIALIZED  -------> UNLOCKED ------------------------> ACQUIRING <--/
+   *                            ^                                   |
+   *                            |                                   v
+   *                         RELEASING                        POST_ACQUIRING
+   *                            |                                   |
+   *                            |                                   |
+   *                            |          (release_lock)           v
+   *                      PRE_RELEASING <------------------------ LOCKED
+   *
+   * <UNLOCKED/LOCKED states>
+   *    |
+   *    |
+   *    v
+   * SHUTTING_DOWN ---> SHUTDOWN ---> <finish>
+   */
+  enum State {
+    STATE_UNINITIALIZED,
+    STATE_UNLOCKED,
+    STATE_LOCKED,
+    STATE_INITIALIZING,  // not drawn above: set by init() until C_InitComplete runs
+    STATE_ACQUIRING,
+    STATE_POST_ACQUIRING,
+    STATE_WAITING_FOR_PEER,
+    STATE_PRE_RELEASING,
+    STATE_RELEASING,
+    STATE_SHUTTING_DOWN,
+    STATE_SHUTDOWN,
+  };
+
+  enum Action {
+    ACTION_TRY_LOCK,
+    ACTION_REQUEST_LOCK,
+    ACTION_RELEASE_LOCK,
+    ACTION_SHUT_DOWN
+  };
+
+  typedef std::list<Context *> Contexts;  // callbacks waiting on one action
+  typedef std::pair<Action, Contexts> ActionContexts;  // an action plus its callbacks
+  typedef std::list<ActionContexts> ActionsContexts;  // queue of pending actions
+
+  struct C_InitComplete : public Context {  // passed to block_writes() by init()
+    ExclusiveLock *exclusive_lock;
+    Context *on_init;
+    C_InitComplete(ExclusiveLock *exclusive_lock, Context *on_init)
+      : exclusive_lock(exclusive_lock), on_init(on_init) {
+    }
+    virtual void finish(int r) override {
+      if (r == 0) {  // the state only advances to UNLOCKED on success
+        exclusive_lock->handle_init_complete();
+      }
+      on_init->complete(r);
+    }
+  };
+
+  struct C_ShutDownRelease : public Context {  // queued by send_shutdown()
+    ExclusiveLock *exclusive_lock;
+    C_ShutDownRelease(ExclusiveLock *exclusive_lock)
+      : exclusive_lock(exclusive_lock) {
+    }
+    virtual void finish(int r) override {
+      exclusive_lock->send_shutdown_release();  // r is ignored
+    }
+  };
+
+  ImageCtxT &m_image_ctx;
+
+  mutable Mutex m_lock;  // private helpers assert it is held; mutable for const methods
+  State m_state;
+  uint64_t m_watch_handle;  // embedded in the lock cookie; 0 when unset
+
+  ActionsContexts m_actions_contexts;  // front() is the active action
+
+  std::string encode_lock_cookie() const;
+
+  bool is_transition_state() const;
+
+  void append_context(Action action, Context *ctx);
+  void execute_action(Action action, Context *ctx);
+  void execute_next_action();
+
+  Action get_active_action() const;
+  void complete_active_action(State next_state, int r);
+
+  bool is_shutdown() const;
+
+  void handle_init_complete();
+
+  void send_acquire_lock();
+  void handle_acquiring_lock(int r);
+  void handle_acquire_lock(int r);
+
+  void send_release_lock();
+  void handle_releasing_lock(int r);
+  void handle_release_lock(int r);
+
+  void send_shutdown();
+  void send_shutdown_release();
+  void handle_shutdown(int r);
+  void complete_shutdown(int r);
+};
+
+} // namespace librbd
+
+extern template class librbd::ExclusiveLock<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_EXCLUSIVE_LOCK_H
diff --git a/src/librbd/ImageCtx.cc b/src/librbd/ImageCtx.cc
index 8fd13e4..154c15a 100644
--- a/src/librbd/ImageCtx.cc
+++ b/src/librbd/ImageCtx.cc
@@ -11,15 +11,18 @@
 #include "common/WorkQueue.h"
 
 #include "librbd/AioImageRequestWQ.h"
+#include "librbd/AioCompletion.h"
 #include "librbd/AsyncOperation.h"
 #include "librbd/AsyncRequest.h"
-#include "librbd/AsyncResizeRequest.h"
 #include "librbd/internal.h"
 #include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
 #include "librbd/ImageWatcher.h"
 #include "librbd/Journal.h"
 #include "librbd/LibrbdAdminSocketHook.h"
 #include "librbd/ObjectMap.h"
+#include "librbd/operation/ResizeRequest.h"
+#include "librbd/Utils.h"
 
 #include <boost/bind.hpp>
 
@@ -67,6 +70,19 @@ struct C_FlushCache : public Context {
   }
 };
 
+struct C_ShutDownCache : public Context {  // stops the object cacher, then chains on_finish
+  ImageCtx *image_ctx;
+  Context *on_finish;
+
+  C_ShutDownCache(ImageCtx *_image_ctx, Context *_on_finish)
+    : image_ctx(_image_ctx), on_finish(_on_finish) {
+  }
+  virtual void finish(int r) {
+    image_ctx->object_cacher->stop();
+    on_finish->complete(r);  // r is forwarded unchanged
+  }
+};
+
 struct C_InvalidateCache : public Context {
   ImageCtx *image_ctx;
   bool purge_on_error;
@@ -128,17 +144,15 @@ struct C_InvalidateCache : public Context {
       name(image_name),
       image_watcher(NULL),
       journal(NULL),
-      refresh_seq(0),
-      last_refresh(0),
-      owner_lock(unique_lock_name("librbd::ImageCtx::owner_lock", this)),
-      md_lock(unique_lock_name("librbd::ImageCtx::md_lock", this)),
-      cache_lock(unique_lock_name("librbd::ImageCtx::cache_lock", this)),
-      snap_lock(unique_lock_name("librbd::ImageCtx::snap_lock", this)),
-      parent_lock(unique_lock_name("librbd::ImageCtx::parent_lock", this)),
-      refresh_lock(unique_lock_name("librbd::ImageCtx::refresh_lock", this)),
-      object_map_lock(unique_lock_name("librbd::ImageCtx::object_map_lock", this)),
-      async_ops_lock(unique_lock_name("librbd::ImageCtx::async_ops_lock", this)),
-      copyup_list_lock(unique_lock_name("librbd::ImageCtx::copyup_list_lock", this)),
+      owner_lock(util::unique_lock_name("librbd::ImageCtx::owner_lock", this)),
+      md_lock(util::unique_lock_name("librbd::ImageCtx::md_lock", this)),
+      cache_lock(util::unique_lock_name("librbd::ImageCtx::cache_lock", this)),
+      snap_lock(util::unique_lock_name("librbd::ImageCtx::snap_lock", this)),
+      parent_lock(util::unique_lock_name("librbd::ImageCtx::parent_lock", this)),
+      object_map_lock(util::unique_lock_name("librbd::ImageCtx::object_map_lock", this)),
+      async_ops_lock(util::unique_lock_name("librbd::ImageCtx::async_ops_lock", this)),
+      copyup_list_lock(util::unique_lock_name("librbd::ImageCtx::copyup_list_lock", this)),
+      completed_reqs_lock(util::unique_lock_name("librbd::ImageCtx::completed_reqs_lock", this)),
       extra_read_flags(0),
       old_format(true),
       order(0), size(0), features(0),
@@ -147,9 +161,10 @@ struct C_InvalidateCache : public Context {
       stripe_unit(0), stripe_count(0), flags(0),
       object_cacher(NULL), writeback_handler(NULL), object_set(NULL),
       readahead(),
-      total_bytes_read(0), copyup_finisher(NULL),
-      object_map(*this), aio_work_queue(NULL), op_work_queue(NULL),
-      refresh_in_progress(false), asok_hook(new LibrbdAdminSocketHook(this))
+      total_bytes_read(0),
+      state(new ImageState<>(this)), exclusive_lock(nullptr),
+      object_map(nullptr), aio_work_queue(NULL), op_work_queue(NULL),
+      asok_hook(new LibrbdAdminSocketHook(this))
   {
     md_ctx.dup(p);
     data_ctx.dup(p);
@@ -171,7 +186,11 @@ struct C_InvalidateCache : public Context {
   }
 
   ImageCtx::~ImageCtx() {
+    assert(image_watcher == NULL);
+    assert(exclusive_lock == NULL);
+    assert(object_map == NULL);
     assert(journal == NULL);
+
     if (perfcounter) {
       perf_stop();
     }
@@ -187,64 +206,26 @@ struct C_InvalidateCache : public Context {
       delete object_set;
       object_set = NULL;
     }
-    if (copyup_finisher != NULL) {
-      delete copyup_finisher;
-      copyup_finisher = NULL;
-    }
     delete[] format_string;
 
+    md_ctx.aio_flush();
+    data_ctx.aio_flush();
+    op_work_queue->drain();
+    aio_work_queue->drain();
+
     delete op_work_queue;
     delete aio_work_queue;
-
     delete asok_hook;
+    delete state;
   }
 
-  int ImageCtx::init() {
-    int r;
-
-    if (id.length()) {
-      old_format = false;
-    } else {
-      r = detect_format(md_ctx, name, &old_format, NULL);
-      if (r < 0) {
-	lderr(cct) << "error finding header: " << cpp_strerror(r) << dendl;
-	return r;
-      }
-    }
-
+  void ImageCtx::init() {
+    assert(!header_oid.empty());
+    assert(old_format || !id.empty());
     if (!old_format) {
-      if (!id.length()) {
-	r = cls_client::get_id(&md_ctx, id_obj_name(name), &id);
-	if (r < 0) {
-	  lderr(cct) << "error reading image id: " << cpp_strerror(r)
-		     << dendl;
-	  return r;
-	}
-      }
-
-      header_oid = header_name(id);
-      apply_metadata_confs();
-      r = cls_client::get_immutable_metadata(&md_ctx, header_oid,
-					     &object_prefix, &order);
-      if (r < 0) {
-	lderr(cct) << "error reading immutable metadata: "
-		   << cpp_strerror(r) << dendl;
-	return r;
-      }
-
-      r = cls_client::get_stripe_unit_count(&md_ctx, header_oid,
-					    &stripe_unit, &stripe_count);
-      if (r < 0 && r != -ENOEXEC && r != -EINVAL) {
-	lderr(cct) << "error reading striping metadata: "
-		   << cpp_strerror(r) << dendl;
-	return r;
-      }
-
       init_layout();
-    } else {
-      apply_metadata_confs();
-      header_oid = old_header_name(name);
     }
+    apply_metadata_confs();
 
     string pname = string("librbd-") + id + string("-") +
       data_ctx.get_pool_name() + string("-") + name;
@@ -294,15 +275,8 @@ struct C_InvalidateCache : public Context {
       object_cacher->start();
     }
 
-    if (clone_copy_on_read) {
-      copyup_finisher = new Finisher(cct);
-      copyup_finisher->start();
-    }
-
     readahead.set_trigger_requests(readahead_trigger_requests);
     readahead.set_max_readahead_size(readahead_max_bytes);
-
-    return 0;
   }
 
   void ImageCtx::init_layout()
@@ -401,7 +375,6 @@ struct C_InvalidateCache : public Context {
       snap_name = in_snap_name;
       snap_exists = true;
       data_ctx.snap_set_read(snap_id);
-      object_map.refresh(in_snap_id);
       return 0;
     }
     return -ENOENT;
@@ -414,7 +387,6 @@ struct C_InvalidateCache : public Context {
     snap_name = "";
     snap_exists = true;
     data_ctx.snap_set_read(snap_id);
-    object_map.refresh(CEPH_NOSNAP);
   }
 
   snap_t ImageCtx::get_snap_id(string in_snap_name) const
@@ -541,9 +513,9 @@ struct C_InvalidateCache : public Context {
   {
     assert(snap_lock.is_locked());
     if (in_snap_id == CEPH_NOSNAP) {
-      if (!async_resize_reqs.empty() &&
-          async_resize_reqs.front()->shrinking()) {
-        return async_resize_reqs.front()->get_image_size();
+      if (!resize_reqs.empty() &&
+          resize_reqs.front()->shrinking()) {
+        return resize_reqs.front()->get_image_size();
       }
       return size;
     }
@@ -555,10 +527,17 @@ struct C_InvalidateCache : public Context {
     return 0;
   }
 
-  bool ImageCtx::test_features(uint64_t test_features) const
+  bool ImageCtx::test_features(uint64_t features) const
   {
     RWLock::RLocker l(snap_lock);
-    return ((features & test_features) == test_features);
+    return test_features(features, snap_lock);
+  }
+
+  bool ImageCtx::test_features(uint64_t in_features,
+                               const RWLock &in_snap_lock) const
+  {
+    assert(snap_lock.is_locked());
+    return ((features & in_features) == in_features);
   }
 
   int ImageCtx::get_flags(librados::snap_t _snap_id, uint64_t *_flags) const
@@ -576,12 +555,18 @@ struct C_InvalidateCache : public Context {
     return -ENOENT;
   }
 
-  bool ImageCtx::test_flags(uint64_t test_flags) const
+  bool ImageCtx::test_flags(uint64_t flags) const
   {
     RWLock::RLocker l(snap_lock);
+    return test_flags(flags, snap_lock);
+  }
+
+  bool ImageCtx::test_flags(uint64_t flags, const RWLock &in_snap_lock) const
+  {
+    assert(snap_lock.is_locked());
     uint64_t snap_flags;
     get_flags(snap_id, &snap_flags);
-    return ((snap_flags & test_flags) == test_flags);
+    return ((snap_flags & flags) == flags);
   }
 
   int ImageCtx::update_flags(snap_t in_snap_id, uint64_t flag, bool enabled)
@@ -722,17 +707,6 @@ struct C_InvalidateCache : public Context {
     }
   }
 
-  int ImageCtx::flush_cache() {
-    C_SaferCond cond_ctx;
-    flush_cache(&cond_ctx);
-
-    ldout(cct, 20) << "waiting for cache to be flushed" << dendl;
-    int r = cond_ctx.wait();
-    ldout(cct, 20) << "finished flushing cache" << dendl;
-
-    return r;
-  }
-
   void ImageCtx::flush_cache(Context *onfinish) {
     assert(owner_lock.is_locked());
     cache_lock.Lock();
@@ -740,13 +714,19 @@ struct C_InvalidateCache : public Context {
     cache_lock.Unlock();
   }
 
-  int ImageCtx::shutdown_cache() {
-    flush_async_operations();
+  void ImageCtx::shut_down_cache(Context *on_finish) {
+    if (object_cacher == NULL) {
+      on_finish->complete(0);
+      return;
+    }
 
     RWLock::RLocker owner_locker(owner_lock);
-    int r = invalidate_cache(true);
-    object_cacher->stop();
-    return r;
+    cache_lock.Lock();
+    object_cacher->release_set(object_set);
+    cache_lock.Unlock();
+
+    C_ShutDownCache *shut_down = new C_ShutDownCache(this, on_finish);
+    flush_cache(new C_InvalidateCache(this, true, false, shut_down));
   }
 
   int ImageCtx::invalidate_cache(bool purge_on_error) {
@@ -789,7 +769,6 @@ struct C_InvalidateCache : public Context {
   int ImageCtx::register_watch() {
     assert(image_watcher == NULL);
     image_watcher = new ImageWatcher(*this);
-    aio_work_queue->register_lock_listener();
     return image_watcher->register_watch();
   }
 
@@ -848,6 +827,9 @@ struct C_InvalidateCache : public Context {
   }
 
   void ImageCtx::flush(Context *on_safe) {
+    // ensure no locks are held when flush is complete
+    on_safe = util::create_async_context_callback(*this, on_safe);
+
     assert(owner_lock.is_locked());
     if (object_cacher != NULL) {
       // flush cache after completing all in-flight AIO ops
@@ -857,19 +839,34 @@ struct C_InvalidateCache : public Context {
   }
 
   void ImageCtx::cancel_async_requests() {
-    Mutex::Locker l(async_ops_lock);
-    ldout(cct, 10) << "canceling async requests: count="
-                   << async_requests.size() << dendl;
+    C_SaferCond ctx;
+    cancel_async_requests(&ctx);
+    ctx.wait();
+  }
 
-    for (xlist<AsyncRequest<>*>::iterator it = async_requests.begin();
-         !it.end(); ++it) {
-      ldout(cct, 10) << "canceling async request: " << *it << dendl;
-      (*it)->cancel();
+  void ImageCtx::cancel_async_requests(Context *on_finish) {
+    {
+      Mutex::Locker async_ops_locker(async_ops_lock);
+      if (!async_requests.empty()) {
+        ldout(cct, 10) << "canceling async requests: count="
+                       << async_requests.size() << dendl;
+        for (auto req : async_requests) {
+          ldout(cct, 10) << "canceling async request: " << req << dendl;
+          req->cancel();
+        }
+        async_requests_waiters.push_back(on_finish);
+        return;
+      }
     }
 
-    while (!async_requests.empty()) {
-      async_requests_cond.Wait(async_ops_lock);
-    }
+    on_finish->complete(0);
+  }
+
+  void ImageCtx::clear_pending_completions() {
+    Mutex::Locker l(completed_reqs_lock);
+    ldout(cct, 10) << "clear pending AioCompletion: count="
+                   << completed_reqs.size() << dendl;
+    completed_reqs.clear();
   }
 
   bool ImageCtx::_filter_metadata_confs(const string &prefix, map<string, bool> &configs,
@@ -918,7 +915,14 @@ struct C_InvalidateCache : public Context {
         "rbd_clone_copy_on_read", false)(
         "rbd_blacklist_on_break_lock", false)(
         "rbd_blacklist_expire_seconds", false)(
-        "rbd_request_timed_out_seconds", false);
+        "rbd_request_timed_out_seconds", false)(
+        "rbd_journal_order", false)(
+        "rbd_journal_splay_width", false)(
+        "rbd_journal_commit_age", false)(
+        "rbd_journal_object_flush_interval", false)(
+        "rbd_journal_object_flush_bytes", false)(
+        "rbd_journal_object_flush_age", false)(
+        "rbd_journal_pool", false);
 
     string start = METADATA_CONF_PREFIX;
     int r = 0, j = 0;
@@ -988,25 +992,20 @@ struct C_InvalidateCache : public Context {
     ASSIGN_OPTION(blacklist_expire_seconds);
     ASSIGN_OPTION(request_timed_out_seconds);
     ASSIGN_OPTION(enable_alloc_hint);
+    ASSIGN_OPTION(journal_order);
+    ASSIGN_OPTION(journal_splay_width);
+    ASSIGN_OPTION(journal_commit_age);
+    ASSIGN_OPTION(journal_object_flush_interval);
+    ASSIGN_OPTION(journal_object_flush_bytes);
+    ASSIGN_OPTION(journal_object_flush_age);
+    ASSIGN_OPTION(journal_pool);
   }
 
-  void ImageCtx::open_journal() {
-    assert(journal == NULL);
-    journal = new Journal(*this);
+  ObjectMap *ImageCtx::create_object_map(uint64_t snap_id) {
+    return new ObjectMap(*this, snap_id);
   }
 
-  int ImageCtx::close_journal(bool force) {
-    assert(journal != NULL);
-    int r = journal->close();
-    if (r < 0) {
-      lderr(cct) << "failed to flush journal: " << cpp_strerror(r) << dendl;
-      if (!force) {
-        return r;
-      }
-    }
-
-    delete journal;
-    journal = NULL;
-    return r;
+  Journal *ImageCtx::create_journal() {
+    return new Journal(*this);
   }
 }
diff --git a/src/librbd/ImageCtx.h b/src/librbd/ImageCtx.h
index f61929c..715fcbb 100644
--- a/src/librbd/ImageCtx.h
+++ b/src/librbd/ImageCtx.h
@@ -5,6 +5,7 @@
 
 #include "include/int_types.h"
 
+#include <list>
 #include <map>
 #include <set>
 #include <string>
@@ -12,12 +13,13 @@
 #include <boost/optional.hpp>
 
 #include "common/Cond.h"
+#include "common/event_socket.h"
 #include "common/Mutex.h"
 #include "common/Readahead.h"
 #include "common/RWLock.h"
 #include "common/snap_types.h"
 #include "include/atomic.h"
-#include "include/buffer.h"
+#include "include/buffer_fwd.h"
 #include "include/rbd/librbd.hpp"
 #include "include/rbd_types.h"
 #include "include/types.h"
@@ -25,8 +27,8 @@
 #include "osdc/ObjectCacher.h"
 
 #include "cls/rbd/cls_rbd_client.h"
+#include "librbd/AsyncRequest.h"
 #include "librbd/LibrbdWriteback.h"
-#include "librbd/ObjectMap.h"
 #include "librbd/SnapInfo.h"
 #include "librbd/parent_types.h"
 
@@ -37,14 +39,21 @@ class PerfCounters;
 
 namespace librbd {
 
+  struct ImageCtx;
+  class AioCompletion;
   class AioImageRequestWQ;
   class AsyncOperation;
-  template <typename ImageCtxT> class AsyncRequest;
-  class AsyncResizeRequest;
   class CopyupRequest;
-  class LibrbdAdminSocketHook;
+  template <typename> class ExclusiveLock;
+  template <typename> class ImageState;
   class ImageWatcher;
   class Journal;
+  class LibrbdAdminSocketHook;
+  class ObjectMap;
+
+  namespace operation {
+  template <typename> class ResizeRequest;
+  }
 
   struct ImageCtx {
     CephContext *cct;
@@ -71,14 +80,12 @@ namespace librbd {
     IoCtx data_ctx, md_ctx;
     ImageWatcher *image_watcher;
     Journal *journal;
-    int refresh_seq;    ///< sequence for refresh requests
-    int last_refresh;   ///< last completed refresh
 
     /**
      * Lock ordering:
      *
      * owner_lock, md_lock, cache_lock, snap_lock, parent_lock,
-     * refresh_lock, object_map_lock, async_op_lock
+     * object_map_lock, async_op_lock
      */
     RWLock owner_lock; // protects exclusive lock leadership updates
     RWLock md_lock; // protects access to the mutable image metadata that
@@ -91,12 +98,13 @@ namespace librbd {
                    // lock_tag
                    // lockers
     Mutex cache_lock; // used as client_lock for the ObjectCacher
-    RWLock snap_lock; // protects snapshot-related member variables, features, and flags
+    RWLock snap_lock; // protects snapshot-related member variables,
+                      // features (and associated helper classes), and flags
     RWLock parent_lock; // protects parent_md and parent
-    Mutex refresh_lock; // protects refresh_seq and last_refresh
     RWLock object_map_lock; // protects object map updates and object_map itself
     Mutex async_ops_lock; // protects async_ops and async_requests
     Mutex copyup_list_lock; // protects copyup_waiting_list
+    Mutex completed_reqs_lock; // protects completed_reqs
 
     unsigned extra_read_flags;
 
@@ -122,24 +130,25 @@ namespace librbd {
     Readahead readahead;
     uint64_t total_bytes_read;
 
-    Finisher *copyup_finisher;
     std::map<uint64_t, CopyupRequest*> copyup_list;
 
     xlist<AsyncOperation*> async_ops;
     xlist<AsyncRequest<>*> async_requests;
-    Cond async_requests_cond;
+    std::list<Context*> async_requests_waiters;
 
-    ObjectMap object_map;
+    ImageState<ImageCtx> *state;
+    ExclusiveLock<ImageCtx> *exclusive_lock;
+    ObjectMap *object_map;
 
     atomic_t async_request_seq;
 
-    xlist<AsyncResizeRequest*> async_resize_reqs;
+    xlist<operation::ResizeRequest<ImageCtx>*> resize_reqs;
 
     AioImageRequestWQ *aio_work_queue;
-    ContextWQ *op_work_queue;
+    xlist<AioCompletion*> completed_reqs;
+    EventSocket event_socket;
 
-    Cond refresh_cond;
-    bool refresh_in_progress;
+    ContextWQ *op_work_queue;
 
     // Configuration
     static const string METADATA_CONF_PREFIX;
@@ -165,6 +174,13 @@ namespace librbd {
     uint32_t blacklist_expire_seconds;
     uint32_t request_timed_out_seconds;
     bool enable_alloc_hint;
+    uint8_t journal_order;
+    uint8_t journal_splay_width;
+    double journal_commit_age;
+    int journal_object_flush_interval;
+    uint64_t journal_object_flush_bytes;
+    double journal_object_flush_age;
+    std::string journal_pool;
 
     LibrbdAdminSocketHook *asok_hook;
 
@@ -179,7 +195,7 @@ namespace librbd {
     ImageCtx(const std::string &image_name, const std::string &image_id,
 	     const char *snap, IoCtx& p, bool read_only);
     ~ImageCtx();
-    int init();
+    void init();
     void init_layout();
     void perf_start(std::string name);
     void perf_stop();
@@ -211,8 +227,11 @@ namespace librbd {
     void rm_snap(std::string in_snap_name, librados::snap_t id);
     uint64_t get_image_size(librados::snap_t in_snap_id) const;
     bool test_features(uint64_t test_features) const;
+    bool test_features(uint64_t test_features,
+                       const RWLock &in_snap_lock) const;
     int get_flags(librados::snap_t in_snap_id, uint64_t *flags) const;
     bool test_flags(uint64_t test_flags) const;
+    bool test_flags(uint64_t test_flags, const RWLock &in_snap_lock) const;
     int update_flags(librados::snap_t in_snap_id, uint64_t flag, bool enabled);
 
     const parent_info* get_parent_info(librados::snap_t in_snap_id) const;
@@ -229,9 +248,8 @@ namespace librbd {
 			uint64_t off, Context *onfinish, int fadvise_flags,
                         uint64_t journal_tid);
     void user_flushed();
-    int flush_cache();
     void flush_cache(Context *onfinish);
-    int shutdown_cache();
+    void shut_down_cache(Context *on_finish);
     int invalidate_cache(bool purge_on_error=false);
     void invalidate_cache(Context *on_finish);
     void clear_nonexistence_cache();
@@ -247,10 +265,14 @@ namespace librbd {
     void flush(Context *on_safe);
 
     void cancel_async_requests();
+    void cancel_async_requests(Context *on_finish);
+
     void apply_metadata_confs();
 
-    void open_journal();
-    int close_journal(bool force);
+    ObjectMap *create_object_map(uint64_t snap_id);
+    Journal *create_journal();
+
+    void clear_pending_completions();
   };
 }
 
diff --git a/src/librbd/ImageState.cc b/src/librbd/ImageState.cc
new file mode 100644
index 0000000..1bf628e
--- /dev/null
+++ b/src/librbd/ImageState.cc
@@ -0,0 +1,389 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/ImageState.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "common/Cond.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Utils.h"
+#include "librbd/image/CloseRequest.h"
+#include "librbd/image/OpenRequest.h"
+#include "librbd/image/RefreshRequest.h"
+#include "librbd/image/SetSnapRequest.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::ImageState: "
+
+namespace librbd {
+
+using util::create_context_callback;
+
+template <typename I>
+ImageState<I>::ImageState(I *image_ctx)
+  : m_image_ctx(image_ctx), m_state(STATE_UNINITIALIZED),
+    m_lock(util::unique_lock_name("librbd::ImageState::m_lock", this)),
+    m_last_refresh(0), m_refresh_seq(0) {
+}
+
+template <typename I>
+ImageState<I>::~ImageState() {
+  assert(m_state == STATE_UNINITIALIZED || m_state == STATE_CLOSED);
+}
+
+template <typename I>
+int ImageState<I>::open() {
+  C_SaferCond ctx;
+  open(&ctx);
+  return ctx.wait();
+}
+
+template <typename I>
+void ImageState<I>::open(Context *on_finish) {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 20) << __func__ << dendl;
+
+  Mutex::Locker locker(m_lock);
+  assert(m_state == STATE_UNINITIALIZED);
+
+  Action action(ACTION_TYPE_OPEN);
+  action.refresh_seq = m_refresh_seq;
+  execute_action(action, on_finish);
+}
+
+template <typename I>
+int ImageState<I>::close() {
+  C_SaferCond ctx;
+  close(&ctx);
+
+  int r = ctx.wait();
+  delete m_image_ctx;
+  return r;
+}
+
+template <typename I>
+void ImageState<I>::close(Context *on_finish) {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 20) << __func__ << dendl;
+
+  Mutex::Locker locker(m_lock);
+  assert(!is_closed());
+
+  Action action(ACTION_TYPE_CLOSE);
+  action.refresh_seq = m_refresh_seq;
+  execute_action(action, on_finish);
+}
+
+template <typename I>
+void ImageState<I>::handle_update_notification() {
+  Mutex::Locker locker(m_lock);
+  ++m_refresh_seq;
+
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 20) << "refresh_seq = " << m_refresh_seq << ", "
+		 << "last_refresh = " << m_last_refresh << dendl;
+}
+
+template <typename I>
+bool ImageState<I>::is_refresh_required() const {
+  Mutex::Locker locker(m_lock);
+  return (m_last_refresh != m_refresh_seq);
+}
+
+template <typename I>
+int ImageState<I>::refresh() {
+  C_SaferCond refresh_ctx;
+  refresh(&refresh_ctx);
+  return refresh_ctx.wait();
+}
+
+template <typename I>
+void ImageState<I>::refresh(Context *on_finish) {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 20) << __func__ << dendl;
+
+  m_lock.Lock();
+  if (is_closed()) {
+    m_lock.Unlock();
+    on_finish->complete(0);
+    return;
+  }
+
+  Action action(ACTION_TYPE_REFRESH);
+  action.refresh_seq = m_refresh_seq;
+  execute_action(action, on_finish);
+  m_lock.Unlock();
+}
+
+template <typename I>
+int ImageState<I>::refresh_if_required() {
+  RWLock::RLocker owner_locker(m_image_ctx->owner_lock);
+  return refresh_if_required(m_image_ctx->owner_lock);
+}
+
+template <typename I>
+int ImageState<I>::refresh_if_required(const RWLock &) {
+  assert(m_image_ctx->owner_lock.is_locked());
+
+  C_SaferCond ctx;
+  {
+    Mutex::Locker locker(m_lock);
+    if (m_last_refresh == m_refresh_seq || is_closed()) {
+      return 0;
+    }
+
+    Action action(ACTION_TYPE_REFRESH);
+    action.refresh_seq = m_refresh_seq;
+    execute_action(action, &ctx);
+  }
+
+  return ctx.wait();
+}
+
+template <typename I>
+void ImageState<I>::snap_set(const std::string &snap_name, Context *on_finish) {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 20) << __func__ << ": snap_name=" << snap_name << dendl;
+
+  Mutex::Locker locker(m_lock);
+  Action action(ACTION_TYPE_SET_SNAP);
+  action.snap_name = snap_name;
+  execute_action(action, on_finish);
+}
+
+template <typename I>
+bool ImageState<I>::is_transition_state() const {
+  switch (m_state) {
+  case STATE_UNINITIALIZED:
+  case STATE_OPEN:
+  case STATE_CLOSED:
+    return false;
+  case STATE_OPENING:
+  case STATE_CLOSING:
+  case STATE_REFRESHING:
+  case STATE_SETTING_SNAP:
+    break;
+  }
+  return true;
+}
+
+template <typename I>
+bool ImageState<I>::is_closed() const {
+  assert(m_lock.is_locked());
+
+  return ((m_state == STATE_CLOSED) ||
+          (!m_actions_contexts.empty() &&
+           m_actions_contexts.back().first.action_type == ACTION_TYPE_CLOSE));
+}
+
+template <typename I>
+void ImageState<I>::append_context(const Action &action, Context *context) {
+  assert(m_lock.is_locked());
+
+  ActionContexts *action_contexts = nullptr;
+  for (auto &action_ctxs : m_actions_contexts) {
+    if (action == action_ctxs.first) {
+      action_contexts = &action_ctxs;
+      break;
+    }
+  }
+
+  if (action_contexts == nullptr) {
+    m_actions_contexts.push_back({action, {}});
+    action_contexts = &m_actions_contexts.back();
+  }
+
+  if (context != nullptr) {
+    action_contexts->second.push_back(context);
+  }
+}
+
+template <typename I>
+void ImageState<I>::execute_next_action() {
+  assert(m_lock.is_locked());
+  assert(!m_actions_contexts.empty());
+  switch (m_actions_contexts.front().first.action_type) {
+  case ACTION_TYPE_OPEN:
+    send_open();
+    return;
+  case ACTION_TYPE_CLOSE:
+    send_close();
+    return;
+  case ACTION_TYPE_REFRESH:
+    send_refresh();
+    return;
+  case ACTION_TYPE_SET_SNAP:
+    send_set_snap();
+    return;
+  }
+  assert(false);
+}
+
+template <typename I>
+void ImageState<I>::execute_action(const Action &action, Context *on_finish) {
+  assert(m_lock.is_locked());
+
+  append_context(action, on_finish);
+  if (!is_transition_state()) {
+    execute_next_action();
+  }
+}
+
+template <typename I>
+void ImageState<I>::complete_action(State next_state, int r) {
+  assert(m_lock.is_locked());
+  assert(!m_actions_contexts.empty());
+
+  ActionContexts action_contexts(std::move(m_actions_contexts.front()));
+  m_actions_contexts.pop_front();
+  m_state = next_state;
+
+  m_lock.Unlock();
+  for (auto ctx : action_contexts.second) {
+    ctx->complete(r);
+  }
+  m_lock.Lock();
+
+  if (!is_transition_state() && !m_actions_contexts.empty()) {
+    execute_next_action();
+  }
+}
+
+template <typename I>
+void ImageState<I>::send_open() {
+  assert(m_lock.is_locked());
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 10) << this << " " << __func__ << dendl;
+
+  m_state = STATE_OPENING;
+
+  Context *ctx = create_context_callback<
+    ImageState<I>, &ImageState<I>::handle_open>(this);
+  image::OpenRequest<I> *req = image::OpenRequest<I>::create(
+    m_image_ctx, ctx);
+
+  m_lock.Unlock();
+  req->send();
+  m_lock.Lock();
+}
+
+template <typename I>
+void ImageState<I>::handle_open(int r) {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 10) << this << " " << __func__ << ": r=" << r << dendl;
+
+  if (r < 0 && r != -ENOENT) {
+    lderr(cct) << "failed to open image: " << cpp_strerror(r) << dendl;
+  }
+
+  Mutex::Locker locker(m_lock);
+  complete_action(r < 0 ? STATE_UNINITIALIZED : STATE_OPEN, r);
+}
+
+template <typename I>
+void ImageState<I>::send_close() {
+  assert(m_lock.is_locked());
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 10) << this << " " << __func__ << dendl;
+
+  m_state = STATE_CLOSING;
+
+  Context *ctx = create_context_callback<
+    ImageState<I>, &ImageState<I>::handle_close>(this);
+  image::CloseRequest<I> *req = image::CloseRequest<I>::create(
+    m_image_ctx, ctx);
+
+  m_lock.Unlock();
+  req->send();
+  m_lock.Lock();
+}
+
+template <typename I>
+void ImageState<I>::handle_close(int r) {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 10) << this << " " << __func__ << ": r=" << r << dendl;
+
+  if (r < 0) {
+    lderr(cct) << "error occurred while closing image: " << cpp_strerror(r)
+               << dendl;
+  }
+
+  Mutex::Locker locker(m_lock);
+  complete_action(STATE_CLOSED, r);
+}
+
+template <typename I>
+void ImageState<I>::send_refresh() {
+  assert(m_lock.is_locked());
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 10) << this << " " << __func__ << dendl;
+
+  m_state = STATE_REFRESHING;
+
+  Context *ctx = create_context_callback<
+    ImageState<I>, &ImageState<I>::handle_refresh>(this);
+  image::RefreshRequest<I> *req = image::RefreshRequest<I>::create(
+    *m_image_ctx, ctx);
+
+  m_lock.Unlock();
+  req->send();
+  m_lock.Lock();
+}
+
+template <typename I>
+void ImageState<I>::handle_refresh(int r) {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 10) << this << " " << __func__ << ": r=" << r << dendl;
+
+  Mutex::Locker locker(m_lock);
+  assert(!m_actions_contexts.empty());
+
+  ActionContexts &action_contexts(m_actions_contexts.front());
+  assert(action_contexts.first.action_type == ACTION_TYPE_REFRESH);
+  assert(m_last_refresh <= action_contexts.first.refresh_seq);
+  m_last_refresh = action_contexts.first.refresh_seq;
+
+  complete_action(STATE_OPEN, r);
+}
+
+template <typename I>
+void ImageState<I>::send_set_snap() {
+  assert(m_lock.is_locked());
+
+  m_state = STATE_SETTING_SNAP;
+
+  assert(!m_actions_contexts.empty());
+  ActionContexts &action_contexts(m_actions_contexts.front());
+  assert(action_contexts.first.action_type == ACTION_TYPE_SET_SNAP);
+
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 10) << this << " " << __func__ << ": "
+                 << "snap_name=" << action_contexts.first.snap_name << dendl;
+
+  Context *ctx = create_context_callback<
+    ImageState<I>, &ImageState<I>::handle_set_snap>(this);
+  image::SetSnapRequest<I> *req = image::SetSnapRequest<I>::create(
+    *m_image_ctx, action_contexts.first.snap_name, ctx);
+
+  m_lock.Unlock();
+  req->send();
+  m_lock.Lock();
+}
+
+template <typename I>
+void ImageState<I>::handle_set_snap(int r) {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 10) << this << " " << __func__ << " r=" << r << dendl;
+
+  if (r < 0 && r != -ENOENT) {
+    lderr(cct) << "failed to set snapshot: " << cpp_strerror(r) << dendl;
+  }
+
+  Mutex::Locker locker(m_lock);
+  complete_action(STATE_OPEN, r);
+}
+
+} // namespace librbd
+
+template class librbd::ImageState<librbd::ImageCtx>;
diff --git a/src/librbd/ImageState.h b/src/librbd/ImageState.h
new file mode 100644
index 0000000..f6a170b
--- /dev/null
+++ b/src/librbd/ImageState.h
@@ -0,0 +1,122 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IMAGE_STATE_H
+#define CEPH_LIBRBD_IMAGE_STATE_H
+
+#include "include/int_types.h"
+#include "common/Mutex.h"
+#include <list>
+#include <string>
+#include <utility>
+
+class Context;
+class RWLock;
+
+namespace librbd {
+
+class ImageCtx;
+
+template <typename ImageCtxT = ImageCtx>
+class ImageState {
+public:
+  ImageState(ImageCtxT *image_ctx);
+  ~ImageState();
+
+  int open();
+  void open(Context *on_finish);
+
+  int close();
+  void close(Context *on_finish);
+
+  void handle_update_notification();
+
+  bool is_refresh_required() const;
+
+  int refresh();
+  void refresh(Context *on_finish);
+  int refresh_if_required();
+  int refresh_if_required(const RWLock &owner_lock);
+
+  void snap_set(const std::string &snap_name, Context *on_finish);
+
+private:
+  enum State {
+    STATE_UNINITIALIZED,
+    STATE_OPEN,
+    STATE_CLOSED,
+    STATE_OPENING,
+    STATE_CLOSING,
+    STATE_REFRESHING,
+    STATE_SETTING_SNAP
+  };
+
+  enum ActionType {
+    ACTION_TYPE_OPEN,
+    ACTION_TYPE_CLOSE,
+    ACTION_TYPE_REFRESH,
+    ACTION_TYPE_SET_SNAP
+  };
+
+  struct Action {
+    ActionType action_type;
+    uint64_t refresh_seq;
+    std::string snap_name;
+
+    Action(ActionType action_type) : action_type(action_type), refresh_seq(0) {
+    }
+    inline bool operator==(const Action &action) const {
+      if (action_type != action.action_type) {
+        return false;
+      }
+      switch (action_type) {
+      case ACTION_TYPE_REFRESH:
+        return refresh_seq == action.refresh_seq;
+      case ACTION_TYPE_SET_SNAP:
+        return snap_name == action.snap_name;
+      default:
+        return true;
+      }
+    }
+  };
+
+  typedef std::list<Context *> Contexts;
+  typedef std::pair<Action, Contexts> ActionContexts;
+  typedef std::list<ActionContexts> ActionsContexts;
+
+  ImageCtxT *m_image_ctx;
+  State m_state;
+
+  mutable Mutex m_lock;
+  ActionsContexts m_actions_contexts;
+
+  uint64_t m_last_refresh;
+  uint64_t m_refresh_seq;
+
+  bool is_transition_state() const;
+  bool is_closed() const;
+
+  void append_context(const Action &action, Context *context);
+  void execute_next_action();
+  void execute_action(const Action &action, Context *context);
+  void complete_action(State next_state, int r);
+
+  void send_open();
+  void handle_open(int r);
+
+  void send_close();
+  void handle_close(int r);
+
+  void send_refresh();
+  void handle_refresh(int r);
+
+  void send_set_snap();
+  void handle_set_snap(int r);
+
+};
+
+} // namespace librbd
+
+extern template class librbd::ImageState<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_IMAGE_STATE_H
diff --git a/src/librbd/ImageWatcher.cc b/src/librbd/ImageWatcher.cc
index 014ce94..ea3dbe0 100644
--- a/src/librbd/ImageWatcher.cc
+++ b/src/librbd/ImageWatcher.cc
@@ -2,12 +2,13 @@
 // vim: ts=8 sw=2 smarttab
 #include "librbd/ImageWatcher.h"
 #include "librbd/AioCompletion.h"
+#include "librbd/ExclusiveLock.h"
 #include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
 #include "librbd/internal.h"
 #include "librbd/ObjectMap.h"
 #include "librbd/TaskFinisher.h"
-#include "cls/lock/cls_lock_client.h"
-#include "cls/lock/cls_lock_types.h"
+#include "librbd/Utils.h"
 #include "include/encoding.h"
 #include "include/stringify.h"
 #include "common/errno.h"
@@ -22,25 +23,19 @@
 
 namespace librbd {
 
-using namespace WatchNotify;
-
-static const std::string WATCHER_LOCK_TAG = "internal";
-static const std::string WATCHER_LOCK_COOKIE_PREFIX = "auto";
+using namespace watch_notify;
 
 static const uint64_t	NOTIFY_TIMEOUT = 5000;
 static const double	RETRY_DELAY_SECONDS = 1.0;
 
 ImageWatcher::ImageWatcher(ImageCtx &image_ctx)
   : m_image_ctx(image_ctx),
-    m_watch_lock(unique_lock_name("librbd::ImageWatcher::m_watch_lock", this)),
+    m_watch_lock(util::unique_lock_name("librbd::ImageWatcher::m_watch_lock", this)),
     m_watch_ctx(*this), m_watch_handle(0),
-    m_watch_state(WATCH_STATE_UNREGISTERED), m_lock_supported(false),
-    m_lock_owner_state(LOCK_OWNER_STATE_NOT_LOCKED),
-    m_listeners_lock(unique_lock_name("librbd::ImageWatcher::m_listeners_lock", this)),
-    m_listeners_in_use(false),
+    m_watch_state(WATCH_STATE_UNREGISTERED),
     m_task_finisher(new TaskFinisher<Task>(*m_image_ctx.cct)),
-    m_async_request_lock(unique_lock_name("librbd::ImageWatcher::m_async_request_lock", this)),
-    m_owner_client_id_lock(unique_lock_name("librbd::ImageWatcher::m_owner_client_id_lock", this))
+    m_async_request_lock(util::unique_lock_name("librbd::ImageWatcher::m_async_request_lock", this)),
+    m_owner_client_id_lock(util::unique_lock_name("librbd::ImageWatcher::m_owner_client_id_lock", this))
 {
 }
 
@@ -51,42 +46,6 @@ ImageWatcher::~ImageWatcher()
     RWLock::RLocker l(m_watch_lock);
     assert(m_watch_state != WATCH_STATE_REGISTERED);
   }
-  {
-    RWLock::RLocker l(m_image_ctx.owner_lock);
-    assert(m_lock_owner_state == LOCK_OWNER_STATE_NOT_LOCKED);
-  }
-}
-
-bool ImageWatcher::is_lock_supported() const {
-  RWLock::RLocker l(m_image_ctx.snap_lock);
-  return is_lock_supported(m_image_ctx.snap_lock);
-}
-
-bool ImageWatcher::is_lock_supported(const RWLock &) const {
-  assert(m_image_ctx.owner_lock.is_locked());
-  assert(m_image_ctx.snap_lock.is_locked());
-  return ((m_image_ctx.features & RBD_FEATURE_EXCLUSIVE_LOCK) != 0 &&
-	  !m_image_ctx.read_only && m_image_ctx.snap_id == CEPH_NOSNAP);
-}
-
-bool ImageWatcher::is_lock_owner() const {
-  assert(m_image_ctx.owner_lock.is_locked());
-  return (m_lock_owner_state == LOCK_OWNER_STATE_LOCKED ||
-          m_lock_owner_state == LOCK_OWNER_STATE_RELEASING);
-}
-
-void ImageWatcher::register_listener(Listener *listener) {
-  Mutex::Locker listeners_locker(m_listeners_lock);
-  m_listeners.push_back(listener);
-}
-
-void ImageWatcher::unregister_listener(Listener *listener) {
-  // TODO CoW listener list
-  Mutex::Locker listeners_locker(m_listeners_lock);
-  while (m_listeners_in_use) {
-    m_listeners_cond.Wait(m_listeners_lock);
-  }
-  m_listeners.remove(listener);
 }
 
 int ImageWatcher::register_watch() {
@@ -125,325 +84,6 @@ int ImageWatcher::unregister_watch() {
   return r;
 }
 
-int ImageWatcher::refresh() {
-  assert(m_image_ctx.owner_lock.is_locked());
-
-  bool lock_support_changed = false;
-  {
-    RWLock::WLocker watch_locker(m_watch_lock);
-    if (m_lock_supported != is_lock_supported()) {
-      m_lock_supported = is_lock_supported();
-      lock_support_changed = true;
-    }
-  }
-
-  int r = 0;
-  if (lock_support_changed) {
-    if (is_lock_supported()) {
-      // image opened, exclusive lock dynamically enabled, or now HEAD
-      notify_listeners_updated_lock(LOCK_UPDATE_STATE_RELEASING);
-      notify_listeners_updated_lock(LOCK_UPDATE_STATE_UNLOCKED);
-    } else if (!is_lock_supported()) {
-      if (is_lock_owner()) {
-        // exclusive lock dynamically disabled or now snapshot
-        m_image_ctx.owner_lock.put_read();
-        {
-          RWLock::WLocker owner_locker(m_image_ctx.owner_lock);
-          r = release_lock();
-        }
-        m_image_ctx.owner_lock.get_read();
-      }
-      notify_listeners_updated_lock(LOCK_UPDATE_STATE_NOT_SUPPORTED);
-    }
-  }
-  return r;
-}
-
-int ImageWatcher::try_lock() {
-  assert(m_image_ctx.owner_lock.is_wlocked());
-  assert(m_lock_owner_state == LOCK_OWNER_STATE_NOT_LOCKED);
-  assert(is_lock_supported());
-
-  while (true) {
-    int r = lock();
-    if (r != -EBUSY) {
-      return r;
-    }
-
-    // determine if the current lock holder is still alive
-    entity_name_t locker;
-    std::string locker_cookie;
-    std::string locker_address;
-    uint64_t locker_handle;
-    r = get_lock_owner_info(&locker, &locker_cookie, &locker_address,
-			    &locker_handle);
-    if (r < 0) {
-      return r;
-    }
-    if (locker_cookie.empty() || locker_address.empty()) {
-      // lock is now unlocked ... try again
-      continue;
-    }
-
-    std::list<obj_watch_t> watchers;
-    r = m_image_ctx.md_ctx.list_watchers(m_image_ctx.header_oid, &watchers);
-    if (r < 0) {
-      return r;
-    }
-
-    for (std::list<obj_watch_t>::iterator iter = watchers.begin();
-	 iter != watchers.end(); ++iter) {
-      if ((strncmp(locker_address.c_str(),
-                   iter->addr, sizeof(iter->addr)) == 0) &&
-	  (locker_handle == iter->cookie)) {
-	Mutex::Locker l(m_owner_client_id_lock);
-        set_owner_client_id(ClientId(iter->watcher_id, locker_handle));
-	return 0;
-      }
-    }
-
-    if (m_image_ctx.blacklist_on_break_lock) {
-      ldout(m_image_ctx.cct, 1) << this << " blacklisting client: " << locker
-                                << "@" << locker_address << dendl;
-      librados::Rados rados(m_image_ctx.md_ctx);
-      r = rados.blacklist_add(locker_address,
-			      m_image_ctx.blacklist_expire_seconds);
-      if (r < 0) {
-        lderr(m_image_ctx.cct) << this << " unable to blacklist client: "
-			       << cpp_strerror(r) << dendl;
-        return r;
-      }
-    }
-
-    ldout(m_image_ctx.cct, 5) << this << " breaking exclusive lock: " << locker
-                              << dendl;
-    r = rados::cls::lock::break_lock(&m_image_ctx.md_ctx,
-                                     m_image_ctx.header_oid, RBD_LOCK_NAME,
-                                     locker_cookie, locker);
-    if (r < 0 && r != -ENOENT) {
-      return r;
-    }
-  }
-  return 0;
-}
-
-void ImageWatcher::request_lock() {
-  schedule_request_lock(false);
-}
-
-bool ImageWatcher::try_request_lock() {
-  assert(m_image_ctx.owner_lock.is_locked());
-  if (is_lock_owner()) {
-    return true;
-  }
-
-  int r = 0;
-  m_image_ctx.owner_lock.put_read();
-  {
-    RWLock::WLocker l(m_image_ctx.owner_lock);
-    if (!is_lock_owner()) {
-      r = try_lock();
-    }
-  }
-  m_image_ctx.owner_lock.get_read();
-
-  if (r < 0) {
-    ldout(m_image_ctx.cct, 5) << this << " failed to acquire exclusive lock:"
-			      << cpp_strerror(r) << dendl;
-    return false;
-  }
-
-  if (is_lock_owner()) {
-    ldout(m_image_ctx.cct, 15) << this << " successfully acquired exclusive lock"
-			       << dendl;
-  } else {
-    ldout(m_image_ctx.cct, 15) << this
-                               << " unable to acquire exclusive lock, retrying"
-                               << dendl;
-  }
-  return is_lock_owner();
-}
-
-int ImageWatcher::get_lock_owner_info(entity_name_t *locker, std::string *cookie,
-				      std::string *address, uint64_t *handle) {
-  std::map<rados::cls::lock::locker_id_t,
-	   rados::cls::lock::locker_info_t> lockers;
-  ClsLockType lock_type;
-  std::string lock_tag;
-  int r = rados::cls::lock::get_lock_info(&m_image_ctx.md_ctx,
-					  m_image_ctx.header_oid,
-					  RBD_LOCK_NAME, &lockers, &lock_type,
-					  &lock_tag);
-  if (r < 0) {
-    return r;
-  }
-
-  if (lockers.empty()) {
-    ldout(m_image_ctx.cct, 20) << this << " no lockers detected" << dendl;
-    return 0;
-  }
-
-  if (lock_tag != WATCHER_LOCK_TAG) {
-    ldout(m_image_ctx.cct, 5) << this << " locked by external mechanism: tag="
-			      << lock_tag << dendl;
-    return -EBUSY;
-  }
-
-  if (lock_type == LOCK_SHARED) {
-    ldout(m_image_ctx.cct, 5) << this << " shared lock type detected" << dendl;
-    return -EBUSY;
-  }
-
-  std::map<rados::cls::lock::locker_id_t,
-           rados::cls::lock::locker_info_t>::iterator iter = lockers.begin();
-  if (!decode_lock_cookie(iter->first.cookie, handle)) {
-    ldout(m_image_ctx.cct, 5) << this << " locked by external mechanism: "
-                              << "cookie=" << iter->first.cookie << dendl;
-    return -EBUSY;
-  }
-
-  *locker = iter->first.locker;
-  *cookie = iter->first.cookie;
-  *address = stringify(iter->second.addr);
-  ldout(m_image_ctx.cct, 10) << this << " retrieved exclusive locker: "
-                             << *locker << "@" << *address << dendl;
-  return 0;
-}
-
-int ImageWatcher::lock() {
-  assert(m_image_ctx.owner_lock.is_wlocked());
-  assert(m_lock_owner_state == LOCK_OWNER_STATE_NOT_LOCKED);
-
-  int r = rados::cls::lock::lock(&m_image_ctx.md_ctx, m_image_ctx.header_oid,
-				 RBD_LOCK_NAME, LOCK_EXCLUSIVE,
-				 encode_lock_cookie(), WATCHER_LOCK_TAG, "",
-				 utime_t(), 0);
-  if (r < 0) {
-    return r;
-  }
-
-  ldout(m_image_ctx.cct, 10) << this << " acquired exclusive lock" << dendl;
-  m_lock_owner_state = LOCK_OWNER_STATE_LOCKED;
-
-  ClientId owner_client_id = get_client_id();
-  {
-    Mutex::Locker l(m_owner_client_id_lock);
-    set_owner_client_id(owner_client_id);
-  }
-
-  if (m_image_ctx.object_map.enabled()) {
-    r = m_image_ctx.object_map.lock();
-    if (r < 0 && r != -ENOENT) {
-      unlock();
-      return r;
-    }
-    RWLock::WLocker l2(m_image_ctx.snap_lock);
-    m_image_ctx.object_map.refresh(CEPH_NOSNAP);
-  }
-
-  // send the notification when we aren't holding locks
-  FunctionContext *ctx = new FunctionContext(
-    boost::bind(&ImageWatcher::notify_acquired_lock, this));
-  m_task_finisher->queue(TASK_CODE_ACQUIRED_LOCK, ctx);
-  return 0;
-}
-
-int ImageWatcher::unlock()
-{
-  assert(m_image_ctx.owner_lock.is_wlocked());
-
-  ldout(m_image_ctx.cct, 10) << this << " releasing exclusive lock" << dendl;
-  m_lock_owner_state = LOCK_OWNER_STATE_NOT_LOCKED;
-  int r = rados::cls::lock::unlock(&m_image_ctx.md_ctx, m_image_ctx.header_oid,
-				   RBD_LOCK_NAME, encode_lock_cookie());
-  if (r < 0 && r != -ENOENT) {
-    lderr(m_image_ctx.cct) << this << " failed to release exclusive lock: "
-			   << cpp_strerror(r) << dendl;
-    return r;
-  }
-
-  if (m_image_ctx.object_map.enabled()) {
-    m_image_ctx.object_map.unlock();
-  }
-
-  {
-    Mutex::Locker l(m_owner_client_id_lock);
-    set_owner_client_id(ClientId());
-  }
-
-  FunctionContext *ctx = new FunctionContext(
-    boost::bind(&ImageWatcher::notify_released_lock, this));
-  m_task_finisher->queue(TASK_CODE_RELEASED_LOCK, ctx);
-  return 0;
-}
-
-int ImageWatcher::release_lock()
-{
-  assert(m_image_ctx.owner_lock.is_wlocked());
-
-  CephContext *cct = m_image_ctx.cct;
-  ldout(cct, 10) << this << " releasing exclusive lock by request" << dendl;
-  if (m_lock_owner_state != LOCK_OWNER_STATE_LOCKED) {
-    return 0;
-  }
-
-  m_lock_owner_state = LOCK_OWNER_STATE_RELEASING;
-  m_image_ctx.owner_lock.put_write();
-
-  // ensure all maint operations are canceled
-  m_image_ctx.cancel_async_requests();
-  m_image_ctx.flush_async_operations();
-
-  int r;
-  {
-    RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
-
-    // alert listeners that all incoming IO needs to be stopped since the
-    // lock is being released
-    notify_listeners_updated_lock(LOCK_UPDATE_STATE_RELEASING);
-
-    RWLock::WLocker md_locker(m_image_ctx.md_lock);
-    r = m_image_ctx.flush();
-    if (r < 0) {
-      lderr(cct) << this << " failed to flush: " << cpp_strerror(r) << dendl;
-      goto err_cancel_unlock;
-    }
-  }
-
-  m_image_ctx.owner_lock.get_write();
-  assert(m_lock_owner_state == LOCK_OWNER_STATE_RELEASING);
-  r = unlock();
-
-  // notify listeners of the change w/ owner read locked
-  m_image_ctx.owner_lock.put_write();
-  {
-    RWLock::RLocker owner_lock(m_image_ctx.owner_lock);
-    if (m_lock_owner_state == LOCK_OWNER_STATE_NOT_LOCKED) {
-      notify_listeners_updated_lock(LOCK_UPDATE_STATE_UNLOCKED);
-    }
-  }
-  m_image_ctx.owner_lock.get_write();
-
-  if (r < 0) {
-    lderr(cct) << this << " failed to unlock: " << cpp_strerror(r) << dendl;
-    return r;
-  }
-
-  return 0;
-
-err_cancel_unlock:
-  m_image_ctx.owner_lock.get_write();
-  if (m_lock_owner_state == LOCK_OWNER_STATE_RELEASING) {
-    m_lock_owner_state = LOCK_OWNER_STATE_LOCKED;
-  }
-  return r;
-}
-
-void ImageWatcher::assert_header_locked(librados::ObjectWriteOperation *op) {
-  rados::cls::lock::assert_locked(op, RBD_LOCK_NAME, LOCK_EXCLUSIVE,
-                                  encode_lock_cookie(), WATCHER_LOCK_TAG);
-}
-
 void ImageWatcher::schedule_async_progress(const AsyncRequestId &request,
 					   uint64_t offset, uint64_t total) {
   FunctionContext *ctx = new FunctionContext(
@@ -501,7 +141,8 @@ int ImageWatcher::notify_async_complete(const AsyncRequestId &request,
 
 int ImageWatcher::notify_flatten(uint64_t request_id, ProgressContext &prog_ctx) {
   assert(m_image_ctx.owner_lock.is_locked());
-  assert(!is_lock_owner());
+  assert(m_image_ctx.exclusive_lock &&
+         !m_image_ctx.exclusive_lock->is_lock_owner());
 
   AsyncRequestId async_request_id(get_client_id(), request_id);
 
@@ -514,7 +155,8 @@ int ImageWatcher::notify_flatten(uint64_t request_id, ProgressContext &prog_ctx)
 int ImageWatcher::notify_resize(uint64_t request_id, uint64_t size,
 				ProgressContext &prog_ctx) {
   assert(m_image_ctx.owner_lock.is_locked());
-  assert(!is_lock_owner());
+  assert(m_image_ctx.exclusive_lock &&
+         !m_image_ctx.exclusive_lock->is_lock_owner());
 
   AsyncRequestId async_request_id(get_client_id(), request_id);
 
@@ -526,7 +168,8 @@ int ImageWatcher::notify_resize(uint64_t request_id, uint64_t size,
 
 int ImageWatcher::notify_snap_create(const std::string &snap_name) {
   assert(m_image_ctx.owner_lock.is_locked());
-  assert(!is_lock_owner());
+  assert(m_image_ctx.exclusive_lock &&
+         !m_image_ctx.exclusive_lock->is_lock_owner());
 
   bufferlist bl;
   ::encode(NotifyMessage(SnapCreatePayload(snap_name)), bl);
@@ -537,7 +180,8 @@ int ImageWatcher::notify_snap_create(const std::string &snap_name) {
 int ImageWatcher::notify_snap_rename(const snapid_t &src_snap_id,
 				     const std::string &dst_snap_name) {
   assert(m_image_ctx.owner_lock.is_locked());
-  assert(!is_lock_owner());
+  assert(m_image_ctx.exclusive_lock &&
+         !m_image_ctx.exclusive_lock->is_lock_owner());
 
   bufferlist bl;
   ::encode(NotifyMessage(SnapRenamePayload(src_snap_id, dst_snap_name)), bl);
@@ -546,7 +190,8 @@ int ImageWatcher::notify_snap_rename(const snapid_t &src_snap_id,
 }
 int ImageWatcher::notify_snap_remove(const std::string &snap_name) {
   assert(m_image_ctx.owner_lock.is_locked());
-  assert(!is_lock_owner());
+  assert(m_image_ctx.exclusive_lock &&
+         !m_image_ctx.exclusive_lock->is_lock_owner());
 
   bufferlist bl;
   ::encode(NotifyMessage(SnapRemovePayload(snap_name)), bl);
@@ -554,10 +199,31 @@ int ImageWatcher::notify_snap_remove(const std::string &snap_name) {
   return notify_lock_owner(bl);
 }
 
+int ImageWatcher::notify_snap_protect(const std::string &snap_name) {
+  assert(m_image_ctx.owner_lock.is_locked());
+  assert(m_image_ctx.exclusive_lock &&
+         !m_image_ctx.exclusive_lock->is_lock_owner());
+
+  bufferlist bl;
+  ::encode(NotifyMessage(SnapProtectPayload(snap_name)), bl);
+  return notify_lock_owner(bl);
+}
+
+int ImageWatcher::notify_snap_unprotect(const std::string &snap_name) {
+  assert(m_image_ctx.owner_lock.is_locked());
+  assert(m_image_ctx.exclusive_lock &&
+         !m_image_ctx.exclusive_lock->is_lock_owner());
+
+  bufferlist bl;
+  ::encode(NotifyMessage(SnapUnprotectPayload(snap_name)), bl);
+  return notify_lock_owner(bl);
+}
+
 int ImageWatcher::notify_rebuild_object_map(uint64_t request_id,
                                             ProgressContext &prog_ctx) {
   assert(m_image_ctx.owner_lock.is_locked());
-  assert(!is_lock_owner());
+  assert(m_image_ctx.exclusive_lock &&
+         !m_image_ctx.exclusive_lock->is_lock_owner());
 
   AsyncRequestId async_request_id(get_client_id(), request_id);
 
@@ -567,19 +233,14 @@ int ImageWatcher::notify_rebuild_object_map(uint64_t request_id,
   return notify_async_request(async_request_id, bl, prog_ctx);
 }
 
-void ImageWatcher::notify_lock_state() {
-  RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
-  if (m_lock_owner_state == LOCK_OWNER_STATE_LOCKED) {
-    // re-send the acquired lock notification so that peers know they can now
-    // request the lock
-    ldout(m_image_ctx.cct, 10) << this << " notify lock state" << dendl;
-
-    bufferlist bl;
-    ::encode(NotifyMessage(AcquiredLockPayload(get_client_id())), bl);
+int ImageWatcher::notify_rename(const std::string &image_name) {
+  assert(m_image_ctx.owner_lock.is_locked());
+  assert(m_image_ctx.exclusive_lock &&
+         !m_image_ctx.exclusive_lock->is_lock_owner());
 
-    m_image_ctx.md_ctx.notify2(m_image_ctx.header_oid, bl, NOTIFY_TIMEOUT,
-                               NULL);
-  }
+  bufferlist bl;
+  ::encode(NotifyMessage(RenamePayload(image_name)), bl);
+  return notify_lock_owner(bl);
 }
 
 void ImageWatcher::notify_header_update(librados::IoCtx &io_ctx,
@@ -592,23 +253,6 @@ void ImageWatcher::notify_header_update(librados::IoCtx &io_ctx,
   io_ctx.notify2(oid, bl, NOTIFY_TIMEOUT, NULL);
 }
 
-std::string ImageWatcher::encode_lock_cookie() const {
-  RWLock::RLocker l(m_watch_lock);
-  std::ostringstream ss;
-  ss << WATCHER_LOCK_COOKIE_PREFIX << " " << m_watch_handle;
-  return ss.str();
-}
-
-bool ImageWatcher::decode_lock_cookie(const std::string &tag,
-				      uint64_t *handle) {
-  std::string prefix;
-  std::istringstream ss(tag);
-  if (!(ss >> prefix >> *handle) || prefix != WATCHER_LOCK_COOKIE_PREFIX) {
-    return false;
-  }
-  return true;
-}
-
 void ImageWatcher::schedule_cancel_async_requests() {
   FunctionContext *ctx = new FunctionContext(
     boost::bind(&ImageWatcher::cancel_async_requests, this));
@@ -625,7 +269,7 @@ void ImageWatcher::cancel_async_requests() {
   m_async_requests.clear();
 }
 
-void ImageWatcher::set_owner_client_id(const WatchNotify::ClientId& client_id) {
+void ImageWatcher::set_owner_client_id(const ClientId& client_id) {
   assert(m_owner_client_id_lock.is_locked());
   m_owner_client_id = client_id;
   ldout(m_image_ctx.cct, 10) << this << " current lock owner: "
@@ -640,25 +284,25 @@ ClientId ImageWatcher::get_client_id() {
 void ImageWatcher::notify_acquired_lock() {
   ldout(m_image_ctx.cct, 10) << this << " notify acquired lock" << dendl;
 
-  RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
-  if (m_lock_owner_state != LOCK_OWNER_STATE_LOCKED) {
-    return;
+  ClientId client_id = get_client_id();
+  {
+    Mutex::Locker owner_client_id_locker(m_owner_client_id_lock);
+    set_owner_client_id(client_id);
   }
 
-  notify_listeners_updated_lock(LOCK_UPDATE_STATE_LOCKED);
-
   bufferlist bl;
-  ::encode(NotifyMessage(AcquiredLockPayload(get_client_id())), bl);
+  ::encode(NotifyMessage(AcquiredLockPayload(client_id)), bl);
   m_image_ctx.md_ctx.notify2(m_image_ctx.header_oid, bl, NOTIFY_TIMEOUT, NULL);
 }
 
-void ImageWatcher::notify_release_lock() {
-  RWLock::WLocker owner_locker(m_image_ctx.owner_lock);
-  release_lock();
-}
-
 void ImageWatcher::notify_released_lock() {
   ldout(m_image_ctx.cct, 10) << this << " notify released lock" << dendl;
+
+  {
+    Mutex::Locker owner_client_id_locker(m_owner_client_id_lock);
+    set_owner_client_id(ClientId());
+  }
+
   bufferlist bl;
   ::encode(NotifyMessage(ReleasedLockPayload(get_client_id())), bl);
   m_image_ctx.md_ctx.notify2(m_image_ctx.header_oid, bl, NOTIFY_TIMEOUT, NULL);
@@ -666,7 +310,13 @@ void ImageWatcher::notify_released_lock() {
 
 void ImageWatcher::schedule_request_lock(bool use_timer, int timer_delay) {
   assert(m_image_ctx.owner_lock.is_locked());
-  assert(m_lock_owner_state == LOCK_OWNER_STATE_NOT_LOCKED);
+
+  if (m_image_ctx.exclusive_lock == nullptr) {
+    // exclusive lock dynamically disabled via image refresh
+    return;
+  }
+  assert(m_image_ctx.exclusive_lock &&
+         !m_image_ctx.exclusive_lock->is_lock_owner());
 
   RWLock::RLocker watch_locker(m_watch_lock);
   if (m_watch_state == WATCH_STATE_REGISTERED) {
@@ -687,12 +337,8 @@ void ImageWatcher::schedule_request_lock(bool use_timer, int timer_delay) {
 }
 
 void ImageWatcher::notify_request_lock() {
-  ldout(m_image_ctx.cct, 10) << this << " notify request lock" << dendl;
-
   RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
-  if (try_request_lock()) {
-    return;
-  }
+  ldout(m_image_ctx.cct, 10) << this << " notify request lock" << dendl;
 
   bufferlist bl;
   ::encode(NotifyMessage(RequestLockPayload(get_client_id())), bl);
@@ -701,7 +347,9 @@ void ImageWatcher::notify_request_lock() {
   if (r == -ETIMEDOUT) {
     ldout(m_image_ctx.cct, 5) << this << " timed out requesting lock: retrying"
                               << dendl;
-    schedule_request_lock(false);
+
+    // treat this as a dead client -- so retest acquiring the lock
+    m_image_ctx.exclusive_lock->handle_lock_released();
   } else if (r < 0) {
     lderr(m_image_ctx.cct) << this << " error requesting lock: "
                            << cpp_strerror(r) << dendl;
@@ -725,6 +373,7 @@ int ImageWatcher::notify_lock_owner(bufferlist &bl) {
   int r = m_image_ctx.md_ctx.notify2(m_image_ctx.header_oid, bl, NOTIFY_TIMEOUT,
 				     &response_bl);
   m_image_ctx.owner_lock.get_read();
+
   if (r < 0 && r != -ETIMEDOUT) {
     lderr(m_image_ctx.cct) << this << " lock owner notification failed: "
 			   << cpp_strerror(r) << dendl;
@@ -845,31 +494,23 @@ int ImageWatcher::prepare_async_request(const AsyncRequestId& async_request_id,
   return 0;
 }
 
-void ImageWatcher::cleanup_async_request(const AsyncRequestId& async_request_id,
-                                         Context *ctx) {
-  delete ctx;
-
-  RWLock::WLocker l(m_async_request_lock);
-  m_async_pending.erase(async_request_id);
-}
-
-void ImageWatcher::handle_payload(const HeaderUpdatePayload &payload,
-				  bufferlist *out) {
+bool ImageWatcher::handle_payload(const HeaderUpdatePayload &payload,
+				  C_NotifyAck *ack_ctx) {
   ldout(m_image_ctx.cct, 10) << this << " image header updated" << dendl;
 
-  Mutex::Locker lictx(m_image_ctx.refresh_lock);
-  ++m_image_ctx.refresh_seq;
+  m_image_ctx.state->handle_update_notification();
   m_image_ctx.perfcounter->inc(l_librbd_notify);
+  return true;
 }
 
-void ImageWatcher::handle_payload(const AcquiredLockPayload &payload,
-                                  bufferlist *out) {
+bool ImageWatcher::handle_payload(const AcquiredLockPayload &payload,
+                                  C_NotifyAck *ack_ctx) {
   ldout(m_image_ctx.cct, 10) << this << " image exclusively locked announcement"
                              << dendl;
 
   bool cancel_async_requests = true;
   if (payload.client_id.is_valid()) {
-    Mutex::Locker l(m_owner_client_id_lock);
+    Mutex::Locker owner_client_id_locker(m_owner_client_id_lock);
     if (payload.client_id == m_owner_client_id) {
       cancel_async_requests = false;
     }
@@ -877,16 +518,16 @@ void ImageWatcher::handle_payload(const AcquiredLockPayload &payload,
   }
 
   RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
-  if (m_lock_owner_state == LOCK_OWNER_STATE_NOT_LOCKED) {
-    if (cancel_async_requests) {
-      schedule_cancel_async_requests();
-    }
-    notify_listeners_updated_lock(LOCK_UPDATE_STATE_NOTIFICATION);
+  if (cancel_async_requests &&
+      (m_image_ctx.exclusive_lock == nullptr ||
+       !m_image_ctx.exclusive_lock->is_lock_owner())) {
+    schedule_cancel_async_requests();
   }
+  return true;
 }
 
-void ImageWatcher::handle_payload(const ReleasedLockPayload &payload,
-                                  bufferlist *out) {
+bool ImageWatcher::handle_payload(const ReleasedLockPayload &payload,
+                                  C_NotifyAck *ack_ctx) {
   ldout(m_image_ctx.cct, 10) << this << " exclusive lock released" << dendl;
 
   bool cancel_async_requests = true;
@@ -903,57 +544,50 @@ void ImageWatcher::handle_payload(const ReleasedLockPayload &payload,
   }
 
   RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
-  if (m_lock_owner_state == LOCK_OWNER_STATE_NOT_LOCKED) {
-    if (cancel_async_requests) {
-      schedule_cancel_async_requests();
-    }
-    notify_listeners_updated_lock(LOCK_UPDATE_STATE_NOTIFICATION);
+  if (cancel_async_requests &&
+      (m_image_ctx.exclusive_lock == nullptr ||
+       !m_image_ctx.exclusive_lock->is_lock_owner())) {
+    schedule_cancel_async_requests();
   }
+
+  // alert the exclusive lock state machine that the lock is available
+  if (m_image_ctx.exclusive_lock != nullptr &&
+      !m_image_ctx.exclusive_lock->is_lock_owner()) {
+    m_task_finisher->cancel(TASK_CODE_REQUEST_LOCK);
+    m_image_ctx.exclusive_lock->handle_lock_released();
+  }
+  return true;
 }
 
-void ImageWatcher::handle_payload(const RequestLockPayload &payload,
-                                  bufferlist *out) {
+bool ImageWatcher::handle_payload(const RequestLockPayload &payload,
+                                  C_NotifyAck *ack_ctx) {
   ldout(m_image_ctx.cct, 10) << this << " exclusive lock requested" << dendl;
   if (payload.client_id == get_client_id()) {
-    return;
+    return true;
   }
 
   RWLock::RLocker l(m_image_ctx.owner_lock);
-  if (m_lock_owner_state == LOCK_OWNER_STATE_LOCKED) {
+  if (m_image_ctx.exclusive_lock != nullptr &&
+      m_image_ctx.exclusive_lock->accept_requests()) {
     // need to send something back so the client can detect a missing leader
-    ::encode(ResponseMessage(0), *out);
+    ::encode(ResponseMessage(0), ack_ctx->out);
 
     {
-      Mutex::Locker l(m_owner_client_id_lock);
+      Mutex::Locker owner_client_id_locker(m_owner_client_id_lock);
       if (!m_owner_client_id.is_valid()) {
-	return;
-      }
-    }
-
-    bool release_permitted = true;
-    {
-      Mutex::Locker listeners_locker(m_listeners_lock);
-      for (Listeners::iterator it = m_listeners.begin();
-           it != m_listeners.end(); ++it) {
-        if (!(*it)->handle_requested_lock()) {
-          release_permitted = false;
-          break;
-        }
+	return true;
       }
     }
 
-    if (release_permitted) {
-      ldout(m_image_ctx.cct, 10) << this << " queuing release of exclusive lock"
-                                 << dendl;
-      FunctionContext *ctx = new FunctionContext(
-        boost::bind(&ImageWatcher::notify_release_lock, this));
-      m_task_finisher->queue(TASK_CODE_RELEASING_LOCK, ctx);
-    }
+    ldout(m_image_ctx.cct, 10) << this << " queuing release of exclusive lock"
+                               << dendl;
+    m_image_ctx.exclusive_lock->release_lock(nullptr);
   }
+  return true;
 }
 
-void ImageWatcher::handle_payload(const AsyncProgressPayload &payload,
-                                  bufferlist *out) {
+bool ImageWatcher::handle_payload(const AsyncProgressPayload &payload,
+                                  C_NotifyAck *ack_ctx) {
   RWLock::RLocker l(m_async_request_lock);
   std::map<AsyncRequestId, AsyncRequest>::iterator req_it =
     m_async_requests.find(payload.async_request_id);
@@ -965,10 +599,11 @@ void ImageWatcher::handle_payload(const AsyncProgressPayload &payload,
     schedule_async_request_timed_out(payload.async_request_id);
     req_it->second.second->update_progress(payload.offset, payload.total);
   }
+  return true;
 }
 
-void ImageWatcher::handle_payload(const AsyncCompletePayload &payload,
-                                  bufferlist *out) {
+bool ImageWatcher::handle_payload(const AsyncCompletePayload &payload,
+                                  C_NotifyAck *ack_ctx) {
   RWLock::RLocker l(m_async_request_lock);
   std::map<AsyncRequestId, AsyncRequest>::iterator req_it =
     m_async_requests.find(payload.async_request_id);
@@ -978,13 +613,15 @@ void ImageWatcher::handle_payload(const AsyncCompletePayload &payload,
 			       << payload.result << dendl;
     req_it->second.first->complete(payload.result);
   }
+  return true;
 }
 
-void ImageWatcher::handle_payload(const FlattenPayload &payload,
-				  bufferlist *out) {
+bool ImageWatcher::handle_payload(const FlattenPayload &payload,
+				  C_NotifyAck *ack_ctx) {
 
   RWLock::RLocker l(m_image_ctx.owner_lock);
-  if (m_lock_owner_state == LOCK_OWNER_STATE_LOCKED) {
+  if (m_image_ctx.exclusive_lock != nullptr &&
+      m_image_ctx.exclusive_lock->accept_requests()) {
     bool new_request;
     Context *ctx;
     ProgressContext *prog_ctx;
@@ -993,22 +630,19 @@ void ImageWatcher::handle_payload(const FlattenPayload &payload,
     if (new_request) {
       ldout(m_image_ctx.cct, 10) << this << " remote flatten request: "
 				 << payload.async_request_id << dendl;
-      r = librbd::async_flatten(&m_image_ctx, ctx, *prog_ctx);
-      if (r < 0) {
-	lderr(m_image_ctx.cct) << this << " remove flatten request failed: "
-			       << cpp_strerror(r) << dendl;
-        cleanup_async_request(payload.async_request_id, ctx);
-      }
+      librbd::async_flatten(&m_image_ctx, ctx, *prog_ctx);
     }
 
-    ::encode(ResponseMessage(r), *out);
+    ::encode(ResponseMessage(r), ack_ctx->out);
   }
+  return true;
 }
 
-void ImageWatcher::handle_payload(const ResizePayload &payload,
-				  bufferlist *out) {
+bool ImageWatcher::handle_payload(const ResizePayload &payload,
+				  C_NotifyAck *ack_ctx) {
   RWLock::RLocker l(m_image_ctx.owner_lock);
-  if (m_lock_owner_state == LOCK_OWNER_STATE_LOCKED) {
+  if (m_image_ctx.exclusive_lock != nullptr &&
+      m_image_ctx.exclusive_lock->accept_requests()) {
     bool new_request;
     Context *ctx;
     ProgressContext *prog_ctx;
@@ -1018,62 +652,95 @@ void ImageWatcher::handle_payload(const ResizePayload &payload,
       ldout(m_image_ctx.cct, 10) << this << " remote resize request: "
 				 << payload.async_request_id << " "
 				 << payload.size << dendl;
-      r = librbd::async_resize(&m_image_ctx, ctx, payload.size, *prog_ctx);
-      if (r < 0) {
-	lderr(m_image_ctx.cct) << this << " remove resize request failed: "
-			       << cpp_strerror(r) << dendl;
-        cleanup_async_request(payload.async_request_id, ctx);
-      }
+      librbd::async_resize(&m_image_ctx, ctx, payload.size, *prog_ctx);
     }
 
-    ::encode(ResponseMessage(r), *out);
+    ::encode(ResponseMessage(r), ack_ctx->out);
   }
+  return true;
 }
 
-void ImageWatcher::handle_payload(const SnapCreatePayload &payload,
-				  bufferlist *out) {
+bool ImageWatcher::handle_payload(const SnapCreatePayload &payload,
+				  C_NotifyAck *ack_ctx) {
   RWLock::RLocker l(m_image_ctx.owner_lock);
-  if (m_lock_owner_state == LOCK_OWNER_STATE_LOCKED) {
+  if (m_image_ctx.exclusive_lock != nullptr &&
+      m_image_ctx.exclusive_lock->accept_requests()) {
     ldout(m_image_ctx.cct, 10) << this << " remote snap_create request: "
 			       << payload.snap_name << dendl;
-    int r = librbd::snap_create_helper(&m_image_ctx, NULL,
-                                       payload.snap_name.c_str());
 
-    ::encode(ResponseMessage(r), *out);
+    librbd::snap_create_helper(&m_image_ctx, new C_ResponseMessage(ack_ctx),
+                               payload.snap_name.c_str());
+    return false;
   }
+  return true;
 }
 
-void ImageWatcher::handle_payload(const SnapRenamePayload &payload,
-				  bufferlist *out) {
+bool ImageWatcher::handle_payload(const SnapRenamePayload &payload,
+				  C_NotifyAck *ack_ctx) {
   RWLock::RLocker l(m_image_ctx.owner_lock);
-  if (m_lock_owner_state == LOCK_OWNER_STATE_LOCKED) {
+  if (m_image_ctx.exclusive_lock != nullptr &&
+      m_image_ctx.exclusive_lock->accept_requests()) {
     ldout(m_image_ctx.cct, 10) << this << " remote snap_rename request: "
-			       << payload.src_snap_id << " to " 
-			       << payload.dst_snap_name << dendl;
-    int r = librbd::snap_rename_helper(&m_image_ctx, NULL,
-                                       payload.src_snap_id,
-                                       payload.dst_snap_name.c_str());
+			       << payload.snap_id << " to "
+			       << payload.snap_name << dendl;
 
-    ::encode(ResponseMessage(r), *out);
+    librbd::snap_rename_helper(&m_image_ctx, new C_ResponseMessage(ack_ctx),
+                               payload.snap_id, payload.snap_name.c_str());
+    return false;
   }
+  return true;
 }
-void ImageWatcher::handle_payload(const SnapRemovePayload &payload,
-				  bufferlist *out) {
+
+bool ImageWatcher::handle_payload(const SnapRemovePayload &payload,
+				  C_NotifyAck *ack_ctx) {
   RWLock::RLocker l(m_image_ctx.owner_lock);
-  if (m_lock_owner_state == LOCK_OWNER_STATE_LOCKED) {
+  if (m_image_ctx.exclusive_lock != nullptr &&
+      m_image_ctx.exclusive_lock->accept_requests()) {
     ldout(m_image_ctx.cct, 10) << this << " remote snap_remove request: "
 			       << payload.snap_name << dendl;
-    int r = librbd::snap_remove_helper(&m_image_ctx, NULL,
-                                       payload.snap_name.c_str());
 
-    ::encode(ResponseMessage(r), *out);
+    librbd::snap_remove_helper(&m_image_ctx, new C_ResponseMessage(ack_ctx),
+                               payload.snap_name.c_str());
+    return false;
+  }
+  return true;
+}
+
+bool ImageWatcher::handle_payload(const SnapProtectPayload& payload,
+                                  C_NotifyAck *ack_ctx) {
+  RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
+  if (m_image_ctx.exclusive_lock != nullptr &&
+      m_image_ctx.exclusive_lock->accept_requests()) {
+    ldout(m_image_ctx.cct, 10) << this << " remote snap_protect request: "
+                               << payload.snap_name << dendl;
+
+    librbd::snap_protect_helper(&m_image_ctx, new C_ResponseMessage(ack_ctx),
+                                payload.snap_name.c_str());
+    return false;
   }
+  return true;
+}
+
+bool ImageWatcher::handle_payload(const SnapUnprotectPayload& payload,
+                                  C_NotifyAck *ack_ctx) {
+  RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
+  if (m_image_ctx.exclusive_lock != nullptr &&
+      m_image_ctx.exclusive_lock->accept_requests()) {
+    ldout(m_image_ctx.cct, 10) << this << " remote snap_unprotect request: "
+                               << payload.snap_name << dendl;
+
+    librbd::snap_unprotect_helper(&m_image_ctx, new C_ResponseMessage(ack_ctx),
+                                  payload.snap_name.c_str());
+    return false;
+  }
+  return true;
 }
 
-void ImageWatcher::handle_payload(const RebuildObjectMapPayload& payload,
-                                  bufferlist *out) {
+bool ImageWatcher::handle_payload(const RebuildObjectMapPayload& payload,
+                                  C_NotifyAck *ack_ctx) {
   RWLock::RLocker l(m_image_ctx.owner_lock);
-  if (m_lock_owner_state == LOCK_OWNER_STATE_LOCKED) {
+  if (m_image_ctx.exclusive_lock != nullptr &&
+      m_image_ctx.exclusive_lock->accept_requests()) {
     bool new_request;
     Context *ctx;
     ProgressContext *prog_ctx;
@@ -1083,24 +750,46 @@ void ImageWatcher::handle_payload(const RebuildObjectMapPayload& payload,
       ldout(m_image_ctx.cct, 10) << this
                                  << " remote rebuild object map request: "
                                  << payload.async_request_id << dendl;
-      r = librbd::async_rebuild_object_map(&m_image_ctx, ctx, *prog_ctx);
-      if (r < 0) {
-        lderr(m_image_ctx.cct) << this
-                               << " remove rebuild object map request failed: "
-                               << cpp_strerror(r) << dendl;
-        cleanup_async_request(payload.async_request_id, ctx);
-      }
+      librbd::async_rebuild_object_map(&m_image_ctx, ctx, *prog_ctx);
     }
 
-    ::encode(ResponseMessage(0), *out);
+    ::encode(ResponseMessage(r), ack_ctx->out);
   }
+  return true;
+}
+
+bool ImageWatcher::handle_payload(const RenamePayload& payload,
+                                  C_NotifyAck *ack_ctx) {  // returns true when the caller must complete ack_ctx itself
+  RWLock::RLocker owner_locker(m_image_ctx.owner_lock);  // read-lock owner_lock for the exclusive_lock check
+  if (m_image_ctx.exclusive_lock != nullptr &&
+      m_image_ctx.exclusive_lock->accept_requests()) {  // act only when an exclusive lock object exists and accepts requests
+    ldout(m_image_ctx.cct, 10) << this << " remote rename request: "
+                               << payload.image_name << dendl;
+
+    librbd::rename_helper(&m_image_ctx, new C_ResponseMessage(ack_ctx),
+                          payload.image_name.c_str());
+    return false;  // ack deferred: C_ResponseMessage::finish encodes the result and completes ack_ctx
+  }
+  return true;  // nothing was encoded; the caller acknowledges with an empty response
 }
 
-void ImageWatcher::handle_payload(const UnknownPayload &payload,
-				  bufferlist *out) {
+bool ImageWatcher::handle_payload(const UnknownPayload &payload,
+				  C_NotifyAck *ack_ctx) {  // always returns true: the caller completes ack_ctx
   RWLock::RLocker l(m_image_ctx.owner_lock);
-  if (is_lock_owner()) {
-    ::encode(ResponseMessage(-EOPNOTSUPP), *out);
+  if (m_image_ctx.exclusive_lock != nullptr &&
+      m_image_ctx.exclusive_lock->accept_requests()) {  // respond only when an exclusive lock object exists and accepts requests
+    ::encode(ResponseMessage(-EOPNOTSUPP), ack_ctx->out);  // report the unrecognized message type as unsupported
+  }
+  return true;  // any response is already in ack_ctx->out
+}
+
+void ImageWatcher::process_payload(uint64_t notify_id, uint64_t handle,
+                                   const Payload &payload, int r) {  // r: 0 when called directly, else the result handed to C_ProcessPayload
+  if (r < 0) {  // on failure the payload is not dispatched
+    bufferlist out_bl;
+    acknowledge_notify(notify_id, handle, out_bl);  // acknowledge with an empty response
+  } else {
+    apply_visitor(HandlePayloadVisitor(this, notify_id, handle), payload);  // dispatch to the handle_payload overload for this payload type
   }
 }
 
@@ -1121,8 +810,13 @@ void ImageWatcher::handle_notify(uint64_t notify_id, uint64_t handle,
     }
   }
 
-  apply_visitor(HandlePayloadVisitor(this, notify_id, handle),
-		notify_message.payload);
+  // if an image refresh is required, refresh before processing the request
+  if (m_image_ctx.state->is_refresh_required()) {
+    m_image_ctx.state->refresh(new C_ProcessPayload(this, notify_id, handle,
+                                                    notify_message.payload));
+  } else {
+    process_payload(notify_id, handle, notify_message.payload, 0);
+  }
 }
 
 void ImageWatcher::handle_error(uint64_t handle, int err) {
@@ -1153,14 +847,22 @@ void ImageWatcher::acknowledge_notify(uint64_t notify_id, uint64_t handle,
 void ImageWatcher::reregister_watch() {
   ldout(m_image_ctx.cct, 10) << this << " re-registering image watch" << dendl;
 
-  RWLock::WLocker l(m_image_ctx.owner_lock);
-  bool was_lock_owner = false;
-  if (m_lock_owner_state == LOCK_OWNER_STATE_LOCKED) {
-    // ensure all async requests are canceled and IO is flushed
-    was_lock_owner = release_lock();
+  bool releasing_lock = false;
+  C_SaferCond release_lock_ctx;
+  {
+    RWLock::WLocker l(m_image_ctx.owner_lock);
+    if (m_image_ctx.exclusive_lock != nullptr) {
+      releasing_lock = true;
+      m_image_ctx.exclusive_lock->release_lock(&release_lock_ctx);
+    }
   }
 
   int r;
+  if (releasing_lock) {
+    r = release_lock_ctx.wait();
+    assert(r == 0);
+  }
+
   {
     RWLock::WLocker l(m_watch_lock);
     if (m_watch_state != WATCH_STATE_ERROR) {
@@ -1184,22 +886,6 @@ void ImageWatcher::reregister_watch() {
     m_watch_state = WATCH_STATE_REGISTERED;
   }
   handle_payload(HeaderUpdatePayload(), NULL);
-
-  if (was_lock_owner) {
-    r = try_lock();
-    if (r == -EBUSY) {
-      ldout(m_image_ctx.cct, 5) << this << "lost image lock while "
-                                << "re-registering image watch" << dendl;
-    } else if (r < 0) {
-      lderr(m_image_ctx.cct) << this
-                             << "failed to lock image while re-registering "
-                             << "image watch" << cpp_strerror(r) << dendl;
-    }
-  }
-
-  if (m_lock_owner_state == LOCK_OWNER_STATE_NOT_LOCKED) {
-    notify_listeners_updated_lock(LOCK_UPDATE_STATE_UNLOCKED);
-  }
 }
 
 void ImageWatcher::WatchCtx::handle_notify(uint64_t notify_id,
@@ -1217,25 +903,29 @@ void ImageWatcher::RemoteContext::finish(int r) {
   m_image_watcher.schedule_async_complete(m_async_request_id, r);
 }
 
-void ImageWatcher::notify_listeners_updated_lock(
-    LockUpdateState lock_update_state) {
-  assert(m_image_ctx.owner_lock.is_locked());
-
-  Listeners listeners;
-  {
-    Mutex::Locker listeners_locker(m_listeners_lock);
-    m_listeners_in_use = true;
-    listeners = m_listeners;
-  }
+ImageWatcher::C_NotifyAck::C_NotifyAck(ImageWatcher *image_watcher,
+                                       uint64_t notify_id, uint64_t handle)
+  : image_watcher(image_watcher), notify_id(notify_id), handle(handle) {  // only stores its arguments and logs; the ack itself is sent from finish()
+  CephContext *cct = image_watcher->m_image_ctx.cct;
+  ldout(cct, 10) << this << " C_NotifyAck start: id=" << notify_id << ", "
+                 << "handle=" << handle << dendl;
+}
 
-  for (Listeners::iterator it = listeners.begin();
-       it != listeners.end(); ++it) {
-    (*it)->handle_lock_updated(lock_update_state);
-  }
+void ImageWatcher::C_NotifyAck::finish(int r) {
+  assert(r == 0);  // the visible callers complete this context with 0; results travel in out instead
+  CephContext *cct = image_watcher->m_image_ctx.cct;
+  ldout(cct, 10) << this << " C_NotifyAck finish: id=" << notify_id << ", "
+                 << "handle=" << handle << dendl;
 
-  Mutex::Locker listeners_locker(m_listeners_lock);
-  m_listeners_in_use = false;
-  m_listeners_cond.Signal();
+  image_watcher->acknowledge_notify(notify_id, handle, out);  // send whatever was encoded into out (possibly nothing)
 }
 
+void ImageWatcher::C_ResponseMessage::finish(int r) {
+  CephContext *cct = notify_ack->image_watcher->m_image_ctx.cct;
+  ldout(cct, 10) << this << " C_ResponseMessage: r=" << r << dendl;
+
+  ::encode(ResponseMessage(r), notify_ack->out);  // the operation result r becomes the response payload
+  notify_ack->complete(0);  // always 0 here: C_NotifyAck::finish asserts r == 0
 }
+
+} // namespace librbd
diff --git a/src/librbd/ImageWatcher.h b/src/librbd/ImageWatcher.h
index 2b3da27..c82ca55 100644
--- a/src/librbd/ImageWatcher.h
+++ b/src/librbd/ImageWatcher.h
@@ -26,42 +26,12 @@ template <typename T> class TaskFinisher;
 
 class ImageWatcher {
 public:
-  enum LockUpdateState {
-    LOCK_UPDATE_STATE_NOT_SUPPORTED,
-    LOCK_UPDATE_STATE_LOCKED,
-    LOCK_UPDATE_STATE_RELEASING,
-    LOCK_UPDATE_STATE_UNLOCKED,
-    LOCK_UPDATE_STATE_NOTIFICATION
-  };
-
-  struct Listener {
-    virtual ~Listener() {}
-
-    virtual bool handle_requested_lock() = 0;
-    virtual void handle_lock_updated(LockUpdateState lock_update_state) = 0;
-  };
-
   ImageWatcher(ImageCtx& image_ctx);
   ~ImageWatcher();
 
-  bool is_lock_supported() const;
-  bool is_lock_supported(const RWLock &snap_lock) const;
-  bool is_lock_owner() const;
-
-  void register_listener(Listener *listener);
-  void unregister_listener(Listener *listener);
-
   int register_watch();
   int unregister_watch();
 
-  int refresh();
-
-  int try_lock();
-  void request_lock();
-  int release_lock();
-
-  void assert_header_locked(librados::ObjectWriteOperation *op);
-
   int notify_flatten(uint64_t request_id, ProgressContext &prog_ctx);
   int notify_resize(uint64_t request_id, uint64_t size,
                     ProgressContext &prog_ctx);
@@ -69,21 +39,25 @@ public:
   int notify_snap_rename(const snapid_t &src_snap_id,
                          const std::string &dst_snap_name);
   int notify_snap_remove(const std::string &snap_name);
+  int notify_snap_protect(const std::string &snap_name);
+  int notify_snap_unprotect(const std::string &snap_name);
   int notify_rebuild_object_map(uint64_t request_id,
                                 ProgressContext &prog_ctx);
+  int notify_rename(const std::string &image_name);
+
+  void notify_acquired_lock();
+  void notify_released_lock();
+  void notify_request_lock();
 
-  void notify_lock_state();
   static void notify_header_update(librados::IoCtx &io_ctx,
                                    const std::string &oid);
 
-private:
-
-  enum LockOwnerState {
-    LOCK_OWNER_STATE_NOT_LOCKED,
-    LOCK_OWNER_STATE_LOCKED,
-    LOCK_OWNER_STATE_RELEASING
-  };
+  uint64_t get_watch_handle() const {  // returns m_watch_handle, read under m_watch_lock
+    RWLock::RLocker watch_locker(m_watch_lock);  // m_watch_lock is declared mutable so this const accessor can lock it
+    return m_watch_handle;
+  }
 
+private:
   enum WatchState {
     WATCH_STATE_UNREGISTERED,
     WATCH_STATE_REGISTERED,
@@ -93,7 +67,6 @@ private:
   enum TaskCode {
     TASK_CODE_ACQUIRED_LOCK,
     TASK_CODE_REQUEST_LOCK,
-    TASK_CODE_RELEASING_LOCK,
     TASK_CODE_RELEASED_LOCK,
     TASK_CODE_CANCEL_ASYNC_REQUESTS,
     TASK_CODE_REREGISTER_WATCH,
@@ -101,13 +74,12 @@ private:
     TASK_CODE_ASYNC_PROGRESS
   };
 
-  typedef std::list<Listener *> Listeners;
   typedef std::pair<Context *, ProgressContext *> AsyncRequest;
 
   class Task {
   public:
     Task(TaskCode task_code) : m_task_code(task_code) {}
-    Task(TaskCode task_code, const WatchNotify::AsyncRequestId &id)
+    Task(TaskCode task_code, const watch_notify::AsyncRequestId &id)
       : m_task_code(task_code), m_async_request_id(id) {}
 
     inline bool operator<(const Task& rhs) const {
@@ -122,7 +94,7 @@ private:
     }
   private:
     TaskCode m_task_code;
-    WatchNotify::AsyncRequestId m_async_request_id;
+    watch_notify::AsyncRequestId m_async_request_id;
   };
 
   struct WatchCtx : public librados::WatchCtx2 {
@@ -140,7 +112,7 @@ private:
   class RemoteProgressContext : public ProgressContext {
   public:
     RemoteProgressContext(ImageWatcher &image_watcher,
-                          const WatchNotify::AsyncRequestId &id)
+                          const watch_notify::AsyncRequestId &id)
       : m_image_watcher(image_watcher), m_async_request_id(id)
     {
     }
@@ -153,14 +125,14 @@ private:
 
   private:
     ImageWatcher &m_image_watcher;
-    WatchNotify::AsyncRequestId m_async_request_id;
+    watch_notify::AsyncRequestId m_async_request_id;
   };
 
   class RemoteContext : public Context {
   public:
     RemoteContext(ImageWatcher &image_watcher,
-                  const WatchNotify::AsyncRequestId &id,
-                  ProgressContext *prog_ctx)
+      	          const watch_notify::AsyncRequestId &id,
+      	          ProgressContext *prog_ctx)
       : m_image_watcher(image_watcher), m_async_request_id(id),
         m_prog_ctx(prog_ctx)
     {
@@ -174,10 +146,46 @@ private:
 
   private:
     ImageWatcher &m_image_watcher;
-    WatchNotify::AsyncRequestId m_async_request_id;
+    watch_notify::AsyncRequestId m_async_request_id;
     ProgressContext *m_prog_ctx;
   };
 
+  struct C_NotifyAck : public Context {  // acknowledges one notification when completed
+    ImageWatcher *image_watcher;
+    uint64_t notify_id;
+    uint64_t handle;
+    bufferlist out;  // response payload that finish() passes to acknowledge_notify()
+
+    C_NotifyAck(ImageWatcher *image_watcher, uint64_t notify_id,
+                uint64_t handle);
+    virtual void finish(int r);
+  };
+
+  struct C_ResponseMessage : public Context {  // on completion, encodes its result into notify_ack->out and completes notify_ack
+    C_NotifyAck *notify_ack;
+
+    C_ResponseMessage(C_NotifyAck *notify_ack) : notify_ack(notify_ack) {
+    }
+    virtual void finish(int r);
+  };
+
+  struct C_ProcessPayload : public Context {  // carries a notification until it can be passed to process_payload()
+    ImageWatcher *image_watcher;
+    uint64_t notify_id;
+    uint64_t handle;
+    watch_notify::Payload payload;  // stored by value, so it does not depend on the caller's message
+
+    C_ProcessPayload(ImageWatcher *image_watcher_, uint64_t notify_id_,
+                     uint64_t handle_, const watch_notify::Payload &payload)
+      : image_watcher(image_watcher_), notify_id(notify_id_), handle(handle_),
+        payload(payload) {
+    }
+
+    virtual void finish(int r) override {
+      image_watcher->process_payload(notify_id, handle, payload, r);  // r is forwarded unchanged; process_payload decides what a negative r means
+    }
+  };
+
   struct HandlePayloadVisitor : public boost::static_visitor<void> {
     ImageWatcher *image_watcher;
     uint64_t notify_id;
@@ -189,122 +197,99 @@ private:
     {
     }
 
-    inline void operator()(const WatchNotify::HeaderUpdatePayload &payload) const {
-      bufferlist out;
-      image_watcher->handle_payload(payload, &out);
-      image_watcher->acknowledge_notify(notify_id, handle, out);
-    }
-
     template <typename Payload>
     inline void operator()(const Payload &payload) const {
-      bufferlist out;
-      image_watcher->handle_payload(payload, &out);
-      image_watcher->acknowledge_notify(notify_id, handle, out);
+      C_NotifyAck *ctx = new C_NotifyAck(image_watcher, notify_id,
+                                                handle);
+      if (image_watcher->handle_payload(payload, ctx)) {
+        ctx->complete(0);
+      }
     }
   };
 
   ImageCtx &m_image_ctx;
 
-  RWLock m_watch_lock;
+  mutable RWLock m_watch_lock;
   WatchCtx m_watch_ctx;
   uint64_t m_watch_handle;
   WatchState m_watch_state;
 
-  bool m_lock_supported;
-
-  LockOwnerState m_lock_owner_state;
-
-  Mutex m_listeners_lock;
-  Cond m_listeners_cond;
-  Listeners m_listeners;
-  bool m_listeners_in_use;
-
   TaskFinisher<Task> *m_task_finisher;
 
   RWLock m_async_request_lock;
-  std::map<WatchNotify::AsyncRequestId, AsyncRequest> m_async_requests;
-  std::set<WatchNotify::AsyncRequestId> m_async_pending;
+  std::map<watch_notify::AsyncRequestId, AsyncRequest> m_async_requests;
+  std::set<watch_notify::AsyncRequestId> m_async_pending;
 
   Mutex m_owner_client_id_lock;
-  WatchNotify::ClientId m_owner_client_id;
-
-  std::string encode_lock_cookie() const;
-  static bool decode_lock_cookie(const std::string &cookie, uint64_t *handle);
-
-  int get_lock_owner_info(entity_name_t *locker, std::string *cookie,
-                          std::string *address, uint64_t *handle);
-  int lock();
-  int unlock();
-  bool try_request_lock();
+  watch_notify::ClientId m_owner_client_id;
 
   void schedule_cancel_async_requests();
   void cancel_async_requests();
 
-  void set_owner_client_id(const WatchNotify::ClientId &client_id);
-  WatchNotify::ClientId get_client_id();
-
-  void notify_acquired_lock();
-  void notify_release_lock();
-  void notify_released_lock();
+  void set_owner_client_id(const watch_notify::ClientId &client_id);
+  watch_notify::ClientId get_client_id();
 
   void schedule_request_lock(bool use_timer, int timer_delay = -1);
-  void notify_request_lock();
 
   int notify_lock_owner(bufferlist &bl);
 
-  void schedule_async_request_timed_out(const WatchNotify::AsyncRequestId &id);
-  void async_request_timed_out(const WatchNotify::AsyncRequestId &id);
-  int notify_async_request(const WatchNotify::AsyncRequestId &id,
+  void schedule_async_request_timed_out(const watch_notify::AsyncRequestId &id);
+  void async_request_timed_out(const watch_notify::AsyncRequestId &id);
+  int notify_async_request(const watch_notify::AsyncRequestId &id,
                            bufferlist &in, ProgressContext& prog_ctx);
   void notify_request_leadership();
 
-  void schedule_async_progress(const WatchNotify::AsyncRequestId &id,
+  void schedule_async_progress(const watch_notify::AsyncRequestId &id,
                                uint64_t offset, uint64_t total);
-  int notify_async_progress(const WatchNotify::AsyncRequestId &id,
+  int notify_async_progress(const watch_notify::AsyncRequestId &id,
                             uint64_t offset, uint64_t total);
-  void schedule_async_complete(const WatchNotify::AsyncRequestId &id, int r);
-  int notify_async_complete(const WatchNotify::AsyncRequestId &id, int r);
+  void schedule_async_complete(const watch_notify::AsyncRequestId &id, int r);
+  int notify_async_complete(const watch_notify::AsyncRequestId &id, int r);
 
-  int prepare_async_request(const WatchNotify::AsyncRequestId& id,
+  int prepare_async_request(const watch_notify::AsyncRequestId& id,
                             bool* new_request, Context** ctx,
                             ProgressContext** prog_ctx);
-  void cleanup_async_request(const WatchNotify::AsyncRequestId& id,
-                             Context *ctx);
-
-  void handle_payload(const WatchNotify::HeaderUpdatePayload& payload,
-                      bufferlist *out);
-  void handle_payload(const WatchNotify::AcquiredLockPayload& payload,
-                      bufferlist *out);
-  void handle_payload(const WatchNotify::ReleasedLockPayload& payload,
-                      bufferlist *out);
-  void handle_payload(const WatchNotify::RequestLockPayload& payload,
-                      bufferlist *out);
-  void handle_payload(const WatchNotify::AsyncProgressPayload& payload,
-                      bufferlist *out);
-  void handle_payload(const WatchNotify::AsyncCompletePayload& payload,
-                      bufferlist *out);
-  void handle_payload(const WatchNotify::FlattenPayload& payload,
-                      bufferlist *out);
-  void handle_payload(const WatchNotify::ResizePayload& payload,
-                      bufferlist *out);
-  void handle_payload(const WatchNotify::SnapCreatePayload& payload,
-                      bufferlist *out);
-  void handle_payload(const WatchNotify::SnapRenamePayload& payload,
-                      bufferlist *out);
-  void handle_payload(const WatchNotify::SnapRemovePayload& payload,
-                      bufferlist *out);
-  void handle_payload(const WatchNotify::RebuildObjectMapPayload& payload,
-                      bufferlist *out);
-  void handle_payload(const WatchNotify::UnknownPayload& payload,
-                      bufferlist *out);
+
+  bool handle_payload(const watch_notify::HeaderUpdatePayload& payload,
+                      C_NotifyAck *ctx);
+  bool handle_payload(const watch_notify::AcquiredLockPayload& payload,
+                      C_NotifyAck *ctx);
+  bool handle_payload(const watch_notify::ReleasedLockPayload& payload,
+                      C_NotifyAck *ctx);
+  bool handle_payload(const watch_notify::RequestLockPayload& payload,
+                      C_NotifyAck *ctx);
+  bool handle_payload(const watch_notify::AsyncProgressPayload& payload,
+                      C_NotifyAck *ctx);
+  bool handle_payload(const watch_notify::AsyncCompletePayload& payload,
+                      C_NotifyAck *ctx);
+  bool handle_payload(const watch_notify::FlattenPayload& payload,
+                      C_NotifyAck *ctx);
+  bool handle_payload(const watch_notify::ResizePayload& payload,
+                      C_NotifyAck *ctx);
+  bool handle_payload(const watch_notify::SnapCreatePayload& payload,
+                      C_NotifyAck *ctx);
+  bool handle_payload(const watch_notify::SnapRenamePayload& payload,
+                      C_NotifyAck *ctx);
+  bool handle_payload(const watch_notify::SnapRemovePayload& payload,
+                      C_NotifyAck *ctx);
+  bool handle_payload(const watch_notify::SnapProtectPayload& payload,
+                      C_NotifyAck *ctx);
+  bool handle_payload(const watch_notify::SnapUnprotectPayload& payload,
+                      C_NotifyAck *ctx);
+  bool handle_payload(const watch_notify::RebuildObjectMapPayload& payload,
+                      C_NotifyAck *ctx);
+  bool handle_payload(const watch_notify::RenamePayload& payload,
+                      C_NotifyAck *ctx);
+  bool handle_payload(const watch_notify::UnknownPayload& payload,
+                      C_NotifyAck *ctx);
+  void process_payload(uint64_t notify_id, uint64_t handle,
+                       const watch_notify::Payload &payload, int r);
 
   void handle_notify(uint64_t notify_id, uint64_t handle, bufferlist &bl);
   void handle_error(uint64_t cookie, int err);
   void acknowledge_notify(uint64_t notify_id, uint64_t handle, bufferlist &out);
 
   void reregister_watch();
-
-  void notify_listeners_updated_lock(LockUpdateState lock_update_state);
 };
 
 } // namespace librbd
diff --git a/src/librbd/Journal.cc b/src/librbd/Journal.cc
index 37e311e..903a458 100644
--- a/src/librbd/Journal.cc
+++ b/src/librbd/Journal.cc
@@ -5,12 +5,16 @@
 #include "librbd/AioCompletion.h"
 #include "librbd/AioImageRequestWQ.h"
 #include "librbd/AioObjectRequest.h"
+#include "librbd/ExclusiveLock.h"
 #include "librbd/ImageCtx.h"
 #include "librbd/JournalReplay.h"
 #include "librbd/JournalTypes.h"
+#include "librbd/Utils.h"
 #include "journal/Journaler.h"
 #include "journal/ReplayEntry.h"
 #include "common/errno.h"
+#include <boost/utility/enable_if.hpp>
+#include <boost/type_traits/is_base_of.hpp>
 
 #define dout_subsys ceph_subsys_rbd
 #undef dout_prefix
@@ -22,13 +26,37 @@ namespace {
 
 const std::string CLIENT_DESCRIPTION = "master image";
 
-struct C_DestroyJournaler : public Context {
+struct SetOpRequestTid : public boost::static_visitor<void> {  // visitor that writes a tid into an op event
+  uint64_t tid;
+
+  SetOpRequestTid(uint64_t _tid) : tid(_tid) {
+  }
+
+  template <typename Event>
+  typename boost::enable_if<boost::is_base_of<journal::OpEventBase, Event>,
+                            void>::type
+  operator()(Event &event) const {  // selected for event types derived from journal::OpEventBase
+    event.tid = tid;
+  }
+
+  template <typename Event>
+  typename boost::disable_if<boost::is_base_of<journal::OpEventBase, Event>,
+                            void>::type
+  operator()(Event &event) const {  // selected for every other event type
+    assert(false);  // this visitor must only be applied to op events
+  }
+};
+
+struct C_ReplayCommitted : public Context {  // on completion, reports replay_entry to the journaler as committed
   ::journal::Journaler *journaler;
+  ::journal::ReplayEntry replay_entry;  // held by value; moved in by the constructor
 
-  C_DestroyJournaler(::journal::Journaler *_journaler) : journaler(_journaler) {
+  C_ReplayCommitted(::journal::Journaler *journaler,
+		    ::journal::ReplayEntry &&replay_entry) :
+    journaler(journaler), replay_entry(std::move(replay_entry)) {
   }
   virtual void finish(int r) {
-    delete journaler;
+    journaler->committed(replay_entry);  // r is not inspected: committed() is called for any result
   }
 };
 
@@ -37,27 +65,18 @@ struct C_DestroyJournaler : public Context {
 Journal::Journal(ImageCtx &image_ctx)
   : m_image_ctx(image_ctx), m_journaler(NULL),
     m_lock("Journal::m_lock"), m_state(STATE_UNINITIALIZED),
-    m_lock_listener(this), m_replay_handler(this), m_close_pending(false),
+    m_error_result(0), m_replay_handler(this), m_close_pending(false),  // starts with no stored error; the lock listener member is gone
     m_event_lock("Journal::m_event_lock"), m_event_tid(0),
     m_blocking_writes(false), m_journal_replay(NULL) {
 
   ldout(m_image_ctx.cct, 5) << this << ": ictx=" << &m_image_ctx << dendl;
-
-  m_image_ctx.image_watcher->register_listener(&m_lock_listener);
-
-  Mutex::Locker locker(m_lock);
-  block_writes();
 }
 
 Journal::~Journal() {
-  m_image_ctx.op_work_queue->drain();
+  assert(m_state == STATE_UNINITIALIZED || m_state == STATE_CLOSED);  // must be unopened or fully closed before destruction
   assert(m_journaler == NULL);
   assert(m_journal_replay == NULL);
-
-  m_image_ctx.image_watcher->unregister_listener(&m_lock_listener);
-
-  Mutex::Locker locker(m_lock);
-  unblock_writes();
+  assert(m_wait_for_state_contexts.empty());  // presumably the callbacks queued by wait_for_steady_state(); none may be left -- TODO confirm
 }
 
 bool Journal::is_journal_supported(ImageCtx &image_ctx) {
@@ -66,15 +85,30 @@ bool Journal::is_journal_supported(ImageCtx &image_ctx) {
           !image_ctx.read_only && image_ctx.snap_id == CEPH_NOSNAP);
 }
 
-int Journal::create(librados::IoCtx &io_ctx, const std::string &image_id) {
+int Journal::create(librados::IoCtx &io_ctx, const std::string &image_id,
+		    uint8_t order, uint8_t splay_width,
+		    const std::string &object_pool) {
   CephContext *cct = reinterpret_cast<CephContext *>(io_ctx.cct());
   ldout(cct, 5) << __func__ << ": image=" << image_id << dendl;
 
-  // TODO configurable commit flush interval
-  ::journal::Journaler journaler(io_ctx, image_id, "", 5);
+  int64_t pool_id = -1;
+  if (!object_pool.empty()) {
+    librados::Rados rados(io_ctx);
+    IoCtx data_io_ctx;
+    int r = rados.ioctx_create(object_pool.c_str(), data_io_ctx);
+    if (r != 0) {
+      lderr(cct) << "failed to create journal: "
+		 << "error opening journal objects pool '" << object_pool
+		 << "': " << cpp_strerror(r) << dendl;
+      return r;
+    }
+    pool_id = data_io_ctx.get_id();
+  }
+
+  ::journal::Journaler journaler(io_ctx, image_id, "",
+				 cct->_conf->rbd_journal_commit_age);
 
-  // TODO order / splay width via config / image metadata / data pool
-  int r = journaler.create(24, 4, io_ctx.get_id());
+  int r = journaler.create(order, splay_width, pool_id);
   if (r < 0) {
     lderr(cct) << "failed to create journal: " << cpp_strerror(r) << dendl;
     return r;
@@ -92,13 +126,22 @@ int Journal::remove(librados::IoCtx &io_ctx, const std::string &image_id) {
   CephContext *cct = reinterpret_cast<CephContext *>(io_ctx.cct());
   ldout(cct, 5) << __func__ << ": image=" << image_id << dendl;
 
-  // TODO configurable commit flush interval
-  ::journal::Journaler journaler(io_ctx, image_id, "", 5);
+  ::journal::Journaler journaler(io_ctx, image_id, "",
+				 cct->_conf->rbd_journal_commit_age);
+
+  bool journal_exists;
+  int r = journaler.exists(&journal_exists);
+  if (r < 0) {
+    lderr(cct) << "failed to stat journal header: " << cpp_strerror(r) << dendl;
+    return r;
+  } else if (!journal_exists) {
+    return 0;
+  }
 
   C_SaferCond cond;
   journaler.init(&cond);
 
-  int r = cond.wait();
+  r = cond.wait();
   if (r == -ENOENT) {
     return 0;
   } else if (r < 0) {
@@ -106,7 +149,7 @@ int Journal::remove(librados::IoCtx &io_ctx, const std::string &image_id) {
     return r;
   }
 
-  r = journaler.remove();
+  r = journaler.remove(false);
   if (r < 0) {
     lderr(cct) << "failed to remove journal: " << cpp_strerror(r) << dendl;
     return r;
@@ -114,9 +157,49 @@ int Journal::remove(librados::IoCtx &io_ctx, const std::string &image_id) {
   return 0;
 }
 
+int Journal::reset(librados::IoCtx &io_ctx, const std::string &image_id) {  // removes the journal and re-creates it with its previous metadata; returns 0 or a negative error
+  CephContext *cct = reinterpret_cast<CephContext *>(io_ctx.cct());
+  ldout(cct, 5) << __func__ << ": image=" << image_id << dendl;
+
+  ::journal::Journaler journaler(io_ctx, image_id, "",
+				 cct->_conf->rbd_journal_commit_age);
+
+  C_SaferCond cond;
+  journaler.init(&cond);
+
+  int r = cond.wait();  // blocks until init completes
+  if (r == -ENOENT) {  // init reported -ENOENT: treated as success, nothing is reset
+    return 0;
+  } else if (r < 0) {
+    lderr(cct) << "failed to initialize journal: " << cpp_strerror(r) << dendl;
+    return r;
+  }
+
+  uint8_t order, splay_width;
+  int64_t pool_id;
+  journaler.get_metadata(&order, &splay_width, &pool_id);  // capture the values before removal so create() can reuse them
+
+  r = journaler.remove(true);  // NOTE(review): meaning of the bool is not visible here; Journal::remove passes false
+  if (r < 0) {
+    lderr(cct) << "failed to reset journal: " << cpp_strerror(r) << dendl;
+    return r;
+  }
+  r = journaler.create(order, splay_width, pool_id);  // NOTE(review): if this fails the journal has already been removed
+  if (r < 0) {
+    lderr(cct) << "failed to create journal: " << cpp_strerror(r) << dendl;
+    return r;
+  }
+  r = journaler.register_client(CLIENT_DESCRIPTION);
+  if (r < 0) {
+    lderr(cct) << "failed to register client: " << cpp_strerror(r) << dendl;
+    return r;
+  }
+  return 0;
+}
+
 bool Journal::is_journal_ready() const {
   Mutex::Locker locker(m_lock);
-  return (m_state == STATE_RECORDING);
+  return (m_state == STATE_READY);  // STATE_READY replaces the former STATE_RECORDING check
 }
 
 bool Journal::is_journal_replaying() const {
@@ -124,70 +207,55 @@ bool Journal::is_journal_replaying() const {
   return (m_state == STATE_REPLAYING);
 }
 
-bool Journal::wait_for_journal_ready() {
-  Mutex::Locker locker(m_lock);
-  while (m_state != STATE_UNINITIALIZED && m_state != STATE_RECORDING) {
-    wait_for_state_transition();
-  }
-  return (m_state == STATE_RECORDING);
-}
+void Journal::wait_for_journal_ready(Context *on_ready) {  // reports through on_ready instead of blocking the caller
+  on_ready = util::create_async_context_callback(m_image_ctx, on_ready);  // presumably defers the callback so it does not run under m_lock -- confirm in librbd/Utils.h
 
-void Journal::open() {
   Mutex::Locker locker(m_lock);
-  if (m_journaler != NULL) {
-    return;
+  if (m_state == STATE_READY) {
+    on_ready->complete(m_error_result);  // already ready: complete now with the stored result
+  } else {
+    wait_for_steady_state(on_ready);  // otherwise hand the callback to the state machine
   }
+}
 
+void Journal::open(Context *on_finish) {  // starts opening the journal; the outcome is reported through on_finish
   CephContext *cct = m_image_ctx.cct;
   ldout(cct, 20) << this << " " << __func__ << dendl;
+
+  on_finish = util::create_async_context_callback(m_image_ctx, on_finish);  // presumably defers the callback so it does not run under m_lock -- confirm in librbd/Utils.h
+
+  Mutex::Locker locker(m_lock);
+  assert(m_state == STATE_UNINITIALIZED);  // open is only valid from the uninitialized state
+  wait_for_steady_state(on_finish);  // register the callback before initialization is started
   create_journaler();
 }
 
-int Journal::close() {
+void Journal::close(Context *on_finish) {  // requests a close; the outcome is reported through on_finish
   CephContext *cct = m_image_ctx.cct;
-  ldout(cct, 20) << this << " " << __func__ << ": state=" << m_state << dendl;
+  ldout(cct, 20) << this << " " << __func__ << dendl;
+
+  on_finish = util::create_async_context_callback(m_image_ctx, on_finish);  // presumably defers the callback so it does not run under m_lock -- confirm in librbd/Utils.h
 
   Mutex::Locker locker(m_lock);
-  if (m_state == STATE_UNINITIALIZED) {
-    return 0;
+  assert(m_state != STATE_UNINITIALIZED);  // close requires that open() was called first
+  if (m_state == STATE_CLOSED) {
+    on_finish->complete(m_error_result);  // already closed: report the stored result right away
+    return;
   }
 
-  int r;
-  bool done = false;
-  while (!done) {
-    switch (m_state) {
-    case STATE_UNINITIALIZED:
-      done = true;
-      break;
-    case STATE_INITIALIZING:
-    case STATE_REPLAYING:
-      m_close_pending = true;
-      wait_for_state_transition();
-      break;
-    case STATE_STOPPING_RECORDING:
-      wait_for_state_transition();
-      break;
-    case STATE_RECORDING:
-      r = stop_recording();
-      if (r < 0) {
-        return r;
-      }
-      done = true;
-      break;
-    default:
-      assert(false);
-    }
+  if (m_state == STATE_READY) {
+    stop_recording();  // only a ready journal has recording to stop
   }
 
-  destroy_journaler();
-  return 0;
+  m_close_pending = true;  // set in every non-closed state; the code that reads it is outside this hunk
+  wait_for_steady_state(on_finish);  // on_finish is handed to the state machine rather than completed here
 }
 
-uint64_t Journal::append_event(AioCompletion *aio_comp,
-                               const journal::EventEntry &event_entry,
-                               const AioObjectRequests &requests,
-                               uint64_t offset, size_t length,
-                               bool flush_entry) {
+uint64_t Journal::append_io_event(AioCompletion *aio_comp,
+                                  const journal::EventEntry &event_entry,
+                                  const AioObjectRequests &requests,
+                                  uint64_t offset, size_t length,
+                                  bool flush_entry) {
   assert(m_image_ctx.owner_lock.is_locked());
 
   bufferlist bl;
@@ -197,7 +265,7 @@ uint64_t Journal::append_event(AioCompletion *aio_comp,
   uint64_t tid;
   {
     Mutex::Locker locker(m_lock);
-    assert(m_state == STATE_RECORDING);
+    assert(m_state == STATE_READY);
 
     future = m_journaler->append("", bl);
 
@@ -225,7 +293,7 @@ uint64_t Journal::append_event(AioCompletion *aio_comp,
   return tid;
 }
 
-void Journal::commit_event(uint64_t tid, int r) {
+void Journal::commit_io_event(uint64_t tid, int r) {
   CephContext *cct = m_image_ctx.cct;
   ldout(cct, 20) << this << " " << __func__ << ": tid=" << tid << ", "
                  "r=" << r << dendl;
@@ -238,8 +306,8 @@ void Journal::commit_event(uint64_t tid, int r) {
   complete_event(it, r);
 }
 
-void Journal::commit_event_extent(uint64_t tid, uint64_t offset,
-                                  uint64_t length, int r) {
+void Journal::commit_io_event_extent(uint64_t tid, uint64_t offset,
+                                     uint64_t length, int r) {
   assert(length > 0);
 
   CephContext *cct = m_image_ctx.cct;
@@ -273,6 +341,50 @@ void Journal::commit_event_extent(uint64_t tid, uint64_t offset,
   complete_event(it, event.ret_val);
 }
 
+uint64_t Journal::append_op_event(journal::EventEntry &event_entry) {  // returns the tid assigned to the event; event_entry is modified in place
+  assert(m_image_ctx.owner_lock.is_locked());  // caller must hold owner_lock
+
+  uint64_t tid;
+  {
+    Mutex::Locker locker(m_lock);
+    assert(m_state == STATE_READY);
+
+    Mutex::Locker event_locker(m_event_lock);  // taken while m_lock is still held
+    tid = ++m_event_tid;
+    assert(tid != 0);  // a zero tid would mean m_event_tid wrapped around
+
+    // inject the generated tid into the provided event entry
+    boost::apply_visitor(SetOpRequestTid(tid), event_entry.event);
+
+    bufferlist bl;
+    ::encode(event_entry, bl);
+    m_journaler->committed(m_journaler->append("", bl));  // the append result is handed straight to committed()
+  }
+
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 10) << this << " " << __func__ << ": "
+                 << "event=" << event_entry.get_event_type() << ", "
+                 << "tid=" << tid << dendl;
+  return tid;
+}
+
+void Journal::commit_op_event(uint64_t tid, int r) {  // appends an OpFinishEvent carrying tid and r to the journal
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 10) << this << " " << __func__ << ": tid=" << tid << dendl;  // NOTE(review): r is not included in this log line
+
+  journal::EventEntry event_entry((journal::OpFinishEvent(tid, r)));
+
+  bufferlist bl;
+  ::encode(event_entry, bl);  // encoded before m_lock is taken
+
+  {
+    Mutex::Locker locker(m_lock);
+    assert(m_state == STATE_READY);
+
+    m_journaler->committed(m_journaler->append("", bl));  // the append result is handed straight to committed()
+  }
+}
+
 void Journal::flush_event(uint64_t tid, Context *on_safe) {
   CephContext *cct = m_image_ctx.cct;
   ldout(cct, 20) << this << " " << __func__ << ": tid=" << tid << ", "
@@ -321,36 +433,45 @@ void Journal::create_journaler() {
   ldout(cct, 20) << this << " " << __func__ << dendl;
 
   assert(m_lock.is_locked());
-  assert(m_state == STATE_UNINITIALIZED);
+  assert(m_state == STATE_UNINITIALIZED || m_state == STATE_RESTARTING_REPLAY);
+  assert(m_journaler == NULL);
 
-  // TODO allow alternate pool for journal objects and commit flush interval
-  m_close_pending = false;
+  transition_state(STATE_INITIALIZING, 0);
   m_journaler = new ::journal::Journaler(m_image_ctx.md_ctx, m_image_ctx.id, "",
-                                         5);
-
+                                         m_image_ctx.journal_commit_age);
   m_journaler->init(new C_InitJournal(this));
-  transition_state(STATE_INITIALIZING);
 }
 
-void Journal::destroy_journaler() {
+void Journal::destroy_journaler(int r) {  // r is forwarded to transition_state() below
   CephContext *cct = m_image_ctx.cct;
-  ldout(cct, 20) << this << " " << __func__ << dendl;
+  ldout(cct, 20) << this << " " << __func__ << ": r=" << r << dendl;
 
   assert(m_lock.is_locked());
 
   delete m_journal_replay;
   m_journal_replay = NULL;
 
-  m_close_pending = false;
-  m_image_ctx.op_work_queue->queue(new C_DestroyJournaler(m_journaler), 0);
-  m_journaler = NULL;
+  transition_state(STATE_CLOSING, r);  // a negative r is latched into m_error_result if no error is recorded yet
+  m_image_ctx.op_work_queue->queue(new C_DestroyJournaler(this), 0);  // m_journaler is deleted later, in handle_journal_destroyed()
+}
+
+void Journal::recreate_journaler(int r) {  // tear down the journaler after a replay error and start over
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << this << " " << __func__ << ": r=" << r << dendl;
+
+  assert(m_lock.is_locked());
+  assert(m_state == STATE_REPLAYING);  // only valid while a replay is in progress
+
+  delete m_journal_replay;
+  m_journal_replay = NULL;
 
-  transition_state(STATE_UNINITIALIZED);
+  transition_state(STATE_RESTARTING_REPLAY, r);  // this state makes handle_journal_destroyed() call create_journaler() again
+  m_image_ctx.op_work_queue->queue(new C_DestroyJournaler(this), 0);
 }
 
 void Journal::complete_event(Events::iterator it, int r) {
   assert(m_event_lock.is_locked());
-  assert(m_state == STATE_RECORDING);
+  assert(m_state == STATE_READY);
 
   CephContext *cct = m_image_ctx.cct;
   ldout(cct, 20) << this << " " << __func__ << ": tid=" << it->first << " "
@@ -364,26 +485,20 @@ void Journal::complete_event(Events::iterator it, int r) {
 
 void Journal::handle_initialized(int r) {
   CephContext *cct = m_image_ctx.cct;
-  if (r < 0) {
-    lderr(cct) << this << " " << __func__ << ": r=" << r << dendl;
-    Mutex::Locker locker(m_lock);
+  ldout(cct, 20) << this << " " << __func__ << ": r=" << r << dendl;
 
-    // TODO: failed to open journal -- retry?
-    destroy_journaler();
-    create_journaler();
-    return;
-  }
-
-  ldout(cct, 20) << this << " " << __func__ << dendl;
   Mutex::Locker locker(m_lock);
-  if (m_close_pending) {
-    destroy_journaler();
+
+  if (r < 0) {
+    lderr(cct) << this << " " << __func__  // ": " below separates the function name from the message
+               << ": failed to initialize journal: " << cpp_strerror(r)
+               << dendl;
+    destroy_journaler(r);  // init failure moves to CLOSING and records r
     return;
   }
 
+  transition_state(STATE_REPLAYING, 0);  // state is switched before replay is started
   m_journal_replay = new JournalReplay(m_image_ctx);
-
-  transition_state(STATE_REPLAYING);
   m_journaler->start_replay(&m_replay_handler);
 }
 
@@ -397,12 +512,6 @@ void Journal::handle_replay_ready() {
   }
 
   while (true) {
-    if (m_close_pending) {
-      m_journaler->stop_replay();
-      destroy_journaler();
-      return;
-    }
-
     ::journal::ReplayEntry replay_entry;
     if (!m_journaler->try_pop_front(&replay_entry)) {
       return;
@@ -411,11 +520,21 @@ void Journal::handle_replay_ready() {
     m_lock.Unlock();
     bufferlist data = replay_entry.get_data();
     bufferlist::iterator it = data.begin();
-    int r = m_journal_replay->process(it);
+    int r = m_journal_replay->process(
+      it, new C_ReplayCommitted(m_journaler, std::move(replay_entry)));
     m_lock.Lock();
 
     if (r < 0) {
-      // TODO
+      lderr(cct) << "failed to replay journal entry: " << cpp_strerror(r)
+                 << dendl;
+      m_journaler->stop_replay();
+
+      if (m_close_pending) {
+        destroy_journaler(r);
+      } else {
+        recreate_journaler(r);  // deletes m_journal_replay and queues the journaler teardown
+      }
+      return;  // both paths free m_journal_replay; looping again would dereference NULL
     }
   }
 }
@@ -429,38 +548,66 @@ void Journal::handle_replay_complete(int r) {
       return;
     }
 
+    ldout(cct, 20) << this << " " << __func__ << dendl;
+    m_journaler->stop_replay();
+
     if (r == 0) {
       r = m_journal_replay->flush();
     }
-    delete m_journal_replay;
-    m_journal_replay = NULL;
 
     if (r < 0) {
       lderr(cct) << this << " " << __func__ << ": r=" << r << dendl;
-
-      // TODO: failed to replay journal -- retry?
-      destroy_journaler();
-      create_journaler();
+      recreate_journaler(r);
       return;
     }
 
-    ldout(cct, 20) << this << " " << __func__ << dendl;
-    m_journaler->stop_replay();
+    delete m_journal_replay;
+    m_journal_replay = NULL;
 
     if (m_close_pending) {
-      destroy_journaler();
+      destroy_journaler(0);
       return;
     }
 
-    // TODO configurable flush interval, flush bytes, and flush age
-    m_journaler->start_append(0, 0, 0);
-    transition_state(STATE_RECORDING);
+    m_error_result = 0;
+    m_journaler->start_append(m_image_ctx.journal_object_flush_interval,
+			      m_image_ctx.journal_object_flush_bytes,
+			      m_image_ctx.journal_object_flush_age);
+    transition_state(STATE_READY, 0);
+  }
+}
+
+void Journal::handle_recording_stopped(int r) {  // invoked through C_StopRecording::finish()
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << this << " " << __func__ << ": r=" << r << dendl;
+
+  Mutex::Locker locker(m_lock);
+  assert(m_state == STATE_STOPPING);  // stop_recording() set this state before calling stop_append()
+
+  destroy_journaler(r);  // continue the shutdown, carrying the stop result forward
+}
 
-    unblock_writes();
+void Journal::handle_journal_destroyed(int r) {  // invoked through C_DestroyJournaler::finish()
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << this << " " << __func__ << ": r=" << r << dendl;
+
+  if (r < 0) {
+    lderr(cct) << this << " " << __func__  // ": " below separates the function name from the message
+               << ": error detected while closing journal: " << cpp_strerror(r)
+               << dendl;
   }
 
-  // kick peers to let them know they can re-request the lock now
-  m_image_ctx.image_watcher->notify_lock_state();
+  Mutex::Locker locker(m_lock);
+  delete m_journaler;
+  m_journaler = nullptr;  // create_journaler() asserts that this is NULL
+
+  assert(m_state == STATE_CLOSING || m_state == STATE_RESTARTING_REPLAY);
+  if (m_state == STATE_RESTARTING_REPLAY) {
+    create_journaler();  // replay restart requested by recreate_journaler()
+    return;
+  }
+
+  transition_state(STATE_CLOSED, r);  // CLOSED is a steady state, so queued waiters are completed here
 }
 
 void Journal::handle_event_safe(int r, uint64_t tid) {
@@ -497,8 +644,6 @@ void Journal::handle_event_safe(int r, uint64_t tid) {
   } else {
     // send any waiting aio requests now that journal entry is safe
     RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
-    assert(m_image_ctx.image_watcher->is_lock_owner());
-
     for (AioObjectRequests::iterator it = aio_object_requests.begin();
          it != aio_object_requests.end(); ++it) {
       (*it)->send();
@@ -512,79 +657,15 @@ void Journal::handle_event_safe(int r, uint64_t tid) {
   }
 }
 
-bool Journal::handle_requested_lock() {
-  Mutex::Locker locker(m_lock);
-
-  CephContext *cct = m_image_ctx.cct;
-  ldout(cct, 20) << this << " " << __func__ << ": " << "state=" << m_state
-                 << dendl;
-
-  // prevent peers from taking our lock while we are replaying since that
-  // will stale forward progress
-  return (m_state != STATE_INITIALIZING && m_state != STATE_REPLAYING);
-}
-
-void Journal::handle_lock_updated(ImageWatcher::LockUpdateState state) {
-
-  CephContext *cct = m_image_ctx.cct;
-  ldout(cct, 20) << this << " " << __func__ << ": "
-                 << "state=" << state << dendl;
-
-  Mutex::Locker locker(m_lock);
-  if (state == ImageWatcher::LOCK_UPDATE_STATE_LOCKED &&
-      m_state == STATE_UNINITIALIZED) {
-    create_journaler();
-  } else if (state == ImageWatcher::LOCK_UPDATE_STATE_RELEASING) {
-    if (m_state == STATE_INITIALIZING || m_state == STATE_REPLAYING) {
-      // wait for replay to successfully interrupt
-      m_close_pending = true;
-      wait_for_state_transition();
-    }
-
-    if (m_state == STATE_UNINITIALIZED || m_state == STATE_RECORDING) {
-      // prevent new write ops but allow pending ops to flush to the journal
-      block_writes();
-    }
-    if (m_state == STATE_RECORDING) {
-      flush_journal();
-    }
-  } else if ((state == ImageWatcher::LOCK_UPDATE_STATE_NOT_SUPPORTED ||
-              state == ImageWatcher::LOCK_UPDATE_STATE_UNLOCKED) &&
-             m_state != STATE_UNINITIALIZED &&
-             m_state != STATE_STOPPING_RECORDING) {
-    assert(m_state == STATE_RECORDING);
-    {
-      Mutex::Locker event_locker(m_event_lock);
-      assert(m_events.empty());
-    }
-
-    int r = stop_recording();
-    if (r < 0) {
-      // TODO handle failed journal writes
-      assert(false);
-    }
-  }
-}
-
-int Journal::stop_recording() {
+void Journal::stop_recording() {  // no longer blocks; the result arrives in handle_recording_stopped()
   assert(m_lock.is_locked());
   assert(m_journaler != NULL);
 
-  transition_state(STATE_STOPPING_RECORDING);
+  assert(m_state == STATE_READY);
+  transition_state(STATE_STOPPING, 0);
 
-  C_SaferCond cond;
-  m_lock.Unlock();
-  m_journaler->stop_append(&cond);
-  int r = cond.wait();
-  m_lock.Lock();
-
-  destroy_journaler();
-  if (r < 0) {
-    lderr(m_image_ctx.cct) << "failed to flush journal: " << cpp_strerror(r)
-                           << dendl;
-    return r;
-  }
-  return 0;
+  m_journaler->stop_append(util::create_async_context_callback(  // C_StopRecording forwards the result to handle_recording_stopped()
+    m_image_ctx, new C_StopRecording(this)));
 }
 
 void Journal::block_writes() {
@@ -603,33 +684,82 @@ void Journal::unblock_writes() {
   }
 }
 
-void Journal::flush_journal() {
+void Journal::transition_state(State state, int r) {  // set m_state, latch the first error, wake waiters on a steady state
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << this << " " << __func__ << ": new state=" << state << dendl;
   assert(m_lock.is_locked());
+  m_state = state;
 
-  CephContext *cct = m_image_ctx.cct;
-  ldout(cct, 20) << this << " " << __func__ << dendl;
+  if (m_error_result == 0 && r < 0) {  // only the first negative result is remembered
+    m_error_result = r;
+  }
 
-  m_lock.Unlock();
-  C_SaferCond cond_ctx;
-  m_journaler->flush(&cond_ctx);
-  cond_ctx.wait();
-  m_lock.Lock();
+  if (is_steady_state()) {  // true only for STATE_READY and STATE_CLOSED
+    Contexts wait_for_state_contexts(std::move(m_wait_for_state_contexts));  // move the waiters into a local list before completing them
+    for (auto ctx : wait_for_state_contexts) {
+      ctx->complete(m_error_result);  // NOTE(review): runs with m_lock held -- confirm waiters never re-enter Journal
+    }
+  }
 }
 
-void Journal::transition_state(State state) {
-  CephContext *cct = m_image_ctx.cct;
-  ldout(cct, 20) << this << " " << __func__ << ": new state=" << state << dendl;
+bool Journal::is_steady_state() const {  // true when m_state is STATE_READY or STATE_CLOSED
   assert(m_lock.is_locked());
-  m_state = state;
-  m_cond.Signal();
+  switch (m_state) {
+  case STATE_READY:
+  case STATE_CLOSED:
+    return true;
+  case STATE_UNINITIALIZED:  // every remaining enumerator is listed explicitly; there is no default label
+  case STATE_INITIALIZING:
+  case STATE_REPLAYING:
+  case STATE_RESTARTING_REPLAY:
+  case STATE_STOPPING:
+  case STATE_CLOSING:
+    break;
+  }
+  return false;
 }
 
-void Journal::wait_for_state_transition() {
+void Journal::wait_for_steady_state(Context *on_state) {  // queue on_state until transition_state() reaches a steady state
   assert(m_lock.is_locked());
-  State state = m_state;
-  while (m_state == state) {
-    m_cond.Wait(m_lock);
-  }
+  assert(!is_steady_state());  // callers must check the state themselves before queueing
+
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << this << " " << __func__ << ": on_state=" << on_state
+                 << dendl;
+  m_wait_for_state_contexts.push_back(on_state);  // completed with m_error_result by transition_state()
+}
+
+std::ostream &operator<<(std::ostream &os, const Journal::State &state) {  // human-readable state name for log output
+  switch (state) {
+  case Journal::STATE_UNINITIALIZED:
+    os << "Uninitialized";
+    break;
+  case Journal::STATE_INITIALIZING:
+    os << "Initializing";
+    break;
+  case Journal::STATE_REPLAYING:
+    os << "Replaying";
+    break;
+  case Journal::STATE_RESTARTING_REPLAY:
+    os << "RestartingReplay";
+    break;
+  case Journal::STATE_READY:
+    os << "Ready";
+    break;
+  case Journal::STATE_STOPPING:
+    os << "Stopping";
+    break;
+  case Journal::STATE_CLOSING:
+    os << "Closing";
+    break;
+  case Journal::STATE_CLOSED:
+    os << "Closed";
+    break;
+  default:  // any value outside the enumerators above is printed numerically
+    os << "Unknown (" << static_cast<uint32_t>(state) << ")";
+    break;
+  }
+  return os;
 }
 
 } // namespace librbd
diff --git a/src/librbd/Journal.h b/src/librbd/Journal.h
index f6368b2..0ea9ae4 100644
--- a/src/librbd/Journal.h
+++ b/src/librbd/Journal.h
@@ -10,10 +10,8 @@
 #include "include/unordered_map.h"
 #include "include/rados/librados.hpp"
 #include "common/Mutex.h"
-#include "common/Cond.h"
 #include "journal/Future.h"
 #include "journal/ReplayHandler.h"
-#include "librbd/ImageWatcher.h"
 #include <algorithm>
 #include <list>
 #include <string>
@@ -42,26 +40,31 @@ public:
   ~Journal();
 
   static bool is_journal_supported(ImageCtx &image_ctx);
-  static int create(librados::IoCtx &io_ctx, const std::string &image_id);
+  static int create(librados::IoCtx &io_ctx, const std::string &image_id,
+		    uint8_t order, uint8_t splay_width,
+		    const std::string &object_pool);
   static int remove(librados::IoCtx &io_ctx, const std::string &image_id);
+  static int reset(librados::IoCtx &io_ctx, const std::string &image_id);
 
   bool is_journal_ready() const;
   bool is_journal_replaying() const;
 
-  bool wait_for_journal_ready();
+  void wait_for_journal_ready(Context *on_ready);
 
-  void open();
-  int close();
+  void open(Context *on_finish);
+  void close(Context *on_finish);
 
-  uint64_t append_event(AioCompletion *aio_comp,
-                        const journal::EventEntry &event_entry,
-                        const AioObjectRequests &requests,
-                        uint64_t offset, size_t length,
-                        bool flush_entry);
+  uint64_t append_io_event(AioCompletion *aio_comp,
+                           const journal::EventEntry &event_entry,
+                           const AioObjectRequests &requests,
+                           uint64_t offset, size_t length,
+                           bool flush_entry);
+  void commit_io_event(uint64_t tid, int r);
+  void commit_io_event_extent(uint64_t tid, uint64_t offset, uint64_t length,
+                              int r);
 
-  void commit_event(uint64_t tid, int r);
-  void commit_event_extent(uint64_t tid, uint64_t offset, uint64_t length,
-                           int r);
+  uint64_t append_op_event(journal::EventEntry &event_entry);
+  void commit_op_event(uint64_t tid, int r);
 
   void flush_event(uint64_t tid, Context *on_safe);
   void wait_event(uint64_t tid, Context *on_safe);
@@ -70,12 +73,42 @@ private:
   typedef std::list<Context *> Contexts;
   typedef interval_set<uint64_t> ExtentInterval;
 
+  /**
+   * @verbatim
+   *
+   * <start>
+   *    |
+   *    v
+   * UNINITIALIZED ---> INITIALIZING ---> REPLAYING ------> READY
+   *    |                 *  .  ^             *  .            |
+   *    |                 *  .  |             *  .            |
+   *    |                 *  .  |    (error)  *  . . . .      |
+   *    |                 *  .  |             *        .      |
+   *    |                 *  .  |             v        .      v
+   *    |                 *  .  |         RESTARTING   .    STOPPING
+   *    |                 *  .  |             |        .      |
+   *    |                 *  .  |             |        .      |
+   *    |       * * * * * *  .  \-------------/        .      |
+   *    |       * (error)    .                         .      |
+   *    |       *            .   . . . . . . . . . . . .      |
+   *    |       *            .   .                            |
+   *    |       v            v   v                            |
+   *    |     CLOSED <----- CLOSING <-------------------------/
+   *    |       |
+   *    |       v
+   *    \---> <finish>
+   *
+   * @endverbatim
+   */
   enum State {
     STATE_UNINITIALIZED,
     STATE_INITIALIZING,
     STATE_REPLAYING,
-    STATE_RECORDING,
-    STATE_STOPPING_RECORDING
+    STATE_RESTARTING_REPLAY,  // drawn as RESTARTING in the state diagram above
+    STATE_READY,              // replaces the former STATE_RECORDING
+    STATE_STOPPING,           // replaces the former STATE_STOPPING_RECORDING
+    STATE_CLOSING,
+    STATE_CLOSED
   };
 
   struct Event {
@@ -100,27 +133,36 @@ private:
   };
   typedef ceph::unordered_map<uint64_t, Event> Events;
 
-  struct LockListener : public ImageWatcher::Listener {
+  struct C_InitJournal : public Context {  // completion that forwards to Journal::handle_initialized()
     Journal *journal;
-    LockListener(Journal *_journal) : journal(_journal) {
+
+    C_InitJournal(Journal *_journal) : journal(_journal) {
+    }
+
+    virtual void finish(int r) {
+      journal->handle_initialized(r);
     }
+  };
+
+  struct C_StopRecording : public Context {  // completion that forwards to Journal::handle_recording_stopped()
+    Journal *journal;
 
-    virtual bool handle_requested_lock() {
-      return journal->handle_requested_lock();
+    C_StopRecording(Journal *_journal) : journal(_journal) {
     }
-    virtual void handle_lock_updated(ImageWatcher::LockUpdateState state) {
-      journal->handle_lock_updated(state);
+
+    virtual void finish(int r) {
+      journal->handle_recording_stopped(r);
     }
   };
 
-  struct C_InitJournal : public Context {
+  struct C_DestroyJournaler : public Context {  // completion that forwards to Journal::handle_journal_destroyed()
     Journal *journal;
 
-    C_InitJournal(Journal *_journal) : journal(_journal) {
+    C_DestroyJournaler(Journal *_journal) : journal(_journal) {  // now holds the Journal, not the Journaler to delete
     }
 
     virtual void finish(int r) {
-      journal->handle_initialized(r);
+      journal->handle_journal_destroyed(r);
     }
   };
 
@@ -161,10 +203,10 @@ private:
 
   ::journal::Journaler *m_journaler;
   mutable Mutex m_lock;
-  Cond m_cond;
   State m_state;
 
-  LockListener m_lock_listener;
+  int m_error_result;
+  Contexts m_wait_for_state_contexts;
 
   ReplayHandler m_replay_handler;
   bool m_close_pending;
@@ -180,7 +222,8 @@ private:
   ::journal::Future wait_event(Mutex &lock, uint64_t tid, Context *on_safe);
 
   void create_journaler();
-  void destroy_journaler();
+  void destroy_journaler(int r);
+  void recreate_journaler(int r);
 
   void complete_event(Events::iterator it, int r);
 
@@ -189,19 +232,23 @@ private:
   void handle_replay_ready();
   void handle_replay_complete(int r);
 
-  void handle_event_safe(int r, uint64_t tid);
+  void handle_recording_stopped(int r);
+
+  void handle_journal_destroyed(int r);
 
-  bool handle_requested_lock();
-  void handle_lock_updated(ImageWatcher::LockUpdateState state);
+  void handle_event_safe(int r, uint64_t tid);
 
-  int stop_recording();
+  void stop_recording();
 
   void block_writes();
   void unblock_writes();
 
-  void flush_journal();
-  void transition_state(State state);
-  void wait_for_state_transition();
+  void transition_state(State state, int r);
+
+  bool is_steady_state() const;
+  void wait_for_steady_state(Context *on_state);
+
+  friend std::ostream &operator<<(std::ostream &os, const State &state);
 };
 
 } // namespace librbd
diff --git a/src/librbd/JournalReplay.cc b/src/librbd/JournalReplay.cc
index 7daf10c..3379f11 100644
--- a/src/librbd/JournalReplay.cc
+++ b/src/librbd/JournalReplay.cc
@@ -21,7 +21,7 @@ JournalReplay::~JournalReplay() {
   assert(m_aio_completions.empty());
 }
 
-int JournalReplay::process(bufferlist::iterator it) {
+int JournalReplay::process(bufferlist::iterator it, Context *on_safe) {
   CephContext *cct = m_image_ctx.cct;
   ldout(cct, 20) << this << " " << __func__ << dendl;
 
@@ -34,7 +34,7 @@ int JournalReplay::process(bufferlist::iterator it) {
     return -EINVAL;
   }
 
-  boost::apply_visitor(EventVisitor(this), event_entry.event);
+  boost::apply_visitor(EventVisitor(this, on_safe), event_entry.event);
   return 0;
 }
 
@@ -49,43 +49,111 @@ int JournalReplay::flush() {
   return m_ret_val;
 }
 
-void JournalReplay::handle_event(const journal::AioDiscardEvent &event) {
+void JournalReplay::handle_event(const journal::AioDiscardEvent &event,
+				 Context *on_safe) {  // on_safe is completed from handle_aio_completion()
   CephContext *cct = m_image_ctx.cct;
   ldout(cct, 20) << this << " " << __func__ << ": AIO discard event" << dendl;
 
-  AioCompletion *aio_comp = create_aio_completion();
+  AioCompletion *aio_comp = create_aio_completion(on_safe);  // registers (aio_comp, on_safe) in m_aio_completions
   AioImageRequest::aio_discard(&m_image_ctx, aio_comp, event.offset,
                                event.length);
 }
 
-void JournalReplay::handle_event(const journal::AioWriteEvent &event) {
+void JournalReplay::handle_event(const journal::AioWriteEvent &event,
+				 Context *on_safe) {  // on_safe is completed from handle_aio_completion()
   CephContext *cct = m_image_ctx.cct;
   ldout(cct, 20) << this << " " << __func__ << ": AIO write event" << dendl;
 
   bufferlist data = event.data;
-  AioCompletion *aio_comp = create_aio_completion();
+  AioCompletion *aio_comp = create_aio_completion(on_safe);  // registers (aio_comp, on_safe) in m_aio_completions
   AioImageRequest::aio_write(&m_image_ctx, aio_comp, event.offset, event.length,
                              data.c_str(), 0);
 }
 
-void JournalReplay::handle_event(const journal::AioFlushEvent &event) {
+void JournalReplay::handle_event(const journal::AioFlushEvent &event,
+				 Context *on_safe) {  // on_safe is completed from handle_aio_completion()
   CephContext *cct = m_image_ctx.cct;
   ldout(cct, 20) << this << " " << __func__ << ": AIO flush event" << dendl;
 
-  AioCompletion *aio_comp = create_aio_completion();
+  AioCompletion *aio_comp = create_aio_completion(on_safe);  // registers (aio_comp, on_safe) in m_aio_completions
   AioImageRequest::aio_flush(&m_image_ctx, aio_comp);
 }
 
-void JournalReplay::handle_event(const journal::UnknownEvent &event) {
+  void JournalReplay::handle_event(const journal::OpFinishEvent &event,
+				   Context *on_safe) {  // log-only handler: nothing is replayed for this event type
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << this << " " << __func__ << ": Op finish event" << dendl;
+}  // NOTE(review): on_safe is neither completed nor stored in this and the following log-only handlers -- confirm intended
+
+void JournalReplay::handle_event(const journal::SnapCreateEvent &event,
+				 Context *on_safe) {  // log-only handler
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << this << " " << __func__ << ": Snap create event" << dendl;
+}
+
+void JournalReplay::handle_event(const journal::SnapRemoveEvent &event,
+				 Context *on_safe) {  // log-only handler
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << this << " " << __func__ << ": Snap remove event" << dendl;
+}
+
+void JournalReplay::handle_event(const journal::SnapRenameEvent &event,
+				 Context *on_safe) {  // log-only handler
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << this << " " << __func__ << ": Snap rename event" << dendl;
+}
+
+void JournalReplay::handle_event(const journal::SnapProtectEvent &event,
+				 Context *on_safe) {  // log-only handler
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << this << " " << __func__ << ": Snap protect event" << dendl;
+}
+
+void JournalReplay::handle_event(const journal::SnapUnprotectEvent &event,
+				 Context *on_safe) {  // log-only handler
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << this << " " << __func__ << ": Snap unprotect event"
+                 << dendl;
+}
+
+void JournalReplay::handle_event(const journal::SnapRollbackEvent &event,
+				 Context *on_safe) {  // log-only handler
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << this << " " << __func__ << ": Snap rollback start event"
+                 << dendl;
+}
+
+void JournalReplay::handle_event(const journal::RenameEvent &event,
+				 Context *on_safe) {  // log-only handler
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << this << " " << __func__ << ": Rename event" << dendl;
+}
+
+void JournalReplay::handle_event(const journal::ResizeEvent &event,
+				 Context *on_safe) {  // log-only handler
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << this << " " << __func__ << ": Resize start event" << dendl;
+}
+
+void JournalReplay::handle_event(const journal::FlattenEvent &event,
+				 Context *on_safe) {  // log-only handler
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << this << " " << __func__ << ": Flatten start event" << dendl;
+}
+
+void JournalReplay::handle_event(const journal::UnknownEvent &event,
+				 Context *on_safe) {  // unknown entries are skipped and acknowledged with result 0
   CephContext *cct = m_image_ctx.cct;
   ldout(cct, 20) << this << " " << __func__ << ": unknown event" << dendl;
+  if (on_safe != NULL) on_safe->complete(0);  // process() declares on_safe with a NULL default, so guard the call
 }
 
-AioCompletion *JournalReplay::create_aio_completion() {
+AioCompletion *JournalReplay::create_aio_completion(Context *on_safe) {  // returns a completion tracked together with on_safe
   Mutex::Locker locker(m_lock);
-  AioCompletion *aio_comp = aio_create_completion_internal(
-    this, &aio_completion_callback);
-  m_aio_completions.insert(aio_comp);
+  AioCompletion *aio_comp = AioCompletion::create(this, aio_completion_callback,
+                                                  nullptr);
+  m_aio_completions.insert(std::pair<AioCompletion*,Context*>(  // handle_aio_completion() looks on_safe up by aio_comp
+			     aio_comp, on_safe));
   return aio_comp;
 }
 
@@ -101,12 +169,16 @@ void JournalReplay::handle_aio_completion(AioCompletion *aio_comp) {
   ldout(cct, 20) << this << " " << __func__ << ": aio_comp=" << aio_comp << ", "
                  << "r=" << r << dendl;
 
+  if (Context *on_safe = it->second)  // process() declares on_safe with a NULL default, so it may be absent
+    on_safe->complete(r);
+
   if (r < 0 && m_ret_val == 0) {
     m_ret_val = r;
   }
 
   m_aio_completions.erase(it);
-  m_cond.Signal();
+  if (m_aio_completions.empty())  // signal only once the last outstanding completion has finished
+    m_cond.Signal();
 }
 
 void JournalReplay::aio_completion_callback(completion_t cb, void *arg) {
diff --git a/src/librbd/JournalReplay.h b/src/librbd/JournalReplay.h
index 7b85713..6b5711a 100644
--- a/src/librbd/JournalReplay.h
+++ b/src/librbd/JournalReplay.h
@@ -5,13 +5,13 @@
 #define CEPH_LIBRBD_JOURNAL_REPLAY_H
 
 #include "include/int_types.h"
-#include "include/buffer.h"
+#include "include/buffer_fwd.h"
 #include "include/rbd/librbd.hpp"
 #include "common/Cond.h"
 #include "common/Mutex.h"
 #include "librbd/JournalTypes.h"
 #include <boost/variant.hpp>
-#include <set>
+#include <map>
 
 namespace librbd {
 
@@ -23,22 +23,23 @@ public:
   JournalReplay(ImageCtx &image_ctx);
   ~JournalReplay();
 
-  int process(bufferlist::iterator it);
+  int process(bufferlist::iterator it, Context *on_safe = NULL);
   int flush();
 
 private:
-  typedef std::set<AioCompletion *> AioCompletions;
+  typedef std::map<AioCompletion*,Context*> AioCompletions;
 
   struct EventVisitor : public boost::static_visitor<void> {
     JournalReplay *journal_replay;
+    Context *on_safe;
 
-    EventVisitor(JournalReplay *_journal_replay)
-      : journal_replay(_journal_replay) {
+    EventVisitor(JournalReplay *_journal_replay, Context *_on_safe)
+      : journal_replay(_journal_replay), on_safe(_on_safe) {
     }
 
     template <typename Event>
     inline void operator()(const Event &event) const {
-      journal_replay->handle_event(event);
+      journal_replay->handle_event(event, on_safe);
     }
   };
 
@@ -50,12 +51,22 @@ private:
   AioCompletions m_aio_completions;
   int m_ret_val;
 
-  void handle_event(const journal::AioDiscardEvent &event);
-  void handle_event(const journal::AioWriteEvent &event);
-  void handle_event(const journal::AioFlushEvent &event);
-  void handle_event(const journal::UnknownEvent &event);
-
-  AioCompletion *create_aio_completion();
+  void handle_event(const journal::AioDiscardEvent &event, Context *on_safe);
+  void handle_event(const journal::AioWriteEvent &event, Context *on_safe);
+  void handle_event(const journal::AioFlushEvent &event, Context *on_safe);
+  void handle_event(const journal::OpFinishEvent &event, Context *on_safe);
+  void handle_event(const journal::SnapCreateEvent &event, Context *on_safe);
+  void handle_event(const journal::SnapRemoveEvent &event, Context *on_safe);
+  void handle_event(const journal::SnapRenameEvent &event, Context *on_safe);
+  void handle_event(const journal::SnapProtectEvent &event, Context *on_safe);
+  void handle_event(const journal::SnapUnprotectEvent &event, Context *on_safe);
+  void handle_event(const journal::SnapRollbackEvent &event, Context *on_safe);
+  void handle_event(const journal::RenameEvent &event, Context *on_safe);
+  void handle_event(const journal::ResizeEvent &event, Context *on_safe);
+  void handle_event(const journal::FlattenEvent &event, Context *on_safe);
+  void handle_event(const journal::UnknownEvent &event, Context *on_safe);
+
+  AioCompletion *create_aio_completion(Context *on_safe);
   void handle_aio_completion(AioCompletion *aio_comp);
 
   static void aio_completion_callback(completion_t cb, void *arg);
diff --git a/src/librbd/JournalTypes.cc b/src/librbd/JournalTypes.cc
index 4dcd2f5..cda35aa 100644
--- a/src/librbd/JournalTypes.cc
+++ b/src/librbd/JournalTypes.cc
@@ -105,6 +105,79 @@ void AioFlushEvent::decode(__u8 version, bufferlist::iterator& it) {
 void AioFlushEvent::dump(Formatter *f) const {
 }
 
+void OpEventBase::encode(bufferlist& bl) const {  // the encoded payload is just the op tid
+  ::encode(tid, bl);
+}
+
+void OpEventBase::decode(__u8 version, bufferlist::iterator& it) {  // version is accepted but not consulted here
+  ::decode(tid, it);
+}
+
+void OpEventBase::dump(Formatter *f) const {
+  f->dump_unsigned("tid", tid);
+}
+
+void SnapEventBase::encode(bufferlist& bl) const {  // base fields first, then snap_name
+  OpStartEventBase::encode(bl);
+  ::encode(snap_name, bl);
+}
+
+void SnapEventBase::decode(__u8 version, bufferlist::iterator& it) {  // reads fields in the same order encode() writes them
+  OpStartEventBase::decode(version, it);
+  ::decode(snap_name, it);
+}
+
+void SnapEventBase::dump(Formatter *f) const {
+  OpStartEventBase::dump(f);
+  f->dump_string("snap_name", snap_name);
+}
+
+void SnapRenameEvent::encode(bufferlist& bl) const {  // SnapEventBase fields first, then snap_id
+  SnapEventBase::encode(bl);
+  ::encode(snap_id, bl);
+}
+
+void SnapRenameEvent::decode(__u8 version, bufferlist::iterator& it) {  // reads fields in the same order encode() writes them
+  SnapEventBase::decode(version, it);
+  ::decode(snap_id, it);
+}
+
+void SnapRenameEvent::dump(Formatter *f) const {
+  OpStartEventBase::dump(f);  // skips SnapEventBase::dump() so snap_name is emitted under "dest_snap_name" instead of "snap_name"
+  f->dump_unsigned("src_snap_id", snap_id);
+  f->dump_string("dest_snap_name", snap_name);
+}
+
+void RenameEvent::encode(bufferlist& bl) const {  // base fields first, then image_name
+  OpStartEventBase::encode(bl);
+  ::encode(image_name, bl);
+}
+
+void RenameEvent::decode(__u8 version, bufferlist::iterator& it) {  // reads fields in the same order encode() writes them
+  OpStartEventBase::decode(version, it);
+  ::decode(image_name, it);
+}
+
+void RenameEvent::dump(Formatter *f) const {
+  OpStartEventBase::dump(f);
+  f->dump_string("image_name", image_name);
+}
+
+void ResizeEvent::encode(bufferlist& bl) const {  // base fields first, then size
+  OpStartEventBase::encode(bl);
+  ::encode(size, bl);
+}
+
+void ResizeEvent::decode(__u8 version, bufferlist::iterator& it) {  // reads fields in the same order encode() writes them
+  OpStartEventBase::decode(version, it);
+  ::decode(size, it);
+}
+
+void ResizeEvent::dump(Formatter *f) const {
+  OpStartEventBase::dump(f);
+  f->dump_unsigned("size", size);
+}
+
 void UnknownEvent::encode(bufferlist& bl) const {
   assert(false);
 }
@@ -142,6 +215,36 @@ void EventEntry::decode(bufferlist::iterator& it) {
   case EVENT_TYPE_AIO_FLUSH:
     event = AioFlushEvent();
     break;
+  case EVENT_TYPE_OP_FINISH:
+    event = OpFinishEvent();
+    break;
+  case EVENT_TYPE_SNAP_CREATE:
+    event = SnapCreateEvent();
+    break;
+  case EVENT_TYPE_SNAP_REMOVE:
+    event = SnapRemoveEvent();
+    break;
+  case EVENT_TYPE_SNAP_RENAME:
+    event = SnapRenameEvent();
+    break;
+  case EVENT_TYPE_SNAP_PROTECT:
+    event = SnapProtectEvent();
+    break;
+  case EVENT_TYPE_SNAP_UNPROTECT:
+    event = SnapUnprotectEvent();
+    break;
+  case EVENT_TYPE_SNAP_ROLLBACK:
+    event = SnapRollbackEvent();
+    break;
+  case EVENT_TYPE_RENAME:
+    event = RenameEvent();
+    break;
+  case EVENT_TYPE_RESIZE:
+    event = ResizeEvent();
+    break;
+  case EVENT_TYPE_FLATTEN:
+    event = FlattenEvent();
+    break;
   default:
     event = UnknownEvent();
     break;
@@ -165,6 +268,34 @@ void EventEntry::generate_test_instances(std::list<EventEntry *> &o) {
   o.push_back(new EventEntry(AioWriteEvent(123, 456, bl)));
 
   o.push_back(new EventEntry(AioFlushEvent()));
+
+  o.push_back(new EventEntry(OpFinishEvent(123, -1)));
+
+  o.push_back(new EventEntry(SnapCreateEvent()));
+  o.push_back(new EventEntry(SnapCreateEvent(234, "snap")));
+
+  o.push_back(new EventEntry(SnapRemoveEvent()));
+  o.push_back(new EventEntry(SnapRemoveEvent(345, "snap")));
+
+  o.push_back(new EventEntry(SnapRenameEvent()));
+  o.push_back(new EventEntry(SnapRenameEvent(345, 1, "snap")));
+
+  o.push_back(new EventEntry(SnapProtectEvent()));
+  o.push_back(new EventEntry(SnapProtectEvent(456, "snap")));
+
+  o.push_back(new EventEntry(SnapUnprotectEvent()));
+  o.push_back(new EventEntry(SnapUnprotectEvent(567, "snap")));
+
+  o.push_back(new EventEntry(SnapRollbackEvent()));
+  o.push_back(new EventEntry(SnapRollbackEvent(678, "snap")));
+
+  o.push_back(new EventEntry(RenameEvent()));
+  o.push_back(new EventEntry(RenameEvent(789, "image name")));
+
+  o.push_back(new EventEntry(ResizeEvent()));
+  o.push_back(new EventEntry(ResizeEvent(890, 1234)));
+
+  o.push_back(new EventEntry(FlattenEvent(901)));
 }
 
 } // namespace journal
@@ -184,6 +315,36 @@ std::ostream &operator<<(std::ostream &out,
   case EVENT_TYPE_AIO_FLUSH:
     out << "AioFlush";
     break;
+  case EVENT_TYPE_OP_FINISH:
+    out << "OpFinish";
+    break;
+  case EVENT_TYPE_SNAP_CREATE:
+    out << "SnapCreate";
+    break;
+  case EVENT_TYPE_SNAP_REMOVE:
+    out << "SnapRemove";
+    break;
+  case EVENT_TYPE_SNAP_RENAME:
+    out << "SnapRename";
+    break;
+  case EVENT_TYPE_SNAP_PROTECT:
+    out << "SnapProtect";
+    break;
+  case EVENT_TYPE_SNAP_UNPROTECT:
+    out << "SnapUnprotect";
+    break;
+  case EVENT_TYPE_SNAP_ROLLBACK:
+    out << "SnapRollback";
+    break;
+  case EVENT_TYPE_RENAME:
+    out << "Rename";
+    break;
+  case EVENT_TYPE_RESIZE:
+    out << "Resize";
+    break;
+  case EVENT_TYPE_FLATTEN:
+    out << "Flatten";
+    break;
   default:
     out << "Unknown (" << static_cast<uint32_t>(type) << ")";
     break;
diff --git a/src/librbd/JournalTypes.h b/src/librbd/JournalTypes.h
index 59bd13f..ce8ff03 100644
--- a/src/librbd/JournalTypes.h
+++ b/src/librbd/JournalTypes.h
@@ -7,6 +7,7 @@
 #include "include/int_types.h"
 #include "include/buffer.h"
 #include "include/encoding.h"
+#include "include/types.h"
 #include <iosfwd>
 #include <boost/variant.hpp>
 
@@ -18,9 +19,19 @@ namespace librbd {
 namespace journal {
 
 enum EventType {
-  EVENT_TYPE_AIO_DISCARD = 0,
-  EVENT_TYPE_AIO_WRITE   = 1,
-  EVENT_TYPE_AIO_FLUSH   = 2
+  EVENT_TYPE_AIO_DISCARD    = 0,
+  EVENT_TYPE_AIO_WRITE      = 1,
+  EVENT_TYPE_AIO_FLUSH      = 2,
+  EVENT_TYPE_OP_FINISH      = 3,
+  EVENT_TYPE_SNAP_CREATE    = 4,
+  EVENT_TYPE_SNAP_REMOVE    = 5,
+  EVENT_TYPE_SNAP_RENAME    = 6,
+  EVENT_TYPE_SNAP_PROTECT   = 7,
+  EVENT_TYPE_SNAP_UNPROTECT = 8,
+  EVENT_TYPE_SNAP_ROLLBACK  = 9,
+  EVENT_TYPE_RENAME         = 10,
+  EVENT_TYPE_RESIZE         = 11,
+  EVENT_TYPE_FLATTEN        = 12
 };
 
 struct AioDiscardEvent {
@@ -58,16 +69,172 @@ struct AioWriteEvent {
   void dump(Formatter *f) const;
 };
 
-struct UnknownEvent {
-  static const EventType EVENT_TYPE = static_cast<EventType>(-1);
+struct AioFlushEvent {
+  static const EventType EVENT_TYPE = EVENT_TYPE_AIO_FLUSH;
 
   void encode(bufferlist& bl) const;
   void decode(__u8 version, bufferlist::iterator& it);
   void dump(Formatter *f) const;
 };
 
-struct AioFlushEvent {
-  static const EventType EVENT_TYPE = EVENT_TYPE_AIO_FLUSH;
+struct OpEventBase {
+  uint64_t tid;
+
+  virtual void encode(bufferlist& bl) const;
+  virtual void decode(__u8 version, bufferlist::iterator& it);
+  virtual void dump(Formatter *f) const;
+
+protected:
+  OpEventBase() : tid(0) {
+  }
+  OpEventBase(uint64_t _tid) : tid(_tid) {
+  }
+  virtual ~OpEventBase() {}
+};
+
+struct OpStartEventBase : public OpEventBase {
+protected:
+  OpStartEventBase() {
+  }
+  OpStartEventBase(uint64_t tid) : OpEventBase(tid) {
+  }
+};
+
+struct OpFinishEvent : public OpEventBase {
+  static const EventType EVENT_TYPE = EVENT_TYPE_OP_FINISH;
+
+  int r;
+
+  OpFinishEvent() : r(0) {
+  }
+  OpFinishEvent(uint64_t tid, int _r) : OpEventBase(tid), r(_r) {
+  }
+};
+
+struct SnapEventBase : public OpStartEventBase {
+  std::string snap_name;
+
+  SnapEventBase() {
+  }
+  SnapEventBase(uint64_t tid, const std::string &_snap_name)
+    : OpStartEventBase(tid), snap_name(_snap_name) {
+  }
+
+  virtual void encode(bufferlist& bl) const;
+  virtual void decode(__u8 version, bufferlist::iterator& it);
+  virtual void dump(Formatter *f) const;
+};
+
+struct SnapCreateEvent : public SnapEventBase {
+  static const EventType EVENT_TYPE = EVENT_TYPE_SNAP_CREATE;
+
+  SnapCreateEvent() {
+  }
+  SnapCreateEvent(uint64_t tid, const std::string &snap_name)
+    : SnapEventBase(tid, snap_name) {
+  }
+};
+
+struct SnapRemoveEvent : public SnapEventBase {
+  static const EventType EVENT_TYPE = EVENT_TYPE_SNAP_REMOVE;
+
+  SnapRemoveEvent() {
+  }
+  SnapRemoveEvent(uint64_t tid, const std::string &snap_name)
+    : SnapEventBase(tid, snap_name) {
+  }
+};
+
+struct SnapRenameEvent : public SnapEventBase {
+  static const EventType EVENT_TYPE = EVENT_TYPE_SNAP_RENAME;
+
+  uint64_t snap_id;
+
+  SnapRenameEvent() : snap_id(CEPH_NOSNAP) {
+  }
+  SnapRenameEvent(uint64_t tid, uint64_t src_snap_id,
+                  const std::string &dest_snap_name)
+    : SnapEventBase(tid, dest_snap_name), snap_id(src_snap_id) {
+  }
+
+  virtual void encode(bufferlist& bl) const;
+  virtual void decode(__u8 version, bufferlist::iterator& it);
+  virtual void dump(Formatter *f) const;
+};
+
+struct SnapProtectEvent : public SnapEventBase {
+  static const EventType EVENT_TYPE = EVENT_TYPE_SNAP_PROTECT;
+
+  SnapProtectEvent() {
+  }
+  SnapProtectEvent(uint64_t tid, const std::string &snap_name)
+    : SnapEventBase(tid, snap_name) {
+  }
+};
+
+struct SnapUnprotectEvent : public SnapEventBase {
+  static const EventType EVENT_TYPE = EVENT_TYPE_SNAP_UNPROTECT;
+
+  SnapUnprotectEvent() {
+  }
+  SnapUnprotectEvent(uint64_t tid, const std::string &snap_name)
+    : SnapEventBase(tid, snap_name) {
+  }
+};
+
+struct SnapRollbackEvent : public SnapEventBase {
+  static const EventType EVENT_TYPE = EVENT_TYPE_SNAP_ROLLBACK;
+
+  SnapRollbackEvent() {
+  }
+  SnapRollbackEvent(uint64_t tid, const std::string &snap_name)
+    : SnapEventBase(tid, snap_name) {
+  }
+};
+
+struct RenameEvent : public OpStartEventBase {
+  static const EventType EVENT_TYPE = EVENT_TYPE_RENAME;
+
+  std::string image_name;
+
+  RenameEvent() {
+  }
+  RenameEvent(uint64_t tid, const std::string &_image_name)
+    : OpStartEventBase(tid), image_name(_image_name) {
+  }
+
+  virtual void encode(bufferlist& bl) const;
+  virtual void decode(__u8 version, bufferlist::iterator& it);
+  virtual void dump(Formatter *f) const;
+};
+
+struct ResizeEvent : public OpStartEventBase {
+  static const EventType EVENT_TYPE = EVENT_TYPE_RESIZE;
+
+  uint64_t size;
+
+  ResizeEvent() : size(0) {
+  }
+  ResizeEvent(uint64_t tid, uint64_t _size)
+    : OpStartEventBase(tid), size(_size) {
+  }
+
+  virtual void encode(bufferlist& bl) const;
+  virtual void decode(__u8 version, bufferlist::iterator& it);
+  virtual void dump(Formatter *f) const;
+};
+
+struct FlattenEvent : public OpStartEventBase {
+  static const EventType EVENT_TYPE = EVENT_TYPE_FLATTEN;
+
+  FlattenEvent() {
+  }
+  FlattenEvent(uint64_t tid) : OpStartEventBase(tid) {
+  }
+};
+
+struct UnknownEvent {
+  static const EventType EVENT_TYPE = static_cast<EventType>(-1);
 
   void encode(bufferlist& bl) const;
   void decode(__u8 version, bufferlist::iterator& it);
@@ -77,6 +244,16 @@ struct AioFlushEvent {
 typedef boost::variant<AioDiscardEvent,
                        AioWriteEvent,
                        AioFlushEvent,
+                       OpFinishEvent,
+                       SnapCreateEvent,
+                       SnapRemoveEvent,
+                       SnapRenameEvent,
+                       SnapProtectEvent,
+                       SnapUnprotectEvent,
+                       SnapRollbackEvent,
+                       RenameEvent,
+                       ResizeEvent,
+                       FlattenEvent,
                        UnknownEvent> Event;
 
 struct EventEntry {
diff --git a/src/librbd/LibrbdWriteback.cc b/src/librbd/LibrbdWriteback.cc
index 1e68214..763860b 100644
--- a/src/librbd/LibrbdWriteback.cc
+++ b/src/librbd/LibrbdWriteback.cc
@@ -5,19 +5,21 @@
 
 #include "common/ceph_context.h"
 #include "common/dout.h"
-#include "common/Finisher.h"
 #include "common/Mutex.h"
+#include "common/WorkQueue.h"
 #include "include/Context.h"
 #include "include/rados/librados.hpp"
 #include "include/rbd/librbd.hpp"
 
 #include "librbd/AioObjectRequest.h"
+#include "librbd/ExclusiveLock.h"
 #include "librbd/ImageCtx.h"
 #include "librbd/internal.h"
 #include "librbd/LibrbdWriteback.h"
 #include "librbd/AioCompletion.h"
 #include "librbd/ObjectMap.h"
 #include "librbd/Journal.h"
+#include "librbd/Utils.h"
 
 #include "include/assert.h"
 
@@ -120,7 +122,7 @@ namespace librbd {
 
     virtual void complete(int r) {
       if (request_sent || r < 0) {
-        commit_event_extent(r);
+        commit_io_event_extent(r);
         req_comp->complete(r);
         delete this;
       } else {
@@ -131,7 +133,7 @@ namespace librbd {
     virtual void finish(int r) {
     }
 
-    void commit_event_extent(int r) {
+    void commit_io_event_extent(int r) {
       CephContext *cct = image_ctx->cct;
       ldout(cct, 20) << this << " C_WriteJournalCommit: "
                      << "write committed: updating journal commit position"
@@ -145,8 +147,8 @@ namespace librbd {
                               bl.length(), file_extents);
       for (Extents::iterator it = file_extents.begin();
            it != file_extents.end(); ++it) {
-        image_ctx->journal->commit_event_extent(journal_tid, it->first,
-                                                it->second, r);
+        image_ctx->journal->commit_io_event_extent(journal_tid, it->first,
+                                                   it->second, r);
       }
     }
 
@@ -156,7 +158,7 @@ namespace librbd {
                      << "journal committed: sending write request" << dendl;
 
       RWLock::RLocker owner_locker(image_ctx->owner_lock);
-      assert(image_ctx->image_watcher->is_lock_owner());
+      assert(image_ctx->exclusive_lock->is_lock_owner());
 
       request_sent = true;
       AioObjectWrite *req = new AioObjectWrite(image_ctx, oid, object_no, off,
@@ -166,14 +168,7 @@ namespace librbd {
   };
 
   LibrbdWriteback::LibrbdWriteback(ImageCtx *ictx, Mutex& lock)
-    : m_finisher(new Finisher(ictx->cct)), m_tid(0), m_lock(lock), m_ictx(ictx)
-  {
-    m_finisher->start();
-  }
-
-  LibrbdWriteback::~LibrbdWriteback() {
-    m_finisher->stop();
-    delete m_finisher;
+    : m_tid(0), m_lock(lock), m_ictx(ictx) {
   }
 
   void LibrbdWriteback::read(const object_t& oid, uint64_t object_no,
@@ -187,18 +182,21 @@ namespace librbd {
                                      &m_lock);
 
     {
-      if (!m_ictx->object_map.object_may_exist(object_no)) {
-	m_finisher->queue(req, -ENOENT);
+      RWLock::RLocker snap_locker(m_ictx->snap_lock);
+      if (m_ictx->object_map != nullptr &&
+          !m_ictx->object_map->object_may_exist(object_no)) {
+        m_ictx->op_work_queue->queue(req, -ENOENT);
 	return;
       }
     }
 
-    librados::AioCompletion *rados_completion =
-      librados::Rados::aio_create_completion(req, context_cb, NULL);
     librados::ObjectReadOperation op;
     op.read(off, len, pbl, NULL);
     op.set_op_flags2(op_flags);
     int flags = m_ictx->get_read_flags(snapid);
+
+    librados::AioCompletion *rados_completion =
+      util::create_rados_ack_callback(req);
     int r = m_ictx->data_ctx.aio_operate(oid.name, rados_completion, &op,
 					 flags, NULL);
     rados_completion->release();
@@ -275,8 +273,8 @@ namespace librbd {
                             len, file_extents);
     for (Extents::iterator it = file_extents.begin();
          it != file_extents.end(); ++it) {
-      m_ictx->journal->commit_event_extent(journal_tid, it->first, it->second,
-                                           0);
+      m_ictx->journal->commit_io_event_extent(journal_tid, it->first,
+                                              it->second, 0);
     }
   }
 
diff --git a/src/librbd/LibrbdWriteback.h b/src/librbd/LibrbdWriteback.h
index b7574ae..11b46cf 100644
--- a/src/librbd/LibrbdWriteback.h
+++ b/src/librbd/LibrbdWriteback.h
@@ -11,7 +11,6 @@
 #include "osd/osd_types.h"
 #include "osdc/WritebackHandler.h"
 
-class Finisher;
 class Mutex;
 
 namespace librbd {
@@ -21,7 +20,6 @@ namespace librbd {
   class LibrbdWriteback : public WritebackHandler {
   public:
     LibrbdWriteback(ImageCtx *ictx, Mutex& lock);
-    virtual ~LibrbdWriteback();
 
     // Note that oloc, trunc_size, and trunc_seq are ignored
     virtual void read(const object_t& oid, uint64_t object_no,
@@ -61,7 +59,6 @@ namespace librbd {
   private:
     void complete_writes(const std::string& oid);
 
-    Finisher *m_finisher;
     ceph_tid_t m_tid;
     Mutex& m_lock;
     librbd::ImageCtx *m_ictx;
diff --git a/src/librbd/Makefile.am b/src/librbd/Makefile.am
index 72268a7..153b69e 100644
--- a/src/librbd/Makefile.am
+++ b/src/librbd/Makefile.am
@@ -12,15 +12,14 @@ librbd_internal_la_SOURCES = \
 	librbd/AioImageRequest.cc \
 	librbd/AioImageRequestWQ.cc \
 	librbd/AioObjectRequest.cc \
-	librbd/AsyncFlattenRequest.cc \
 	librbd/AsyncObjectThrottle.cc \
 	librbd/AsyncOperation.cc \
 	librbd/AsyncRequest.cc \
-	librbd/AsyncResizeRequest.cc \
-	librbd/AsyncTrimRequest.cc \
 	librbd/CopyupRequest.cc \
 	librbd/DiffIterate.cc \
+	librbd/ExclusiveLock.cc \
 	librbd/ImageCtx.cc \
+	librbd/ImageState.cc \
 	librbd/ImageWatcher.cc \
 	librbd/internal.cc \
 	librbd/Journal.cc \
@@ -28,7 +27,36 @@ librbd_internal_la_SOURCES = \
 	librbd/LibrbdAdminSocketHook.cc \
 	librbd/LibrbdWriteback.cc \
 	librbd/ObjectMap.cc \
-	librbd/RebuildObjectMapRequest.cc
+	librbd/Utils.cc \
+	librbd/exclusive_lock/AcquireRequest.cc \
+	librbd/exclusive_lock/ReleaseRequest.cc \
+	librbd/image/CloseRequest.cc \
+	librbd/image/OpenRequest.cc \
+	librbd/image/RefreshParentRequest.cc \
+	librbd/image/RefreshRequest.cc \
+	librbd/image/SetSnapRequest.cc \
+	librbd/object_map/InvalidateRequest.cc \
+	librbd/object_map/LockRequest.cc \
+	librbd/object_map/Request.cc \
+	librbd/object_map/RefreshRequest.cc \
+	librbd/object_map/ResizeRequest.cc \
+	librbd/object_map/SnapshotCreateRequest.cc \
+	librbd/object_map/SnapshotRemoveRequest.cc \
+	librbd/object_map/SnapshotRollbackRequest.cc \
+	librbd/object_map/UnlockRequest.cc \
+	librbd/object_map/UpdateRequest.cc \
+	librbd/operation/FlattenRequest.cc \
+	librbd/operation/RebuildObjectMapRequest.cc \
+	librbd/operation/RenameRequest.cc \
+	librbd/operation/Request.cc \
+	librbd/operation/ResizeRequest.cc \
+	librbd/operation/SnapshotCreateRequest.cc \
+	librbd/operation/SnapshotProtectRequest.cc \
+	librbd/operation/SnapshotRemoveRequest.cc \
+	librbd/operation/SnapshotRenameRequest.cc \
+	librbd/operation/SnapshotRollbackRequest.cc \
+	librbd/operation/SnapshotUnprotectRequest.cc \
+	librbd/operation/TrimRequest.cc
 noinst_LTLIBRARIES += librbd_internal.la
 
 librbd_api_la_SOURCES = \
@@ -58,15 +86,14 @@ noinst_HEADERS += \
 	librbd/AioImageRequest.h \
 	librbd/AioImageRequestWQ.h \
 	librbd/AioObjectRequest.h \
-	librbd/AsyncFlattenRequest.h \
 	librbd/AsyncObjectThrottle.h \
 	librbd/AsyncOperation.h \
 	librbd/AsyncRequest.h \
-	librbd/AsyncResizeRequest.h \
-	librbd/AsyncTrimRequest.h \
 	librbd/CopyupRequest.h \
 	librbd/DiffIterate.h \
+	librbd/ExclusiveLock.h \
 	librbd/ImageCtx.h \
+	librbd/ImageState.h \
 	librbd/ImageWatcher.h \
 	librbd/internal.h \
 	librbd/Journal.h \
@@ -76,10 +103,39 @@ noinst_HEADERS += \
 	librbd/LibrbdWriteback.h \
 	librbd/ObjectMap.h \
 	librbd/parent_types.h \
-	librbd/RebuildObjectMapRequest.h \
 	librbd/SnapInfo.h \
 	librbd/TaskFinisher.h \
-	librbd/WatchNotifyTypes.h
+	librbd/Utils.h \
+	librbd/WatchNotifyTypes.h \
+	librbd/exclusive_lock/AcquireRequest.h \
+	librbd/exclusive_lock/ReleaseRequest.h \
+	librbd/image/CloseRequest.h \
+	librbd/image/OpenRequest.h \
+	librbd/image/RefreshParentRequest.h \
+	librbd/image/RefreshRequest.h \
+	librbd/image/SetSnapRequest.h \
+	librbd/object_map/InvalidateRequest.h \
+	librbd/object_map/LockRequest.h \
+	librbd/object_map/Request.h \
+	librbd/object_map/RefreshRequest.h \
+	librbd/object_map/ResizeRequest.h \
+	librbd/object_map/SnapshotCreateRequest.h \
+	librbd/object_map/SnapshotRemoveRequest.h \
+	librbd/object_map/SnapshotRollbackRequest.h \
+	librbd/object_map/UnlockRequest.h \
+	librbd/object_map/UpdateRequest.h \
+	librbd/operation/FlattenRequest.h \
+	librbd/operation/RebuildObjectMapRequest.h \
+	librbd/operation/RenameRequest.h \
+	librbd/operation/Request.h \
+	librbd/operation/ResizeRequest.h \
+	librbd/operation/SnapshotCreateRequest.h \
+	librbd/operation/SnapshotProtectRequest.h \
+	librbd/operation/SnapshotRemoveRequest.h \
+	librbd/operation/SnapshotRenameRequest.h \
+	librbd/operation/SnapshotRollbackRequest.h \
+	librbd/operation/SnapshotUnprotectRequest.h \
+	librbd/operation/TrimRequest.h
 
 endif # WITH_RBD
 endif # WITH_RADOS
diff --git a/src/librbd/ObjectMap.cc b/src/librbd/ObjectMap.cc
index e1f2e17..4f3f2f4 100644
--- a/src/librbd/ObjectMap.cc
+++ b/src/librbd/ObjectMap.cc
@@ -1,9 +1,20 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab
 #include "librbd/ObjectMap.h"
+#include "librbd/ExclusiveLock.h"
 #include "librbd/ImageCtx.h"
 #include "librbd/ImageWatcher.h"
 #include "librbd/internal.h"
+#include "librbd/object_map/InvalidateRequest.h"
+#include "librbd/object_map/LockRequest.h"
+#include "librbd/object_map/RefreshRequest.h"
+#include "librbd/object_map/ResizeRequest.h"
+#include "librbd/object_map/SnapshotCreateRequest.h"
+#include "librbd/object_map/SnapshotRemoveRequest.h"
+#include "librbd/object_map/SnapshotRollbackRequest.h"
+#include "librbd/object_map/UnlockRequest.h"
+#include "librbd/object_map/UpdateRequest.h"
+#include "librbd/Utils.h"
 #include "common/dout.h"
 #include "common/errno.h"
 #include "include/stringify.h"
@@ -16,8 +27,8 @@
 
 namespace librbd {
 
-ObjectMap::ObjectMap(ImageCtx &image_ctx)
-  : m_image_ctx(image_ctx), m_snap_id(CEPH_NOSNAP), m_enabled(false)
+ObjectMap::ObjectMap(ImageCtx &image_ctx, uint64_t snap_id)
+  : m_image_ctx(image_ctx), m_snap_id(snap_id)
 {
 }
 
@@ -51,108 +62,19 @@ uint8_t ObjectMap::operator[](uint64_t object_no) const
   return m_object_map[object_no];
 }
 
-bool ObjectMap::enabled() const
-{
-  RWLock::RLocker l(m_image_ctx.object_map_lock);
-  return m_enabled;
-}
-
-int ObjectMap::lock()
-{
-  if (!m_image_ctx.test_features(RBD_FEATURE_OBJECT_MAP)) {
-    return 0;
-  }
-
-  {
-    RWLock::RLocker l(m_image_ctx.object_map_lock);
-    if (!m_enabled) {
-      return 0;
-    }
-  }
-
-  bool broke_lock = false;
-  CephContext *cct = m_image_ctx.cct;
-  std::string oid(object_map_name(m_image_ctx.id, CEPH_NOSNAP));
-  while (true) {
-    int r;
-    ldout(cct, 10) << &m_image_ctx << " locking object map" << dendl;
-    r = rados::cls::lock::lock(&m_image_ctx.md_ctx, oid,
-			       RBD_LOCK_NAME, LOCK_EXCLUSIVE, "", "", "",
-			       utime_t(), 0);
-    if (r == 0) {
-      break;
-    } else if (broke_lock || r != -EBUSY) {
-      lderr(cct) << "failed to lock object map: " << cpp_strerror(r) << dendl;
-      return r;
-    }
-
-    typedef std::map<rados::cls::lock::locker_id_t,
-                     rados::cls::lock::locker_info_t> lockers_t;
-    lockers_t lockers;
-    ClsLockType lock_type;
-    std::string lock_tag;
-    r = rados::cls::lock::get_lock_info(&m_image_ctx.md_ctx, oid,
-                                        RBD_LOCK_NAME, &lockers,
-                                        &lock_type, &lock_tag);
-    if (r == -ENOENT) {
-      continue;
-    } else if (r < 0) {
-      lderr(cct) << "failed to list object map locks: " << cpp_strerror(r)
-                 << dendl;
-      return r;
-    }
-
-    ldout(cct, 10) << "breaking current object map lock" << dendl;
-    for (lockers_t::iterator it = lockers.begin();
-         it != lockers.end(); ++it) {
-      const rados::cls::lock::locker_id_t &locker = it->first;
-      r = rados::cls::lock::break_lock(&m_image_ctx.md_ctx, oid,
-                                       RBD_LOCK_NAME, locker.cookie,
-                                       locker.locker);
-      if (r < 0 && r != -ENOENT) {
-        lderr(cct) << "failed to break object map lock: " << cpp_strerror(r)
-                   << dendl;
-        return r;
-      }
-    }
-
-
-
-    broke_lock = true;
-  }
-  return 0;
-}
-
-int ObjectMap::unlock()
-{
-  if (!m_image_ctx.test_features(RBD_FEATURE_OBJECT_MAP)) {
-    return 0;
-  }
-
-  ldout(m_image_ctx.cct, 10) << &m_image_ctx << " unlocking object map"
-			     << dendl;
-  std::string oid = object_map_name(m_image_ctx.id, CEPH_NOSNAP);
-  int r = rados::cls::lock::unlock(&m_image_ctx.md_ctx, oid,
-                                   RBD_LOCK_NAME, "");
-  if (r < 0 && r != -ENOENT) {
-    lderr(m_image_ctx.cct) << "failed to release object map lock: "
-			   << cpp_strerror(r) << dendl;
-  }
-  return r;
-}
-
 bool ObjectMap::object_may_exist(uint64_t object_no) const
 {
+  assert(m_image_ctx.snap_lock.is_locked());
+
   // Fall back to default logic if object map is disabled or invalid
-  if (!m_image_ctx.test_features(RBD_FEATURE_OBJECT_MAP) ||
-      m_image_ctx.test_flags(RBD_FLAG_OBJECT_MAP_INVALID)) {
+  if (!m_image_ctx.test_features(RBD_FEATURE_OBJECT_MAP,
+                                 m_image_ctx.snap_lock) ||
+      m_image_ctx.test_flags(RBD_FLAG_OBJECT_MAP_INVALID,
+                             m_image_ctx.snap_lock)) {
     return true;
   }
 
   RWLock::RLocker l(m_image_ctx.object_map_lock);
-  if (!m_enabled) {
-    return true;
-  }
   uint8_t state = (*this)[object_no];
   bool exists = (state != OBJECT_NONEXISTENT);
   ldout(m_image_ctx.cct, 20) << &m_image_ctx << " object_may_exist: "
@@ -161,257 +83,62 @@ bool ObjectMap::object_may_exist(uint64_t object_no) const
   return exists;
 }
 
-void ObjectMap::refresh(uint64_t snap_id)
-{
-  assert(m_image_ctx.snap_lock.is_wlocked());
-  RWLock::WLocker l(m_image_ctx.object_map_lock);
-  m_snap_id = snap_id;
-
-  if ((m_image_ctx.features & RBD_FEATURE_OBJECT_MAP) == 0 ||
-      (m_image_ctx.snap_id == snap_id && !m_image_ctx.snap_exists)) {
-    m_object_map.clear();
-    m_enabled = false;
-    return;
-  }
-  m_enabled = true;
-
-  CephContext *cct = m_image_ctx.cct;
-  ldout(cct, 10) << &m_image_ctx << " refreshing object map" << dendl;
-
-  uint64_t num_objs = Striper::get_num_objects(
-    m_image_ctx.layout, m_image_ctx.get_image_size(snap_id));
-
-  std::string oid(object_map_name(m_image_ctx.id, snap_id));
-  int r = cls_client::object_map_load(&m_image_ctx.md_ctx, oid,
-                                      &m_object_map);
-  if (r == -EINVAL) {
-    // object map is corrupt on-disk -- clear it and properly size it
-    // so future IO can keep the object map in sync
-    invalidate(snap_id, false);
-
-    librados::ObjectWriteOperation op;
-    if (snap_id == CEPH_NOSNAP) {
-      rados::cls::lock::assert_locked(&op, RBD_LOCK_NAME, LOCK_EXCLUSIVE, "",
-                                      "");
-    }
-    op.truncate(0);
-    cls_client::object_map_resize(&op, num_objs, OBJECT_NONEXISTENT);
-
-    r = m_image_ctx.md_ctx.operate(oid, &op);
-    if (r == 0) {
-      m_object_map.clear();
-      resize(num_objs, OBJECT_NONEXISTENT);
-    }
-  }
-  if (r < 0) {
-    lderr(cct) << "error refreshing object map: " << cpp_strerror(r)
-               << dendl;
-    invalidate(snap_id, false);
-    m_object_map.clear();
-    return;
-  }
-
-  ldout(cct, 20) << "refreshed object map: " << m_object_map.size()
-                 << dendl;
-
-  if (m_object_map.size() < num_objs) {
-    lderr(cct) << "object map smaller than current object count: "
-               << m_object_map.size() << " != " << num_objs << dendl;
-    invalidate(snap_id, false);
-
-    // correct the size issue so future IO can keep the object map in sync
-    librados::ObjectWriteOperation op;
-    if (snap_id == CEPH_NOSNAP) {
-      rados::cls::lock::assert_locked(&op, RBD_LOCK_NAME, LOCK_EXCLUSIVE, "",
-                                      "");
-    }
-    cls_client::object_map_resize(&op, num_objs, OBJECT_NONEXISTENT);
-
-    r = m_image_ctx.md_ctx.operate(oid, &op);
-    if (r == 0) {
-      resize(num_objs, OBJECT_NONEXISTENT);
-    }
-  } else if (m_object_map.size() > num_objs) {
-    // resize op might have been interrupted
-    ldout(cct, 1) << "object map larger than current object count: "
-                  << m_object_map.size() << " != " << num_objs << dendl;
-  }
+void ObjectMap::open(Context *on_finish) {
+  object_map::RefreshRequest<> *req = new object_map::RefreshRequest<>(
+    m_image_ctx, &m_object_map, m_snap_id, on_finish);
+  req->send();
 }
 
-void ObjectMap::rollback(uint64_t snap_id) {
-  assert(m_image_ctx.snap_lock.is_wlocked());
-  int r;
-  std::string oid(object_map_name(m_image_ctx.id, CEPH_NOSNAP));
-
-  CephContext *cct = m_image_ctx.cct;
-  ldout(cct, 10) << &m_image_ctx << " rollback object map" << dendl;
-
-  if ((m_image_ctx.features & RBD_FEATURE_OBJECT_MAP) == 0) {
-    r = m_image_ctx.md_ctx.remove(oid);
-    if (r < 0 && r != -ENOENT) {
-      lderr(cct) << "unable to remove object map: " << cpp_strerror(r)
-		 << dendl;
-    }
-    return;
-  }
-
-  RWLock::WLocker l(m_image_ctx.object_map_lock);
-  if (!m_enabled) {
-    return;
-  }
-
-  std::string snap_oid(object_map_name(m_image_ctx.id, snap_id));
-  bufferlist bl;
-  r = m_image_ctx.md_ctx.read(snap_oid, bl, 0, 0);
-  if (r < 0) {
-    lderr(cct) << "unable to load snapshot object map '" << snap_oid << "': "
-	       << cpp_strerror(r) << dendl;
-    invalidate(snap_id, false);
-    return;
-  }
-
-  librados::ObjectWriteOperation op;
-  rados::cls::lock::assert_locked(&op, RBD_LOCK_NAME, LOCK_EXCLUSIVE, "", "");
-  op.write_full(bl);
-
-  r = m_image_ctx.md_ctx.operate(oid, &op);
-  if (r < 0) {
-    lderr(cct) << "unable to rollback object map: " << cpp_strerror(r)
-	       << dendl;
-    invalidate(CEPH_NOSNAP, true);
-  }
+void ObjectMap::lock(Context *on_finish) {
+  assert(m_snap_id == CEPH_NOSNAP);
+  object_map::LockRequest<> *req = new object_map::LockRequest<>(
+    m_image_ctx, on_finish);
+  req->send();
 }
 
-void ObjectMap::snapshot_add(uint64_t snap_id) {
-  assert(m_image_ctx.snap_lock.is_wlocked());
-  if ((m_image_ctx.features & RBD_FEATURE_OBJECT_MAP) == 0) {
-    return;
-  }
-
-  CephContext *cct = m_image_ctx.cct;
-  ldout(cct, 10) << &m_image_ctx << " snapshot object map" << dendl;
+void ObjectMap::unlock(Context *on_finish) {
+  assert(m_snap_id == CEPH_NOSNAP);
+  object_map::UnlockRequest<> *req = new object_map::UnlockRequest<>(
+    m_image_ctx, on_finish);
+  req->send();
+}
 
-  int r;
-  bufferlist bl;
-  RWLock::WLocker l(m_image_ctx.object_map_lock);
-  if (!m_enabled) {
-    return;
-  }
-  std::string oid(object_map_name(m_image_ctx.id, CEPH_NOSNAP));
-  r = m_image_ctx.md_ctx.read(oid, bl, 0, 0);
-  if (r < 0) {
-    lderr(cct) << "unable to load object map: " << cpp_strerror(r)
-	       << dendl;
-    invalidate(CEPH_NOSNAP, false);
-    return;
-  }
+void ObjectMap::rollback(uint64_t snap_id, Context *on_finish) {
+  assert(m_image_ctx.snap_lock.is_locked());
+  assert(m_image_ctx.object_map_lock.is_wlocked());
 
-  std::string snap_oid(object_map_name(m_image_ctx.id, snap_id));
-  r = m_image_ctx.md_ctx.write_full(snap_oid, bl);
-  if (r < 0) {
-    lderr(cct) << "unable to snapshot object map '" << snap_oid << "': "
-	       << cpp_strerror(r) << dendl;
-    invalidate(snap_id, false);
-    return;
-  }
+  object_map::SnapshotRollbackRequest *req =
+    new object_map::SnapshotRollbackRequest(m_image_ctx, snap_id, on_finish);
+  req->send();
+}
 
-  if ((m_image_ctx.features & RBD_FEATURE_FAST_DIFF) != 0) {
-    librados::ObjectWriteOperation op;
-    rados::cls::lock::assert_locked(&op, RBD_LOCK_NAME, LOCK_EXCLUSIVE, "", "");
-    cls_client::object_map_snap_add(&op);
-    r = m_image_ctx.md_ctx.operate(oid, &op);
-    if (r < 0) {
-      lderr(cct) << "unable to snapshot object map: " << cpp_strerror(r)
-                 << dendl;
-      invalidate(CEPH_NOSNAP, true);
-      return;
-    }
+void ObjectMap::snapshot_add(uint64_t snap_id, Context *on_finish) {
+  assert(m_image_ctx.snap_lock.is_locked());
+  assert((m_image_ctx.features & RBD_FEATURE_OBJECT_MAP) != 0);
+  assert(snap_id != CEPH_NOSNAP);
 
-    for (uint64_t i = 0; i < m_object_map.size(); ++i) {
-      if (m_object_map[i] == OBJECT_EXISTS) {
-        m_object_map[i] = OBJECT_EXISTS_CLEAN;
-      }
-    }
-  }
+  object_map::SnapshotCreateRequest *req =
+    new object_map::SnapshotCreateRequest(m_image_ctx, &m_object_map, snap_id,
+                                          on_finish);
+  req->send();
 }
 
-int ObjectMap::snapshot_remove(uint64_t snap_id) {
+void ObjectMap::snapshot_remove(uint64_t snap_id, Context *on_finish) {
   assert(m_image_ctx.snap_lock.is_wlocked());
+  assert((m_image_ctx.features & RBD_FEATURE_OBJECT_MAP) != 0);
   assert(snap_id != CEPH_NOSNAP);
-  CephContext *cct = m_image_ctx.cct;
-
-  int r;
-  if ((m_image_ctx.features & RBD_FEATURE_FAST_DIFF) != 0) {
-    RWLock::WLocker l(m_image_ctx.object_map_lock);
-
-    uint64_t next_snap_id = CEPH_NOSNAP;
-    std::map<librados::snap_t, SnapInfo>::const_iterator it =
-      m_image_ctx.snap_info.find(snap_id);
-    assert(it != m_image_ctx.snap_info.end());
-
-    ++it;
-    if (it != m_image_ctx.snap_info.end()) {
-      next_snap_id = it->first;
-    }
-
-    ceph::BitVector<2> snap_object_map;
-    std::string snap_oid(object_map_name(m_image_ctx.id, snap_id));
-    r = cls_client::object_map_load(&m_image_ctx.md_ctx, snap_oid,
-                                    &snap_object_map);
-    if (r < 0) {
-      lderr(cct) << "error loading snapshot object map: " << cpp_strerror(r)
-                 << dendl;
-    }
-
-    if (r == 0) {
-      uint64_t flags;
-      m_image_ctx.get_flags(snap_id, &flags);
-      if ((flags & RBD_FLAG_OBJECT_MAP_INVALID) != 0) {
-        invalidate(next_snap_id, true);
-        r = -EINVAL;
-      }
-    }
-
-    if (r == 0) {
-      std::string oid(object_map_name(m_image_ctx.id, next_snap_id));
-      librados::ObjectWriteOperation op;
-      if (next_snap_id == CEPH_NOSNAP) {
-        rados::cls::lock::assert_locked(&op, RBD_LOCK_NAME, LOCK_EXCLUSIVE, "",
-                                        "");
-      }
-      cls_client::object_map_snap_remove(&op, snap_object_map);
 
-      r = m_image_ctx.md_ctx.operate(oid, &op);
-      if (r < 0) {
-        lderr(cct) << "unable to remove object map snapshot: "
-                   << cpp_strerror(r) << dendl;
-        invalidate(next_snap_id, true);
-      }
-    }
-
-    if (r == 0 && next_snap_id == CEPH_NOSNAP) {
-      for (uint64_t i = 0; i < m_object_map.size(); ++i) {
-        if (m_object_map[i] == OBJECT_EXISTS_CLEAN &&
-            (i >= snap_object_map.size() ||
-             snap_object_map[i] == OBJECT_EXISTS)) {
-          m_object_map[i] = OBJECT_EXISTS;
-        }
-      }
-    }
-  }
-
-  std::string oid(object_map_name(m_image_ctx.id, snap_id));
-  r = m_image_ctx.md_ctx.remove(oid);
-  if (r < 0 && r != -ENOENT) {
-    return r;
-  }
-  return 0;
+  object_map::SnapshotRemoveRequest *req =
+    new object_map::SnapshotRemoveRequest(m_image_ctx, &m_object_map, snap_id,
+                                          on_finish);
+  req->send();
 }
 
-void ObjectMap::aio_save(Context *on_finish)
-{
-  assert(m_image_ctx.test_features(RBD_FEATURE_OBJECT_MAP));
+void ObjectMap::aio_save(Context *on_finish) {
   assert(m_image_ctx.owner_lock.is_locked());
+  assert(m_image_ctx.snap_lock.is_locked());
+  assert(m_image_ctx.test_features(RBD_FEATURE_OBJECT_MAP,
+                                   m_image_ctx.snap_lock));
   RWLock::RLocker object_map_locker(m_image_ctx.object_map_lock);
 
   librados::ObjectWriteOperation op;
@@ -421,8 +148,7 @@ void ObjectMap::aio_save(Context *on_finish)
   cls_client::object_map_save(&op, m_object_map);
 
   std::string oid(object_map_name(m_image_ctx.id, m_snap_id));
-  librados::AioCompletion *comp = librados::Rados::aio_create_completion(
-    on_finish, NULL, rados_ctx_cb);
+  librados::AioCompletion *comp = util::create_rados_safe_callback(on_finish);
 
   int r = m_image_ctx.md_ctx.aio_operate(oid, comp, &op);
   assert(r == 0);
@@ -431,14 +157,17 @@ void ObjectMap::aio_save(Context *on_finish)
 
 void ObjectMap::aio_resize(uint64_t new_size, uint8_t default_object_state,
 			   Context *on_finish) {
-  assert(m_image_ctx.test_features(RBD_FEATURE_OBJECT_MAP));
   assert(m_image_ctx.owner_lock.is_locked());
+  assert(m_image_ctx.snap_lock.is_locked());
+  assert(m_image_ctx.test_features(RBD_FEATURE_OBJECT_MAP,
+                                   m_image_ctx.snap_lock));
   assert(m_image_ctx.image_watcher != NULL);
-  assert(!m_image_ctx.image_watcher->is_lock_supported() ||
-         m_image_ctx.image_watcher->is_lock_owner());
+  assert(m_image_ctx.exclusive_lock == nullptr ||
+         m_image_ctx.exclusive_lock->is_lock_owner());
 
-  ResizeRequest *req = new ResizeRequest(
-    m_image_ctx, m_snap_id, new_size, default_object_state, on_finish);
+  object_map::ResizeRequest *req = new object_map::ResizeRequest(
+    m_image_ctx, &m_object_map, m_snap_id, new_size, default_object_state,
+    on_finish);
   req->send();
 }
 
@@ -459,8 +188,8 @@ bool ObjectMap::aio_update(uint64_t start_object_no, uint64_t end_object_no,
   assert((m_image_ctx.features & RBD_FEATURE_OBJECT_MAP) != 0);
   assert(m_image_ctx.owner_lock.is_locked());
   assert(m_image_ctx.image_watcher != NULL);
-  assert(!m_image_ctx.image_watcher->is_lock_supported(m_image_ctx.snap_lock) ||
-         m_image_ctx.image_watcher->is_lock_owner());
+  assert(m_image_ctx.exclusive_lock == nullptr ||
+         m_image_ctx.exclusive_lock->is_lock_owner());
   assert(m_image_ctx.object_map_lock.is_wlocked());
   assert(start_object_no < end_object_no);
 
@@ -493,214 +222,10 @@ void ObjectMap::aio_update(uint64_t snap_id, uint64_t start_object_no,
                            uint64_t end_object_no, uint8_t new_state,
                            const boost::optional<uint8_t> &current_state,
                            Context *on_finish) {
-  UpdateRequest *req = new UpdateRequest(m_image_ctx, snap_id,
-                                         start_object_no, end_object_no,
-                                         new_state, current_state,
-                                         on_finish);
+  object_map::UpdateRequest *req = new object_map::UpdateRequest(
+    m_image_ctx, &m_object_map, snap_id, start_object_no, end_object_no,
+    new_state, current_state, on_finish);
   req->send();
 }
 
-void ObjectMap::invalidate(uint64_t snap_id, bool force) {
-  assert(m_image_ctx.snap_lock.is_wlocked());
-  assert(m_image_ctx.object_map_lock.is_wlocked());
-  uint64_t flags;
-  m_image_ctx.get_flags(snap_id, &flags);
-  if ((flags & RBD_FLAG_OBJECT_MAP_INVALID) != 0) {
-    return;
-  }
-
-  flags = RBD_FLAG_OBJECT_MAP_INVALID;
-  if ((m_image_ctx.features & RBD_FEATURE_FAST_DIFF) != 0) {
-    flags |= RBD_FLAG_FAST_DIFF_INVALID;
-  }
-
-  CephContext *cct = m_image_ctx.cct;
-  lderr(cct) << &m_image_ctx << " invalidating object map" << dendl;
-  int r = m_image_ctx.update_flags(snap_id, flags, true);
-  if (r < 0) {
-    lderr(cct) << "failed to invalidate in-memory object map: "
-               << cpp_strerror(r) << dendl;
-    return;
-  }
-
-  // do not update on-disk flags if not image owner
-  if (m_image_ctx.image_watcher == NULL ||
-      (m_image_ctx.image_watcher->is_lock_supported(m_image_ctx.snap_lock) &&
-       !m_image_ctx.image_watcher->is_lock_owner())) {
-    return;
-  }
-
-  librados::ObjectWriteOperation op;
-  if (snap_id == CEPH_NOSNAP && !force) {
-    m_image_ctx.image_watcher->assert_header_locked(&op);
-  }
-  cls_client::set_flags(&op, snap_id, flags, flags);
-
-  r = m_image_ctx.md_ctx.operate(m_image_ctx.header_oid, &op);
-  if (r == -EBUSY) {
-    ldout(cct, 5) << "skipping on-disk object map invalidation: "
-                  << "image not locked by client" << dendl;
-  } else if (r < 0) {
-    lderr(cct) << "failed to invalidate on-disk object map: " << cpp_strerror(r)
-	       << dendl;
-  }
-}
-
-void ObjectMap::resize(uint64_t num_objs, uint8_t defualt_state) {
-  size_t orig_object_map_size = m_object_map.size();
-  m_object_map.resize(num_objs);
-  for (uint64_t i = orig_object_map_size;
-       i < m_object_map.size(); ++i) {
-    m_object_map[i] = defualt_state;
-  }
-}
-
-bool ObjectMap::Request::should_complete(int r) {
-  CephContext *cct = m_image_ctx.cct;
-  ldout(cct, 20) << &m_image_ctx << " should_complete: r=" << r << dendl;
-
-  switch (m_state)
-  {
-  case STATE_REQUEST:
-    if (r == -EBUSY) {
-      lderr(cct) << "object map lock not owned by client" << dendl;
-      return invalidate();
-    } else if (r < 0) {
-      lderr(cct) << "failed to update object map: " << cpp_strerror(r)
-		 << dendl;
-      return invalidate();
-    }
-
-    {
-      RWLock::WLocker l2(m_image_ctx.object_map_lock);
-      finish(&m_image_ctx.object_map);
-    }
-    return true;
-
-  case STATE_INVALIDATE:
-    ldout(cct, 20) << "INVALIDATE" << dendl;
-    if (r < 0) {
-      lderr(cct) << "failed to invalidate object map: " << cpp_strerror(r)
-		 << dendl;
-    }
-    return true;
-
-  default:
-    lderr(cct) << "invalid state: " << m_state << dendl;
-    assert(false);
-    break;
-  }
-  return false;
-}
-
-bool ObjectMap::Request::invalidate() {
-  if (m_image_ctx.test_flags(RBD_FLAG_OBJECT_MAP_INVALID)) {
-    return true;
-  }
-
-  CephContext *cct = m_image_ctx.cct;
-  RWLock::WLocker snap_locker(m_image_ctx.snap_lock);
-
-  // requests shouldn't be running while using snapshots
-  assert(m_image_ctx.snap_id == CEPH_NOSNAP);
-
-  uint64_t flags = RBD_FLAG_OBJECT_MAP_INVALID;
-  if ((m_image_ctx.features & RBD_FEATURE_FAST_DIFF) != 0) {
-    flags |= RBD_FLAG_FAST_DIFF_INVALID;
-  }
-
-  lderr(cct) << &m_image_ctx << " invalidating object map" << dendl;
-  m_state = STATE_INVALIDATE;
-  m_image_ctx.flags |= flags;
-
-  librados::ObjectWriteOperation op;
-  cls_client::set_flags(&op, CEPH_NOSNAP, flags, flags);
-
-  librados::AioCompletion *rados_completion = create_callback_completion();
-  int r = m_image_ctx.md_ctx.aio_operate(m_image_ctx.header_oid,
-					 rados_completion, &op);
-  assert(r == 0);
-  rados_completion->release();
-  return false;
-}
-
-void ObjectMap::ResizeRequest::send() {
-  CephContext *cct = m_image_ctx.cct;
-
-  RWLock::WLocker l(m_image_ctx.object_map_lock);
-  m_num_objs = Striper::get_num_objects(m_image_ctx.layout, m_new_size);
-
-  ldout(cct, 5) << &m_image_ctx << " resizing on-disk object map: "
-		<< m_num_objs << dendl;
-
-  librados::ObjectWriteOperation op;
-  if (m_snap_id == CEPH_NOSNAP) {
-    rados::cls::lock::assert_locked(&op, RBD_LOCK_NAME, LOCK_EXCLUSIVE, "", "");
-  }
-  cls_client::object_map_resize(&op, m_num_objs, m_default_object_state);
-
-  librados::AioCompletion *rados_completion = create_callback_completion();
-  std::string oid(object_map_name(m_image_ctx.id, m_snap_id));
-  int r = m_image_ctx.md_ctx.aio_operate(oid, rados_completion, &op);
-  assert(r == 0);
-  rados_completion->release();
-}
-
-void ObjectMap::ResizeRequest::finish(ObjectMap *object_map) {
-  CephContext *cct = m_image_ctx.cct;
-
-  ldout(cct, 5) << &m_image_ctx << " resizing in-memory object map: "
-		<< m_num_objs << dendl;
-  object_map->resize(m_num_objs, m_default_object_state);
-}
-
-void ObjectMap::UpdateRequest::send() {
-  assert(m_image_ctx.object_map_lock.is_locked());
-  CephContext *cct = m_image_ctx.cct;
-
-  // safe to update in-memory state first without handling rollback since any
-  // failures will invalidate the object map
-  ldout(cct, 20) << &m_image_ctx << " updating object map"
-                 << (m_snap_id != CEPH_NOSNAP ?
-                       " snap " + stringify(m_snap_id) : std::string())
-                 << ": ["
-		 << m_start_object_no << "," << m_end_object_no << ") = "
-		 << (m_current_state ?
-		       stringify(static_cast<uint32_t>(*m_current_state)) : "")
-		 << "->" << static_cast<uint32_t>(m_new_state)
-		 << dendl;
-
-  ObjectMap& object_map = m_image_ctx.object_map;
-  if (m_snap_id == object_map.m_snap_id) {
-    assert(m_image_ctx.object_map_lock.is_wlocked());
-    for (uint64_t object_no = m_start_object_no;
-         object_no < MIN(m_end_object_no, object_map.m_object_map.size());
-         ++object_no) {
-      uint8_t state = object_map.m_object_map[object_no];
-      if (!m_current_state || state == *m_current_state ||
-          (*m_current_state == OBJECT_EXISTS && state == OBJECT_EXISTS_CLEAN)) {
-        object_map.m_object_map[object_no] = m_new_state;
-      }
-    }
-  }
-
-  librados::ObjectWriteOperation op;
-  if (m_snap_id == CEPH_NOSNAP) {
-    rados::cls::lock::assert_locked(&op, RBD_LOCK_NAME, LOCK_EXCLUSIVE, "", "");
-  }
-  cls_client::object_map_update(&op, m_start_object_no, m_end_object_no,
-				m_new_state, m_current_state);
-
-  librados::AioCompletion *rados_completion = create_callback_completion();
-  std::string oid(object_map_name(m_image_ctx.id, m_snap_id));
-  int r = m_image_ctx.md_ctx.aio_operate(oid, rados_completion, &op);
-  assert(r == 0);
-  rados_completion->release();
-}
-
-void ObjectMap::UpdateRequest::finish(ObjectMap *object_map) {
-  ldout(m_image_ctx.cct, 20) << &m_image_ctx << " on-disk object map updated"
-                             << dendl;
-}
-
 } // namespace librbd
diff --git a/src/librbd/ObjectMap.h b/src/librbd/ObjectMap.h
index 1737e12..5253c51 100644
--- a/src/librbd/ObjectMap.h
+++ b/src/librbd/ObjectMap.h
@@ -7,10 +7,10 @@
 #include "include/rados/librados.hpp"
 #include "include/rbd/object_map_types.h"
 #include "common/bit_vector.hpp"
-#include "librbd/AsyncRequest.h"
 #include <boost/optional.hpp>
 
 class Context;
+class RWLock;
 
 namespace librbd {
 
@@ -18,8 +18,7 @@ class ImageCtx;
 
 class ObjectMap {
 public:
-
-  ObjectMap(ImageCtx &image_ctx);
+  ObjectMap(ImageCtx &image_ctx, uint64_t snap_id);
 
   static int remove(librados::IoCtx &io_ctx, const std::string &image_id);
   static std::string object_map_name(const std::string &image_id,
@@ -31,8 +30,9 @@ public:
     return m_object_map.size();
   }
 
-  int lock();
-  int unlock();
+  void open(Context *on_finish);
+  void lock(Context *on_finish);
+  void unlock(Context *on_finish);
 
   bool object_may_exist(uint64_t object_no) const;
 
@@ -52,100 +52,15 @@ public:
                   const boost::optional<uint8_t> &current_state,
                   Context *on_finish);
 
-  void refresh(uint64_t snap_id);
-  void rollback(uint64_t snap_id);
-  void snapshot_add(uint64_t snap_id);
-  int snapshot_remove(uint64_t snap_id);
-
-  bool enabled() const;
+  void rollback(uint64_t snap_id, Context *on_finish);
+  void snapshot_add(uint64_t snap_id, Context *on_finish);
+  void snapshot_remove(uint64_t snap_id, Context *on_finish);
 
 private:
-
-  class Request : public AsyncRequest<> {
-  public:
-    Request(ImageCtx &image_ctx, uint64_t snap_id, Context *on_finish)
-      : AsyncRequest(image_ctx, on_finish), m_snap_id(snap_id),
-        m_state(STATE_REQUEST)
-    {
-    }
-
-  protected:
-    const uint64_t m_snap_id;
-
-    virtual bool safely_cancel(int r) {
-      return false;
-    }
-    virtual bool should_complete(int r);
-    virtual int filter_return_code(int r) {
-      // never propagate an error back to the caller
-      return 0;
-    }
-    virtual void finish(ObjectMap *object_map) = 0;
-  private:
-    /**
-     * <start> ---> STATE_REQUEST ---> <finish>
-     *                   |                ^
-     *                   v                |
-     *            STATE_INVALIDATE -------/
-     */
-    enum State {
-      STATE_REQUEST,
-      STATE_INVALIDATE
-    };
-
-    State m_state;
-
-    bool invalidate();
-  };
-
-  class ResizeRequest : public Request {
-  public:
-    ResizeRequest(ImageCtx &image_ctx, uint64_t snap_id, uint64_t new_size,
-		  uint8_t default_object_state, Context *on_finish)
-      : Request(image_ctx, snap_id, on_finish), m_num_objs(0),
-        m_new_size(new_size), m_default_object_state(default_object_state)
-    {
-    }
-
-    virtual void send();
-  protected:
-    virtual void finish(ObjectMap *object_map);
-  private:
-    uint64_t m_num_objs;
-    uint64_t m_new_size;
-    uint8_t m_default_object_state;
-  };
-
-  class UpdateRequest : public Request {
-  public:
-    UpdateRequest(ImageCtx &image_ctx, uint64_t snap_id,
-                  uint64_t start_object_no, uint64_t end_object_no,
-                  uint8_t new_state,
-                  const boost::optional<uint8_t> &current_state,
-		  Context *on_finish)
-      : Request(image_ctx, snap_id, on_finish),
-        m_start_object_no(start_object_no), m_end_object_no(end_object_no),
-        m_new_state(new_state), m_current_state(current_state)
-    {
-    }
-
-    virtual void send();
-  protected:
-    virtual void finish(ObjectMap *object_map);
-  private:
-    uint64_t m_start_object_no;
-    uint64_t m_end_object_no;
-    uint8_t m_new_state;
-    boost::optional<uint8_t> m_current_state;
-  };
-
   ImageCtx &m_image_ctx;
   ceph::BitVector<2> m_object_map;
   uint64_t m_snap_id;
-  bool m_enabled;
 
-  void invalidate(uint64_t snap_id, bool force);
-  void resize(uint64_t num_objs, uint8_t default_state);
 };
 
 } // namespace librbd
diff --git a/src/librbd/Utils.cc b/src/librbd/Utils.cc
new file mode 100644
index 0000000..d89418f
--- /dev/null
+++ b/src/librbd/Utils.cc
@@ -0,0 +1,31 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/Utils.h"
+#include "include/rbd_types.h"
+#include "include/stringify.h"
+
+namespace librbd {
+namespace util {
+
+const std::string id_obj_name(const std::string &name)  // returns RBD_ID_PREFIX followed by `name`
+{
+  return RBD_ID_PREFIX + name;
+}
+
+const std::string header_name(const std::string &image_id)  // returns RBD_HEADER_PREFIX followed by `image_id`
+{
+  return RBD_HEADER_PREFIX + image_id;
+}
+
+const std::string old_header_name(const std::string &image_name)  // returns `image_name` followed by RBD_SUFFIX
+{
+  return image_name + RBD_SUFFIX;
+}
+
+std::string unique_lock_name(const std::string &name, void *address) {  // returns "<name> (<stringified address>)"
+  return name + " (" + stringify(address) + ")";  // presumably makes lock names distinct per object instance -- confirm with callers
+}
+
+} // namespace util
+} // namespace librbd
diff --git a/src/librbd/Utils.h b/src/librbd/Utils.h
new file mode 100644
index 0000000..0986f06
--- /dev/null
+++ b/src/librbd/Utils.h
@@ -0,0 +1,139 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_UTILS_H
+#define CEPH_LIBRBD_UTILS_H
+
+#include "include/rados/librados.hpp"
+#include "include/Context.h"
+#include <type_traits>
+
+class Context;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace util {
+
+namespace detail {
+
+template <typename T>
+void rados_callback(rados_completion_t c, void *arg) {  // treats `arg` as a T* and calls its complete() with c's return value
+  reinterpret_cast<T*>(arg)->complete(rados_aio_get_return_value(c));
+}
+
+template <typename T, Context*(T::*MF)(int*), bool destroy>
+void rados_state_callback(rados_completion_t c, void *arg) {  // forwards c's return value to member function MF of the T passed as `arg`
+  T *obj = reinterpret_cast<T*>(arg);
+  int r = rados_aio_get_return_value(c);
+  Context *on_finish = (obj->*MF)(&r);  // MF receives &r, so it may modify the result code
+  if (on_finish != nullptr) {  // MF returned a context: complete it with (possibly updated) r
+    on_finish->complete(r);
+    if (destroy) {  // template flag: also delete the T object
+      delete obj;
+    }
+  }
+}
+
+template <typename T, void (T::*MF)(int)>
+class C_CallbackAdapter : public Context {  // Context whose finish(r) invokes (obj->*MF)(r)
+  T *obj;  // target object; this adapter never deletes it
+public:
+  C_CallbackAdapter(T *obj) : obj(obj) {
+  }
+
+protected:
+  virtual void finish(int r) override {  // `override` as in C_StateCallbackAdapter: compiler checks the signature
+    (obj->*MF)(r);
+  }
+};
+
+template <typename T, Context*(T::*MF)(int*), bool destroy>
+class C_StateCallbackAdapter : public Context {  // Context that routes its completion through member function MF of obj
+  T *obj;
+public:
+  C_StateCallbackAdapter(T *obj) : obj(obj){
+  }
+
+protected:
+  virtual void complete(int r) override {
+    Context *on_finish = (obj->*MF)(&r);  // MF receives &r, so it may modify the result code
+    if (on_finish != nullptr) {  // MF returned a context: complete it with (possibly updated) r
+      on_finish->complete(r);
+      if (destroy) {  // template flag: also delete the T object
+        delete obj;
+      }
+    }
+    Context::complete(r);  // base implementation; presumably runs finish() and frees this adapter -- confirm in include/Context.h
+  }
+  virtual void finish(int r) override {  // intentionally empty: all work is done in complete()
+  }
+};
+
+template <typename WQ>
+struct C_AsyncCallback : public Context {  // Context that hands another context to a work queue when finished
+  WQ *op_work_queue;
+  Context *on_finish;
+
+  C_AsyncCallback(WQ *op_work_queue, Context *on_finish)
+    : op_work_queue(op_work_queue), on_finish(on_finish) {
+  }
+  virtual void finish(int r) override {  // `override` as in C_StateCallbackAdapter: compiler checks the signature
+    op_work_queue->queue(on_finish, r);  // queue on_finish together with the result code r
+  }
+};
+
+} // namespace detail
+
+const std::string id_obj_name(const std::string &name);
+const std::string header_name(const std::string &image_id);
+const std::string old_header_name(const std::string &image_name);
+std::string unique_lock_name(const std::string &name, void *address);
+
+template <typename T>
+librados::AioCompletion *create_rados_ack_callback(T *obj) {  // completion that calls obj->complete(r) via detail::rados_callback
+  return librados::Rados::aio_create_completion(
+    obj, &detail::rados_callback<T>, nullptr);  // callback in the 2nd argument, 3rd left null
+}
+
+template <typename T, Context*(T::*MF)(int*), bool destroy=true>
+librados::AioCompletion *create_rados_ack_callback(T *obj) {  // overload that routes the result through member function MF
+  return librados::Rados::aio_create_completion(
+    obj, &detail::rados_state_callback<T, MF, destroy>, nullptr);  // callback in the 2nd argument, 3rd left null
+}
+
+template <typename T>
+librados::AioCompletion *create_rados_safe_callback(T *obj) {  // completion that calls obj->complete(r) via detail::rados_callback
+  return librados::Rados::aio_create_completion(
+    obj, nullptr, &detail::rados_callback<T>);  // callback in the 3rd argument, 2nd left null
+}
+
+template <typename T, Context*(T::*MF)(int*), bool destroy=true>
+librados::AioCompletion *create_rados_safe_callback(T *obj) {  // overload that routes the result through member function MF
+  return librados::Rados::aio_create_completion(
+    obj, nullptr, &detail::rados_state_callback<T, MF, destroy>);  // callback in the 3rd argument, 2nd left null
+}
+
+template <typename T, void(T::*MF)(int) = &T::complete>
+Context *create_context_callback(T *obj) {  // heap-allocates a Context whose finish(r) calls (obj->*MF)(r); MF defaults to T::complete
+  return new detail::C_CallbackAdapter<T, MF>(obj);
+}
+
+template <typename T, Context*(T::*MF)(int*), bool destroy=true>
+Context *create_context_callback(T *obj) {  // overload for MF taking int* and returning Context*; wraps obj in a C_StateCallbackAdapter
+  return new detail::C_StateCallbackAdapter<T, MF, destroy>(obj);
+}
+
+template <typename I>
+Context *create_async_context_callback(I &image_ctx, Context *on_finish) {  // wraps on_finish so it is queued on image_ctx.op_work_queue
+  // use async callback to acquire a clean lock context
+  return new detail::C_AsyncCallback<
+    typename std::decay<decltype(*image_ctx.op_work_queue)>::type>(  // WQ = decayed type of *image_ctx.op_work_queue
+      image_ctx.op_work_queue, on_finish);
+}
+
+} // namespace util
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_UTILS_H
diff --git a/src/librbd/WatchNotifyTypes.cc b/src/librbd/WatchNotifyTypes.cc
index af2e434..93b12ee 100644
--- a/src/librbd/WatchNotifyTypes.cc
+++ b/src/librbd/WatchNotifyTypes.cc
@@ -7,7 +7,7 @@
 #include "common/Formatter.h"
 
 namespace librbd {
-namespace WatchNotify {
+namespace watch_notify {
 
 namespace {
 
@@ -146,124 +146,105 @@ void HeaderUpdatePayload::decode(__u8 version, bufferlist::iterator &iter) {
 void HeaderUpdatePayload::dump(Formatter *f) const {
 }
 
-void AsyncProgressPayload::encode(bufferlist &bl) const {
+void AsyncRequestPayloadBase::encode(bufferlist &bl) const {
   ::encode(async_request_id, bl);
+}
+
+void AsyncRequestPayloadBase::decode(__u8 version, bufferlist::iterator &iter) {  // reads async_request_id; `version` is not consulted here
+  ::decode(async_request_id, iter);
+}
+
+void AsyncRequestPayloadBase::dump(Formatter *f) const {  // writes the request id into its own "async_request_id" section
+  f->open_object_section("async_request_id");
+  async_request_id.dump(f);  // the id emits its own fields inside the open section
+  f->close_section();
+}
+
+void AsyncProgressPayload::encode(bufferlist &bl) const {
+  AsyncRequestPayloadBase::encode(bl);
   ::encode(offset, bl);
   ::encode(total, bl);
 }
 
 void AsyncProgressPayload::decode(__u8 version, bufferlist::iterator &iter) {
-  ::decode(async_request_id, iter);
+  AsyncRequestPayloadBase::decode(version, iter);
   ::decode(offset, iter);
   ::decode(total, iter);
 }
 
 void AsyncProgressPayload::dump(Formatter *f) const {
-  f->open_object_section("async_request_id");
-  async_request_id.dump(f);
-  f->close_section();
+  AsyncRequestPayloadBase::dump(f);
   f->dump_unsigned("offset", offset);
   f->dump_unsigned("total", total);
 }
 
 void AsyncCompletePayload::encode(bufferlist &bl) const {
-  ::encode(async_request_id, bl);
+  AsyncRequestPayloadBase::encode(bl);
   ::encode(result, bl);
 }
 
 void AsyncCompletePayload::decode(__u8 version, bufferlist::iterator &iter) {
-  ::decode(async_request_id, iter);
+  AsyncRequestPayloadBase::decode(version, iter);
   ::decode(result, iter);
 }
 
 void AsyncCompletePayload::dump(Formatter *f) const {
-  f->open_object_section("async_request_id");
-  async_request_id.dump(f);
-  f->close_section();
+  AsyncRequestPayloadBase::dump(f);
   f->dump_int("result", result);
 }
 
-void FlattenPayload::encode(bufferlist &bl) const {
-  ::encode(async_request_id, bl);
-}
-
-void FlattenPayload::decode(__u8 version, bufferlist::iterator &iter) {
-  ::decode(async_request_id, iter);
-}
-
-void FlattenPayload::dump(Formatter *f) const {
-  f->open_object_section("async_request_id");
-  async_request_id.dump(f);
-  f->close_section();
-}
-
 void ResizePayload::encode(bufferlist &bl) const {
   ::encode(size, bl);
-  ::encode(async_request_id, bl);
+  AsyncRequestPayloadBase::encode(bl);
 }
 
 void ResizePayload::decode(__u8 version, bufferlist::iterator &iter) {
   ::decode(size, iter);
-  ::decode(async_request_id, iter);
+  AsyncRequestPayloadBase::decode(version, iter);
 }
 
 void ResizePayload::dump(Formatter *f) const {
   f->dump_unsigned("size", size);
-  f->open_object_section("async_request_id");
-  async_request_id.dump(f);
-  f->close_section();
+  AsyncRequestPayloadBase::dump(f);
 }
 
-void SnapCreatePayload::encode(bufferlist &bl) const {
+void SnapPayloadBase::encode(bufferlist &bl) const {
   ::encode(snap_name, bl);
 }
 
-void SnapCreatePayload::decode(__u8 version, bufferlist::iterator &iter) {
+void SnapPayloadBase::decode(__u8 version, bufferlist::iterator &iter) {
   ::decode(snap_name, iter);
 }
 
-void SnapCreatePayload::dump(Formatter *f) const {
+void SnapPayloadBase::dump(Formatter *f) const {
   f->dump_string("snap_name", snap_name);
 }
 
 void SnapRenamePayload::encode(bufferlist &bl) const {
-  ::encode(src_snap_id, bl);
-  ::encode(dst_snap_name, bl);
+  ::encode(snap_id, bl);
+  SnapPayloadBase::encode(bl);
 }
 
 void SnapRenamePayload::decode(__u8 version, bufferlist::iterator &iter) {
-  ::decode(src_snap_id, iter);
-  ::decode(dst_snap_name, iter);
+  ::decode(snap_id, iter);
+  SnapPayloadBase::decode(version, iter);
 }
 
 void SnapRenamePayload::dump(Formatter *f) const {
-  f->dump_unsigned("src_snap_id", src_snap_id);
-  f->dump_string("dst_snap_name", dst_snap_name);
-}
-void SnapRemovePayload::encode(bufferlist &bl) const {
-  ::encode(snap_name, bl);
-}
-
-void SnapRemovePayload::decode(__u8 version, bufferlist::iterator &iter) {
-  ::decode(snap_name, iter);
+  f->dump_unsigned("src_snap_id", snap_id);
+  SnapPayloadBase::dump(f);
 }
 
-void SnapRemovePayload::dump(Formatter *f) const {
-  f->dump_string("snap_name", snap_name);
+void RenamePayload::encode(bufferlist &bl) const {
+  ::encode(image_name, bl);
 }
 
-void RebuildObjectMapPayload::encode(bufferlist &bl) const {
-  ::encode(async_request_id, bl);
+void RenamePayload::decode(__u8 version, bufferlist::iterator &iter) {
+  ::decode(image_name, iter);
 }
 
-void RebuildObjectMapPayload::decode(__u8 version, bufferlist::iterator &iter) {
-  ::decode(async_request_id, iter);
-}
-
-void RebuildObjectMapPayload::dump(Formatter *f) const {
-  f->open_object_section("async_request_id");
-  async_request_id.dump(f);
-  f->close_section();
+void RenamePayload::dump(Formatter *f) const {
+  f->dump_string("image_name", image_name);
 }
 
 void UnknownPayload::encode(bufferlist &bl) const {
@@ -323,9 +304,18 @@ void NotifyMessage::decode(bufferlist::iterator& iter) {
   case NOTIFY_OP_SNAP_RENAME:
     payload = SnapRenamePayload();
     break;
+  case NOTIFY_OP_SNAP_PROTECT:
+    payload = SnapProtectPayload();
+    break;
+  case NOTIFY_OP_SNAP_UNPROTECT:
+    payload = SnapUnprotectPayload();
+    break;
   case NOTIFY_OP_REBUILD_OBJECT_MAP:
     payload = RebuildObjectMapPayload();
     break;
+  case NOTIFY_OP_RENAME:
+    payload = RenamePayload();
+    break;
   default:
     payload = UnknownPayload();
     break;
@@ -350,7 +340,10 @@ void NotifyMessage::generate_test_instances(std::list<NotifyMessage *> &o) {
   o.push_back(new NotifyMessage(ResizePayload(123, AsyncRequestId(ClientId(0, 1), 2))));
   o.push_back(new NotifyMessage(SnapCreatePayload("foo")));
   o.push_back(new NotifyMessage(SnapRemovePayload("foo")));
+  o.push_back(new NotifyMessage(SnapProtectPayload("foo")));
+  o.push_back(new NotifyMessage(SnapUnprotectPayload("foo")));
   o.push_back(new NotifyMessage(RebuildObjectMapPayload(AsyncRequestId(ClientId(0, 1), 2))));
+  o.push_back(new NotifyMessage(RenamePayload("foo")));
 }
 
 void ResponseMessage::encode(bufferlist& bl) const {
@@ -373,12 +366,12 @@ void ResponseMessage::generate_test_instances(std::list<ResponseMessage *> &o) {
   o.push_back(new ResponseMessage(1));
 }
 
-} // namespace WatchNotify
+} // namespace watch_notify
 } // namespace librbd
 
 std::ostream &operator<<(std::ostream &out,
-                         const librbd::WatchNotify::NotifyOp &op) {
-  using namespace librbd::WatchNotify;
+                         const librbd::watch_notify::NotifyOp &op) {
+  using namespace librbd::watch_notify;
 
   switch (op) {
   case NOTIFY_OP_ACQUIRED_LOCK:
@@ -414,9 +407,18 @@ std::ostream &operator<<(std::ostream &out,
   case NOTIFY_OP_SNAP_RENAME:
     out << "SnapRename";
     break;
+  case NOTIFY_OP_SNAP_PROTECT:
+    out << "SnapProtect";
+    break;
+  case NOTIFY_OP_SNAP_UNPROTECT:
+    out << "SnapUnprotect";
+    break;
   case NOTIFY_OP_REBUILD_OBJECT_MAP:
     out << "RebuildObjectMap";
     break;
+  case NOTIFY_OP_RENAME:
+    out << "Rename";
+    break;
   default:
     out << "Unknown (" << static_cast<uint32_t>(op) << ")";
     break;
@@ -425,13 +427,13 @@ std::ostream &operator<<(std::ostream &out,
 }
 
 std::ostream &operator<<(std::ostream &out,
-                         const librbd::WatchNotify::ClientId &client_id) {
+                         const librbd::watch_notify::ClientId &client_id) {
   out << "[" << client_id.gid << "," << client_id.handle << "]";
   return out;
 }
 
 std::ostream &operator<<(std::ostream &out,
-                         const librbd::WatchNotify::AsyncRequestId &request) {
+                         const librbd::watch_notify::AsyncRequestId &request) {
   out << "[" << request.client_id.gid << "," << request.client_id.handle << ","
       << request.request_id << "]";
   return out;
diff --git a/src/librbd/WatchNotifyTypes.h b/src/librbd/WatchNotifyTypes.h
index eaa3305..468b45a 100644
--- a/src/librbd/WatchNotifyTypes.h
+++ b/src/librbd/WatchNotifyTypes.h
@@ -4,7 +4,7 @@
 #define LIBRBD_WATCH_NOTIFY_TYPES_H
 
 #include "include/int_types.h"
-#include "include/buffer.h"
+#include "include/buffer_fwd.h"
 #include "include/encoding.h"
 #include <iosfwd>
 #include <list>
@@ -16,7 +16,7 @@ class Formatter;
 }
 
 namespace librbd {
-namespace WatchNotify {
+namespace watch_notify {
 
 struct ClientId {
   uint64_t gid;
@@ -84,7 +84,10 @@ enum NotifyOp {
   NOTIFY_OP_SNAP_CREATE        = 8,
   NOTIFY_OP_SNAP_REMOVE        = 9,
   NOTIFY_OP_REBUILD_OBJECT_MAP = 10,
-  NOTIFY_OP_SNAP_RENAME = 11
+  NOTIFY_OP_SNAP_RENAME        = 11,
+  NOTIFY_OP_SNAP_PROTECT       = 12,
+  NOTIFY_OP_SNAP_UNPROTECT     = 13,
+  NOTIFY_OP_RENAME             = 14
 };
 
 struct AcquiredLockPayload {
@@ -134,14 +137,26 @@ struct HeaderUpdatePayload {
   void dump(Formatter *f) const;
 };
 
-struct AsyncProgressPayload {
+struct AsyncRequestPayloadBase {  // shared base for notify payloads that carry an AsyncRequestId
+public:
+  AsyncRequestId async_request_id;
+
+  void encode(bufferlist &bl) const;
+  void decode(__u8 version, bufferlist::iterator &iter);
+  void dump(Formatter *f) const;
+
+protected:  // constructors are protected, so only derived payload types can construct this base
+  AsyncRequestPayloadBase() {}
+  AsyncRequestPayloadBase(const AsyncRequestId &id) : async_request_id(id) {}
+};
+
+struct AsyncProgressPayload : public AsyncRequestPayloadBase {
   static const NotifyOp NOTIFY_OP = NOTIFY_OP_ASYNC_PROGRESS;
 
   AsyncProgressPayload() : offset(0), total(0) {}
   AsyncProgressPayload(const AsyncRequestId &id, uint64_t offset_, uint64_t total_)
-    : async_request_id(id), offset(offset_), total(total_) {}
+    : AsyncRequestPayloadBase(id), offset(offset_), total(total_) {}
 
-  AsyncRequestId async_request_id;
   uint64_t offset;
   uint64_t total;
 
@@ -150,14 +165,13 @@ struct AsyncProgressPayload {
   void dump(Formatter *f) const;
 };
 
-struct AsyncCompletePayload {
+struct AsyncCompletePayload : public AsyncRequestPayloadBase {
   static const NotifyOp NOTIFY_OP = NOTIFY_OP_ASYNC_COMPLETE;
 
   AsyncCompletePayload() {}
   AsyncCompletePayload(const AsyncRequestId &id, int r)
-    : async_request_id(id), result(r) {}
+    : AsyncRequestPayloadBase(id), result(r) {}
 
-  AsyncRequestId async_request_id;
   int result;
 
   void encode(bufferlist &bl) const;
@@ -165,82 +179,97 @@ struct AsyncCompletePayload {
   void dump(Formatter *f) const;
 };
 
-struct FlattenPayload {
+struct FlattenPayload : public AsyncRequestPayloadBase {
   static const NotifyOp NOTIFY_OP = NOTIFY_OP_FLATTEN;
 
   FlattenPayload() {}
-  FlattenPayload(const AsyncRequestId &id) : async_request_id(id) {}
-
-  AsyncRequestId async_request_id;
-
-  void encode(bufferlist &bl) const;
-  void decode(__u8 version, bufferlist::iterator &iter);
-  void dump(Formatter *f) const;
+  FlattenPayload(const AsyncRequestId &id) : AsyncRequestPayloadBase(id) {}
 };
 
-struct ResizePayload {
+struct ResizePayload : public AsyncRequestPayloadBase {
   static const NotifyOp NOTIFY_OP = NOTIFY_OP_RESIZE;
 
   ResizePayload() : size(0) {}
   ResizePayload(uint64_t size_, const AsyncRequestId &id)
-    : size(size_), async_request_id(id) {}
+    : AsyncRequestPayloadBase(id), size(size_) {}
 
   uint64_t size;
-  AsyncRequestId async_request_id;
 
   void encode(bufferlist &bl) const;
   void decode(__u8 version, bufferlist::iterator &iter);
   void dump(Formatter *f) const;
 };
 
-struct SnapCreatePayload {
-  static const NotifyOp NOTIFY_OP = NOTIFY_OP_SNAP_CREATE;
-
-  SnapCreatePayload() {}
-  SnapCreatePayload(const std::string &name) : snap_name(name) {}
-
+struct SnapPayloadBase {
+public:
   std::string snap_name;
 
   void encode(bufferlist &bl) const;
   void decode(__u8 version, bufferlist::iterator &iter);
   void dump(Formatter *f) const;
+
+protected:
+  SnapPayloadBase() {}
+  SnapPayloadBase(const std::string &name) : snap_name(name) {}
+};
+
+struct SnapCreatePayload : public SnapPayloadBase {
+  static const NotifyOp NOTIFY_OP = NOTIFY_OP_SNAP_CREATE;
+
+  SnapCreatePayload() {}
+  SnapCreatePayload(const std::string &name) : SnapPayloadBase(name) {}
 };
 
-struct SnapRenamePayload {
+struct SnapRenamePayload : public SnapPayloadBase {
   static const NotifyOp NOTIFY_OP = NOTIFY_OP_SNAP_RENAME;
 
   SnapRenamePayload() {}
   SnapRenamePayload(const uint64_t &src_snap_id, const std::string &dst_name)
-    : src_snap_id(src_snap_id), dst_snap_name(dst_name) {}
+    : SnapPayloadBase(dst_name), snap_id(src_snap_id) {}
 
-  uint64_t src_snap_id;
-  std::string dst_snap_name;
+  uint64_t snap_id;
 
   void encode(bufferlist &bl) const;
   void decode(__u8 version, bufferlist::iterator &iter);
   void dump(Formatter *f) const;
 };
 
-struct SnapRemovePayload {
+struct SnapRemovePayload : public SnapPayloadBase {
   static const NotifyOp NOTIFY_OP = NOTIFY_OP_SNAP_REMOVE;
 
   SnapRemovePayload() {}
-  SnapRemovePayload(const std::string &name) : snap_name(name) {}
+  SnapRemovePayload(const std::string &name) : SnapPayloadBase(name) {}
+};
 
-  std::string snap_name;
+struct SnapProtectPayload : public SnapPayloadBase {
+  static const NotifyOp NOTIFY_OP = NOTIFY_OP_SNAP_PROTECT;
 
-  void encode(bufferlist &bl) const;
-  void decode(__u8 version, bufferlist::iterator &iter);
-  void dump(Formatter *f) const;
+  SnapProtectPayload() {}
+  SnapProtectPayload(const std::string &name) : SnapPayloadBase(name) {}
+};
+
+struct SnapUnprotectPayload : public SnapPayloadBase {
+  static const NotifyOp NOTIFY_OP = NOTIFY_OP_SNAP_UNPROTECT;
+
+  SnapUnprotectPayload() {}
+  SnapUnprotectPayload(const std::string &name) : SnapPayloadBase(name) {}
 };
 
-struct RebuildObjectMapPayload {
+struct RebuildObjectMapPayload : public AsyncRequestPayloadBase {
   static const NotifyOp NOTIFY_OP = NOTIFY_OP_REBUILD_OBJECT_MAP;
 
   RebuildObjectMapPayload() {}
-  RebuildObjectMapPayload(const AsyncRequestId &id) : async_request_id(id) {}
+  RebuildObjectMapPayload(const AsyncRequestId &id)
+    : AsyncRequestPayloadBase(id) {}
+};
 
-  AsyncRequestId async_request_id;
+struct RenamePayload {
+  static const NotifyOp NOTIFY_OP = NOTIFY_OP_RENAME;
+
+  RenamePayload() {}
+  RenamePayload(const std::string &_image_name) : image_name(_image_name) {}  // const-ref (as in SnapPayloadBase) avoids an extra copy
+
+  std::string image_name;
 
   void encode(bufferlist &bl) const;
   void decode(__u8 version, bufferlist::iterator &iter);
@@ -266,7 +295,10 @@ typedef boost::variant<AcquiredLockPayload,
                        SnapCreatePayload,
                        SnapRemovePayload,
                        SnapRenamePayload,
+                       SnapProtectPayload,
+                       SnapUnprotectPayload,
                        RebuildObjectMapPayload,
+                       RenamePayload,
                        UnknownPayload> Payload;
 
 struct NotifyMessage {
@@ -295,19 +327,19 @@ struct ResponseMessage {
   static void generate_test_instances(std::list<ResponseMessage *> &o);
 };
 
-} // namespace WatchNotify
+} // namespace watch_notify
 } // namespace librbd
 
 std::ostream &operator<<(std::ostream &out,
-                         const librbd::WatchNotify::NotifyOp &op);
+                         const librbd::watch_notify::NotifyOp &op);
 std::ostream &operator<<(std::ostream &out,
-                         const librbd::WatchNotify::ClientId &client);
+                         const librbd::watch_notify::ClientId &client);
 std::ostream &operator<<(std::ostream &out,
-                         const librbd::WatchNotify::AsyncRequestId &request);
+                         const librbd::watch_notify::AsyncRequestId &request);
 
-WRITE_CLASS_ENCODER(librbd::WatchNotify::ClientId);
-WRITE_CLASS_ENCODER(librbd::WatchNotify::AsyncRequestId);
-WRITE_CLASS_ENCODER(librbd::WatchNotify::NotifyMessage);
-WRITE_CLASS_ENCODER(librbd::WatchNotify::ResponseMessage);
+WRITE_CLASS_ENCODER(librbd::watch_notify::ClientId);
+WRITE_CLASS_ENCODER(librbd::watch_notify::AsyncRequestId);
+WRITE_CLASS_ENCODER(librbd::watch_notify::NotifyMessage);
+WRITE_CLASS_ENCODER(librbd::watch_notify::ResponseMessage);
 
 #endif // LIBRBD_WATCH_NOTIFY_TYPES_H
diff --git a/src/librbd/exclusive_lock/AcquireRequest.cc b/src/librbd/exclusive_lock/AcquireRequest.cc
new file mode 100644
index 0000000..ad59c04
--- /dev/null
+++ b/src/librbd/exclusive_lock/AcquireRequest.cc
@@ -0,0 +1,456 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/exclusive_lock/AcquireRequest.h"
+#include "cls/lock/cls_lock_client.h"
+#include "cls/lock/cls_lock_types.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "common/WorkQueue.h"
+#include "include/stringify.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Journal.h"
+#include "librbd/ObjectMap.h"
+#include "librbd/Utils.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::exclusive_lock::AcquireRequest: "
+
+namespace librbd {
+namespace exclusive_lock {
+
+using util::create_async_context_callback;
+using util::create_context_callback;
+using util::create_rados_ack_callback;
+using util::create_rados_safe_callback;
+
+namespace {
+
+// Blacklists the locker address, then completes on_finish with the result.
+template <typename I>
+struct C_BlacklistClient : public Context {
+  I &image_ctx;
+  std::string locker_address;
+  Context *on_finish;
+
+  C_BlacklistClient(I &ictx, const std::string &address, Context *ctx)
+    : image_ctx(ictx), locker_address(address), on_finish(ctx) {
+  }
+
+  // the incoming result code is ignored; only the blacklist result matters
+  virtual void finish(int r) override {
+    librados::Rados rados_client(image_ctx.md_ctx);
+    r = rados_client.blacklist_add(locker_address,
+                                   image_ctx.blacklist_expire_seconds);
+    on_finish->complete(r);
+  }
+};
+
+} // anonymous namespace
+
+// Factory: heap-allocates a request (the constructor is private).
+template <typename I>
+AcquireRequest<I>* AcquireRequest<I>::create(
+    I &image_ctx, const std::string &cookie, Context *on_acquire,
+    Context *on_finish) {
+  return new AcquireRequest<I>(image_ctx, cookie, on_acquire, on_finish);
+}
+
+template <typename I>
+AcquireRequest<I>::AcquireRequest(I &image_ctx, const std::string &cookie,
+                                  Context *on_acquire, Context *on_finish)
+  : m_image_ctx(image_ctx), m_cookie(cookie), m_on_acquire(on_acquire),
+    m_on_finish(create_async_context_callback(image_ctx, on_finish)),
+    m_watchers_ret_val(0), m_object_map(nullptr), m_journal(nullptr),
+    m_locker_handle(0), m_error_result(0) {}  // nothing left indeterminate
+
+template <typename I>
+AcquireRequest<I>::~AcquireRequest() {
+  delete m_on_acquire;  // non-null only if the callback was never fired
+}
+
+template <typename I>
+void AcquireRequest<I>::send() {
+  send_lock();  // the state machine starts by trying to take the lock
+}
+
+// Async exclusive cls lock on the image header; result goes to handle_lock().
+template <typename I>
+void AcquireRequest<I>::send_lock() {
+  ldout(m_image_ctx.cct, 10) << __func__ << dendl;
+
+  librados::ObjectWriteOperation lock_op;
+  rados::cls::lock::lock(&lock_op, RBD_LOCK_NAME, LOCK_EXCLUSIVE, m_cookie,
+                         ExclusiveLock<I>::WATCHER_LOCK_TAG, "", utime_t(), 0);
+
+  using Self = AcquireRequest<I>;
+  librados::AioCompletion *comp =
+    create_rados_safe_callback<Self, &Self::handle_lock>(this);
+  int r = m_image_ctx.md_ctx.aio_operate(m_image_ctx.header_oid, comp,
+                                         &lock_op);
+  assert(r == 0);
+  comp->release();
+}
+
+template <typename I>
+Context *AcquireRequest<I>::handle_lock(int *ret_val) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 10) << __func__ << ": r=" << *ret_val << dendl;
+  // -EBUSY: the lock is held by somebody else -- go find out who
+  if (*ret_val == -EBUSY) {
+    send_get_lockers();
+    return nullptr;
+  }
+  if (*ret_val != 0) {
+    lderr(cct) << "failed to lock: " << cpp_strerror(*ret_val) << dendl;
+    return m_on_finish;
+  }
+  return send_open_object_map();
+}
+
+template <typename I>
+Context *AcquireRequest<I>::send_open_journal() {
+  // alert caller that we now own the exclusive lock
+  m_on_acquire->complete(0);
+  m_on_acquire = nullptr;  // the destructor's delete is now a no-op
+  // journaling disabled: publish the object map (if any) and finish
+  if (!m_image_ctx.test_features(RBD_FEATURE_JOURNALING)) {
+    apply();
+    return m_on_finish;
+  }
+
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 10) << __func__ << dendl;
+
+  using klass = AcquireRequest<I>;
+  Context *ctx = create_context_callback<klass, &klass::handle_open_journal>(
+    this);
+  m_journal = m_image_ctx.create_journal();
+  // journal playback requires the object map (if enabled) and the journal
+  // itself to be installed in the image context first
+  apply();
+  // handle_open_journal() is the callback for the result of the open
+  m_journal->open(ctx);
+  return nullptr;
+}
+
+template <typename I>
+Context *AcquireRequest<I>::handle_open_journal(int *ret_val) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 10) << __func__ << ": r=" << *ret_val << dendl;
+  if (*ret_val >= 0) {
+    return m_on_finish;
+  }
+
+  // remember the failure: it is reported once the object map is unlocked
+  lderr(cct) << "failed to open journal: " << cpp_strerror(*ret_val) << dendl;
+  m_error_result = *ret_val;
+  return send_unlock_object_map();
+}
+
+template <typename I>
+Context *AcquireRequest<I>::send_open_object_map() {
+  // object map feature disabled: go straight to the journal step
+  if (!m_image_ctx.test_features(RBD_FEATURE_OBJECT_MAP)) {
+    return send_open_journal();
+  }
+
+  ldout(m_image_ctx.cct, 10) << __func__ << dendl;
+
+  using Self = AcquireRequest<I>;
+  Context *on_open =
+    create_context_callback<Self, &Self::handle_open_object_map>(this);
+  // create and open the map; handle_open_object_map() gets the result
+  m_object_map = m_image_ctx.create_object_map(CEPH_NOSNAP);
+  m_object_map->open(on_open);
+  return nullptr;
+}
+
+// Completion of the object map open; continues with LOCK_OBJECT_MAP.
+template <typename I>
+Context *AcquireRequest<I>::handle_open_object_map(int *ret_val) {
+  ldout(m_image_ctx.cct, 10) << __func__ << ": r=" << *ret_val << dendl;
+
+  // opening the object map is not expected to fail
+  assert(*ret_val == 0);
+  return send_lock_object_map();
+}
+
+// Requests the object map lock; handle_lock_object_map() gets the result.
+template <typename I>
+Context *AcquireRequest<I>::send_lock_object_map() {
+  ldout(m_image_ctx.cct, 10) << __func__ << dendl;
+
+  // only reachable after the object map has been created
+  assert(m_object_map != nullptr);
+
+  using Self = AcquireRequest<I>;
+  m_object_map->lock(
+    create_context_callback<Self, &Self::handle_lock_object_map>(this));
+  return nullptr;
+}
+
+// Completion of the object map lock; continues with OPEN_JOURNAL.
+template <typename I>
+Context *AcquireRequest<I>::handle_lock_object_map(int *ret_val) {
+  ldout(m_image_ctx.cct, 10) << __func__ << ": r=" << *ret_val << dendl;
+
+  // locking the object map is not expected to fail
+  assert(*ret_val == 0);
+  return send_open_journal();
+}
+
+template <typename I>
+Context *AcquireRequest<I>::send_unlock_object_map() {
+  // no object map to unlock: just undo apply() and finish
+  if (m_object_map == nullptr) {
+    revert();
+    return m_on_finish;
+  }
+
+  ldout(m_image_ctx.cct, 10) << __func__ << dendl;
+
+  using Self = AcquireRequest<I>;
+  Context *on_unlock =
+    create_context_callback<Self, &Self::handle_unlock_object_map>(this);
+  m_object_map->unlock(on_unlock);
+  return nullptr;
+}
+
+template <typename I>
+Context *AcquireRequest<I>::handle_unlock_object_map(int *ret_val) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 10) << __func__ << ": r=" << *ret_val << dendl;
+
+  // object map should never result in an error
+  assert(*ret_val == 0);
+  // report the failure that was saved in m_error_result instead
+  assert(m_error_result < 0);
+  *ret_val = m_error_result;
+  // detach and free the object map / journal installed by apply()
+  revert();
+  return m_on_finish;
+}
+
+// Async query of the lock holders; result goes to handle_get_lockers().
+template <typename I>
+void AcquireRequest<I>::send_get_lockers() {
+  ldout(m_image_ctx.cct, 10) << __func__ << dendl;
+
+  librados::ObjectReadOperation info_op;
+  rados::cls::lock::get_lock_info_start(&info_op, RBD_LOCK_NAME);
+
+  using Self = AcquireRequest<I>;
+  librados::AioCompletion *comp =
+    create_rados_ack_callback<Self, &Self::handle_get_lockers>(this);
+  m_out_bl.clear();
+  int r = m_image_ctx.md_ctx.aio_operate(m_image_ctx.header_oid, comp,
+                                         &info_op, &m_out_bl);
+  assert(r == 0);
+  comp->release();
+}
+
+template <typename I>
+Context *AcquireRequest<I>::handle_get_lockers(int *ret_val) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 10) << __func__ << ": r=" << *ret_val << dendl;
+  // decode the lock info reply (only when the read itself succeeded)
+  std::map<rados::cls::lock::locker_id_t,
+           rados::cls::lock::locker_info_t> lockers;
+  ClsLockType lock_type;
+  std::string lock_tag;
+  if (*ret_val == 0) {
+    bufferlist::iterator it = m_out_bl.begin();
+    *ret_val = rados::cls::lock::get_lock_info_finish(&it, &lockers,
+                                                      &lock_type, &lock_tag);
+  }
+
+  if (*ret_val < 0) {
+    lderr(cct) << "failed to retrieve lockers: " << cpp_strerror(*ret_val)
+               << dendl;
+    return m_on_finish;
+  }
+  // nobody holds the lock (any more): retry the lock request
+  if (lockers.empty()) {
+    ldout(cct, 20) << "no lockers detected" << dendl;
+    send_lock();
+    return nullptr;
+  }
+  // a different tag means the lock was not taken through this mechanism
+  if (lock_tag != ExclusiveLock<I>::WATCHER_LOCK_TAG) {
+    ldout(cct, 5) <<"locked by external mechanism: tag=" << lock_tag << dendl;
+    *ret_val = -EBUSY;
+    return m_on_finish;
+  }
+
+  if (lock_type == LOCK_SHARED) {
+    ldout(cct, 5) << "shared lock type detected" << dendl;
+    *ret_val = -EBUSY;
+    return m_on_finish;
+  }
+  // only the first locker entry is inspected
+  std::map<rados::cls::lock::locker_id_t,
+           rados::cls::lock::locker_info_t>::iterator iter = lockers.begin();
+  if (!ExclusiveLock<I>::decode_lock_cookie(iter->first.cookie,
+                                            &m_locker_handle)) {
+    ldout(cct, 5) << "locked by external mechanism: "
+                  << "cookie=" << iter->first.cookie << dendl;
+    *ret_val = -EBUSY;
+    return m_on_finish;
+  }
+  // remember the owner for the watcher check, blacklist and break-lock steps
+  m_locker_entity = iter->first.locker;
+  m_locker_cookie = iter->first.cookie;
+  m_locker_address = stringify(iter->second.addr);
+  if (m_locker_cookie.empty() || m_locker_address.empty()) {
+    ldout(cct, 20) << "no valid lockers detected" << dendl;
+    send_lock();
+    return nullptr;
+  }
+
+  ldout(cct, 10) << "retrieved exclusive locker: "
+                 << m_locker_entity << "@" << m_locker_address << dendl;
+  send_get_watchers();
+  return nullptr;
+}
+
+// Async listing of the header watchers; result goes to handle_get_watchers().
+template <typename I>
+void AcquireRequest<I>::send_get_watchers() {
+  ldout(m_image_ctx.cct, 10) << __func__ << dendl;
+
+  librados::ObjectReadOperation watch_op;
+  watch_op.list_watchers(&m_watchers, &m_watchers_ret_val);
+
+  using Self = AcquireRequest<I>;
+  librados::AioCompletion *comp =
+    create_rados_ack_callback<Self, &Self::handle_get_watchers>(this);
+  m_out_bl.clear();
+  int r = m_image_ctx.md_ctx.aio_operate(m_image_ctx.header_oid, comp,
+                                         &watch_op, &m_out_bl);
+  assert(r == 0);
+  comp->release();
+}
+
+template <typename I>
+Context *AcquireRequest<I>::handle_get_watchers(int *ret_val) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 10) << __func__ << ": r=" << *ret_val << dendl;
+
+  // the list_watchers result only matters if the operation itself succeeded
+  if (*ret_val == 0) {
+    *ret_val = m_watchers_ret_val;
+  }
+  if (*ret_val < 0) {
+    lderr(cct) << "failed to retrieve watchers: " << cpp_strerror(*ret_val)
+               << dendl;
+    return m_on_finish;
+  }
+
+  // owner is alive if it still watches from the same address and handle
+  for (auto &watcher : m_watchers) {
+    bool same_address = (strncmp(m_locker_address.c_str(), watcher.addr,
+                                 sizeof(watcher.addr)) == 0);
+    if (same_address && (m_locker_handle == watcher.cookie)) {
+      ldout(cct, 10) << "lock owner is still alive" << dendl;
+      *ret_val = -EAGAIN;
+      return m_on_finish;
+    }
+  }
+  send_blacklist();
+  return nullptr;
+}
+
+template <typename I>
+void AcquireRequest<I>::send_blacklist() {
+  // blacklisting is optional: when disabled, break the lock right away
+  if (!m_image_ctx.blacklist_on_break_lock) {
+    send_break_lock();
+    return;
+  }
+
+  ldout(m_image_ctx.cct, 10) << __func__ << dendl;
+
+  // TODO: need async version of RadosClient::blacklist_add
+  using Self = AcquireRequest<I>;
+  Context *on_blacklist =
+    create_context_callback<Self, &Self::handle_blacklist>(this);
+  C_BlacklistClient<I> *blacklist_ctx = new C_BlacklistClient<I>(
+    m_image_ctx, m_locker_address, on_blacklist);
+  m_image_ctx.op_work_queue->queue(blacklist_ctx, 0);
+}
+
+template <typename I>
+Context *AcquireRequest<I>::handle_blacklist(int *ret_val) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 10) << __func__ << ": r=" << *ret_val << dendl;
+  if (*ret_val >= 0) {
+    // owner is blacklisted: proceed to break its lock
+    send_break_lock();
+    return nullptr;
+  }
+  lderr(cct) << "failed to blacklist lock owner: " << cpp_strerror(*ret_val)
+             << dendl;
+  return m_on_finish;
+}
+
+// Breaks the recorded owner's lock asynchronously -> handle_break_lock().
+template <typename I>
+void AcquireRequest<I>::send_break_lock() {
+  ldout(m_image_ctx.cct, 10) << __func__ << dendl;
+
+  librados::ObjectWriteOperation break_op;
+  rados::cls::lock::break_lock(&break_op, RBD_LOCK_NAME, m_locker_cookie,
+                               m_locker_entity);
+
+  using Self = AcquireRequest<I>;
+  librados::AioCompletion *comp =
+    create_rados_safe_callback<Self, &Self::handle_break_lock>(this);
+  int r = m_image_ctx.md_ctx.aio_operate(m_image_ctx.header_oid, comp,
+                                         &break_op);
+  assert(r == 0);
+  comp->release();
+}
+
+template <typename I>
+Context *AcquireRequest<I>::handle_break_lock(int *ret_val) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 10) << __func__ << ": r=" << *ret_val << dendl;
+  if ((*ret_val < 0) && (*ret_val != -ENOENT)) {
+    lderr(cct) << "failed to break lock: " << cpp_strerror(*ret_val) << dendl;
+    return m_on_finish;
+  }
+  // -ENOENT is not treated as a failure: clear it and retry the lock
+  if (*ret_val == -ENOENT) {
+    *ret_val = 0;
+  }
+  send_lock();
+  return nullptr;
+}
+
+template <typename I>
+void AcquireRequest<I>::apply() {  // hand object map + journal to image ctx
+  RWLock::WLocker snap_locker(m_image_ctx.snap_lock);
+  assert(m_image_ctx.object_map == nullptr);
+  m_image_ctx.object_map = m_object_map;
+  // same for the journal: the image must not already have one
+  assert(m_image_ctx.journal == nullptr);
+  m_image_ctx.journal = m_journal;
+}
+
+template <typename I>
+void AcquireRequest<I>::revert() {  // undo apply() and free both objects
+  RWLock::WLocker snap_locker(m_image_ctx.snap_lock);
+  m_image_ctx.object_map = nullptr;
+  m_image_ctx.journal = nullptr;
+  // the image context no longer points at them, so they can be freed
+  delete m_object_map;
+  delete m_journal;
+}
+
+} // namespace exclusive_lock
+} // namespace librbd
+
+template class librbd::exclusive_lock::AcquireRequest<librbd::ImageCtx>;
diff --git a/src/librbd/exclusive_lock/AcquireRequest.h b/src/librbd/exclusive_lock/AcquireRequest.h
new file mode 100644
index 0000000..865b3c6
--- /dev/null
+++ b/src/librbd/exclusive_lock/AcquireRequest.h
@@ -0,0 +1,123 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_EXCLUSIVE_LOCK_ACQUIRE_REQUEST_H
+#define CEPH_LIBRBD_EXCLUSIVE_LOCK_ACQUIRE_REQUEST_H
+
+#include "include/int_types.h"
+#include "include/buffer.h"
+#include "include/rados/librados.hpp"
+#include "librbd/ImageCtx.h"
+#include "msg/msg_types.h"
+#include <map>
+#include <string>
+
+class Context;
+
+namespace librbd {
+
+class Journal;
+
+namespace exclusive_lock {
+
+template <typename ImageCtxT = ImageCtx>
+class AcquireRequest {
+public:
+  static AcquireRequest* create(ImageCtxT &image_ctx, const std::string &cookie,
+                                Context *on_acquire, Context *on_finish);
+  // the constructor is private: instances come from create()
+  ~AcquireRequest();
+  void send();
+
+private:
+  // every state below is implemented by a send_*/handle_* method pair
+  /**
+   * @verbatim
+   *
+   * <start>
+   *    |     /---------------------------------------------------------\
+   *    |     |                                                         |
+   *    |     |             (no lockers)                                |
+   *    |     |   . . . . . . . . . . . . . . . . . . . . .             |
+   *    |     |   .                                       .             |
+   *    |     v   v      (EBUSY)                          .             |
+   *    \--> LOCK_IMAGE * * * * * * * > GET_LOCKERS . . . .             |
+   *          .   |                       |                             |
+   *    . . . .   |                       |                             |
+   *    .         v                       v                             |
+   *    .     OPEN_OBJECT_MAP           GET_WATCHERS . . .              |
+   *    .         |                       |              .              |
+   *    .         v                       v              .              |
+   *    .     LOCK_OBJECT_MAP           BLACKLIST        . (blacklist   |
+   *    .         |                       |              .  disabled)   |
+   *    .         v                       v              .              |
+   *    . . > OPEN_JOURNAL * *          BREAK_LOCK < . . .              |
+   *    .         |          *            |                             |
+   *    .         |          v            |                             |
+   *    .         |    UNLOCK_OBJECT_MAP  |                             |
+   *    .         |          |            \-----------------------------/
+   *    .         v          |
+   *    . . > <finish> <-----/
+   * @endverbatim
+   */
+
+  AcquireRequest(ImageCtxT &image_ctx, const std::string &cookie,
+                 Context *on_acquire, Context *on_finish);
+
+  ImageCtxT &m_image_ctx;
+  std::string m_cookie;    // cookie used for our own lock request
+  Context *m_on_acquire;   // completed once the lock is held, then nulled
+  Context *m_on_finish;    // on_finish wrapped in an async callback
+
+  bufferlist m_out_bl;     // reply buffer of the async read operations
+
+  std::list<obj_watch_t> m_watchers;  // filled by list_watchers
+  int m_watchers_ret_val;             // result of list_watchers
+  // created by this request and handed to the image context by apply()
+  decltype(m_image_ctx.object_map) m_object_map;
+  decltype(m_image_ctx.journal) m_journal;
+  // current lock owner, as recorded by handle_get_lockers()
+  entity_name_t m_locker_entity;
+  std::string m_locker_cookie;
+  std::string m_locker_address;
+  uint64_t m_locker_handle;  // decoded from the owner's lock cookie
+
+  int m_error_result;  // journal-open error, reported after object map unlock
+
+  void send_lock();
+  Context *handle_lock(int *ret_val);
+
+  Context *send_open_journal();
+  Context *handle_open_journal(int *ret_val);
+
+  Context *send_open_object_map();
+  Context *handle_open_object_map(int *ret_val);
+
+  Context *send_lock_object_map();
+  Context *handle_lock_object_map(int *ret_val);
+
+  Context *send_unlock_object_map();
+  Context *handle_unlock_object_map(int *ret_val);
+
+  void send_get_lockers();
+  Context *handle_get_lockers(int *ret_val);
+
+  void send_get_watchers();
+  Context *handle_get_watchers(int *ret_val);
+
+  void send_blacklist();
+  Context *handle_blacklist(int *ret_val);
+
+  void send_break_lock();
+  Context *handle_break_lock(int *ret_val);
+  // install / remove the object map and journal in the image context
+  void apply();
+  void revert();
+};
+
+} // namespace exclusive_lock
+} // namespace librbd
+
+extern template class librbd::exclusive_lock::AcquireRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_EXCLUSIVE_LOCK_ACQUIRE_REQUEST_H
diff --git a/src/librbd/exclusive_lock/ReleaseRequest.cc b/src/librbd/exclusive_lock/ReleaseRequest.cc
new file mode 100644
index 0000000..3b65199
--- /dev/null
+++ b/src/librbd/exclusive_lock/ReleaseRequest.cc
@@ -0,0 +1,218 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/exclusive_lock/ReleaseRequest.h"
+#include "cls/lock/cls_lock_client.h"
+#include "cls/lock/cls_lock_types.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "common/WorkQueue.h"
+#include "include/stringify.h"
+#include "librbd/AioImageRequestWQ.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Journal.h"
+#include "librbd/ObjectMap.h"
+#include "librbd/Utils.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::exclusive_lock::ReleaseRequest: "
+
+namespace librbd {
+namespace exclusive_lock {
+
+using util::create_async_context_callback;
+using util::create_context_callback;
+using util::create_rados_safe_callback;
+
+// Factory: heap-allocates a request (the constructor is private).
+template <typename I>
+ReleaseRequest<I>* ReleaseRequest<I>::create(
+    I &image_ctx, const std::string &cookie, Context *on_releasing,
+    Context *on_finish) {
+  return new ReleaseRequest<I>(image_ctx, cookie, on_releasing, on_finish);
+}
+
+template <typename I>
+ReleaseRequest<I>::ReleaseRequest(I &image_ctx, const std::string &cookie,
+                                  Context *on_releasing, Context *on_finish)
+  : m_image_ctx(image_ctx), m_cookie(cookie), m_on_releasing(on_releasing),
+    m_on_finish(create_async_context_callback(image_ctx, on_finish)),
+    m_object_map(nullptr), m_journal(nullptr) {  // swapped in from image later
+}
+
+template <typename I>
+ReleaseRequest<I>::~ReleaseRequest() {
+  delete m_on_releasing;  // non-null only if the callback was never fired
+}
+
+template <typename I>
+void ReleaseRequest<I>::send() {
+  send_block_writes();  // the state machine starts by blocking writes
+}
+
+// Asks the AIO work queue to block writes; handle_block_writes() is notified.
+template <typename I>
+void ReleaseRequest<I>::send_block_writes() {
+  ldout(m_image_ctx.cct, 10) << __func__ << dendl;
+
+  using Self = ReleaseRequest<I>;
+  Context *on_blocked =
+    create_context_callback<Self, &Self::handle_block_writes>(this);
+
+  {
+    RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
+    m_image_ctx.aio_work_queue->block_writes(on_blocked);
+  }
+}
+
+template <typename I>
+Context *ReleaseRequest<I>::handle_block_writes(int *ret_val) {
+  ldout(m_image_ctx.cct, 10) << __func__ << ": r=" << *ret_val << dendl;
+
+  if (*ret_val >= 0) {
+    send_cancel_op_requests();
+    return nullptr;
+  }
+
+  // blocking failed: lift the block again and abort the release
+  m_image_ctx.aio_work_queue->unblock_writes();
+  return m_on_finish;
+}
+
+// Cancels the image's async requests; handle_cancel_op_requests() follows.
+template <typename I>
+void ReleaseRequest<I>::send_cancel_op_requests() {
+  ldout(m_image_ctx.cct, 10) << __func__ << dendl;
+
+  using Self = ReleaseRequest<I>;
+  Context *on_cancelled =
+    create_context_callback<Self, &Self::handle_cancel_op_requests>(this);
+  m_image_ctx.cancel_async_requests(on_cancelled);
+}
+
+template <typename I>
+Context *ReleaseRequest<I>::handle_cancel_op_requests(int *ret_val) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 10) << __func__ << ": r=" << *ret_val << dendl;
+  // cancelling the op requests is not expected to fail
+  assert(*ret_val == 0);
+
+  if (m_on_releasing != nullptr) {
+    // alert caller that we no longer own the exclusive lock
+    m_on_releasing->complete(0);
+    m_on_releasing = nullptr;  // the destructor's delete is now a no-op
+  }
+  // next state: CLOSE_JOURNAL
+  send_close_journal();
+  return nullptr;
+}
+
+template <typename I>
+void ReleaseRequest<I>::send_close_journal() {
+  // detach the journal from the image (m_journal starts out as nullptr)
+  {
+    RWLock::WLocker snap_locker(m_image_ctx.snap_lock);
+    std::swap(m_journal, m_image_ctx.journal);
+  }
+  if (m_journal == nullptr) {
+    // no journal to close
+    send_unlock_object_map();
+    return;
+  }
+
+  ldout(m_image_ctx.cct, 10) << __func__ << dendl;
+
+  using Self = ReleaseRequest<I>;
+  Context *on_closed =
+    create_context_callback<Self, &Self::handle_close_journal>(this);
+  m_journal->close(on_closed);
+}
+
+template <typename I>
+Context *ReleaseRequest<I>::handle_close_journal(int *ret_val) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 10) << __func__ << ": r=" << *ret_val << dendl;
+
+  if (*ret_val < 0) {
+    // error implies some journal events were not flushed -- continue
+    lderr(cct) << "failed to close journal: " << cpp_strerror(*ret_val)
+               << dendl;
+  }
+  // the journal was already detached from the image in send_close_journal()
+  delete m_journal;
+  // the release proceeds even when the close reported an error
+  send_unlock_object_map();
+  return nullptr;
+}
+
+template <typename I>
+void ReleaseRequest<I>::send_unlock_object_map() {
+  // detach the object map from the image (m_object_map starts out as nullptr)
+  {
+    RWLock::WLocker snap_locker(m_image_ctx.snap_lock);
+    std::swap(m_object_map, m_image_ctx.object_map);
+  }
+  if (m_object_map == nullptr) {
+    // no object map to unlock
+    send_unlock();
+    return;
+  }
+
+  ldout(m_image_ctx.cct, 10) << __func__ << dendl;
+
+  using Self = ReleaseRequest<I>;
+  Context *on_unlocked =
+    create_context_callback<Self, &Self::handle_unlock_object_map>(this);
+  m_object_map->unlock(on_unlocked);
+}
+
+// Object map unlock completed: free the map and continue with UNLOCK.
+template <typename I>
+Context *ReleaseRequest<I>::handle_unlock_object_map(int *ret_val) {
+  ldout(m_image_ctx.cct, 10) << __func__ << ": r=" << *ret_val << dendl;
+
+  // unlocking the object map is not expected to fail
+  assert(*ret_val == 0);
+  delete m_object_map;
+
+  send_unlock();
+  return nullptr;
+}
+
+// Async cls unlock of the image header; result goes to handle_unlock().
+template <typename I>
+void ReleaseRequest<I>::send_unlock() {
+  ldout(m_image_ctx.cct, 10) << __func__ << dendl;
+
+  librados::ObjectWriteOperation unlock_op;
+  rados::cls::lock::unlock(&unlock_op, RBD_LOCK_NAME, m_cookie);
+
+  using Self = ReleaseRequest<I>;
+  librados::AioCompletion *comp =
+    create_rados_safe_callback<Self, &Self::handle_unlock>(this);
+  int r = m_image_ctx.md_ctx.aio_operate(m_image_ctx.header_oid, comp,
+                                         &unlock_op);
+  assert(r == 0);
+  comp->release();
+}
+
+template <typename I>
+Context *ReleaseRequest<I>::handle_unlock(int *ret_val) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 10) << __func__ << ": r=" << *ret_val << dendl;
+  // anything but -ENOENT is logged; no error is propagated to the caller
+  if (*ret_val < 0 && *ret_val != -ENOENT) {
+    lderr(cct) << "failed to unlock: " << cpp_strerror(*ret_val) << dendl;
+  }
+
+  // treat errors as if the image is unlocked
+  *ret_val = 0;
+  return m_on_finish;
+}
+
+} // namespace exclusive_lock
+} // namespace librbd
+
+template class librbd::exclusive_lock::ReleaseRequest<librbd::ImageCtx>;
diff --git a/src/librbd/exclusive_lock/ReleaseRequest.h b/src/librbd/exclusive_lock/ReleaseRequest.h
new file mode 100644
index 0000000..b23e6c7
--- /dev/null
+++ b/src/librbd/exclusive_lock/ReleaseRequest.h
@@ -0,0 +1,89 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_EXCLUSIVE_LOCK_RELEASE_REQUEST_H
+#define CEPH_LIBRBD_EXCLUSIVE_LOCK_RELEASE_REQUEST_H
+
+#include "include/int_types.h"
+#include "librbd/ImageCtx.h"
+#include <string>
+
+class Context;
+
+namespace librbd {
+
+class ImageCtx;
+class Journal;
+
+namespace exclusive_lock {
+
+template <typename ImageCtxT = ImageCtx>
+class ReleaseRequest {
+public:
+  static ReleaseRequest* create(ImageCtxT &image_ctx, const std::string &cookie,
+                                Context *on_releasing, Context *on_finish);
+  // the constructor is private: instances come from create()
+  ~ReleaseRequest();
+  void send();
+
+private:
+  /**
+   * @verbatim
+   *
+   * <start>
+   *    |
+   *    v
+   * BLOCK_WRITES
+   *    |
+   *    v
+   * CANCEL_OP_REQUESTS . . . . . . . . . . . .
+   *    |                                     .
+   *    v                                     .
+   * CLOSE_JOURNAL                            .
+   *    |                (journal disabled,   .
+   *    v                 object map enabled) .
+   * UNLOCK_OBJECT_MAP  < . . . . . . . . . . .
+   *    |                                     .
+   *    v               (object map disabled) .
+   * UNLOCK < . . . . . . . . . . . . . . . . .
+   *    |
+   *    v
+   * <finish>
+   *
+   * @endverbatim
+   */
+
+  ReleaseRequest(ImageCtxT &image_ctx, const std::string &cookie,
+                 Context *on_releasing, Context *on_finish);
+
+  ImageCtxT &m_image_ctx;
+  std::string m_cookie;      // cookie of the lock that is released
+  Context *m_on_releasing;   // fired once op requests are cancelled
+  Context *m_on_finish;      // on_finish wrapped in an async callback
+  // swapped out of the image context before being closed / unlocked
+  decltype(m_image_ctx.object_map) m_object_map;
+  decltype(m_image_ctx.journal) m_journal;
+  // one send_*/handle_* pair per state of the diagram above
+  void send_block_writes();
+  Context *handle_block_writes(int *ret_val);
+
+  void send_cancel_op_requests();
+  Context *handle_cancel_op_requests(int *ret_val);
+
+  void send_close_journal();
+  Context *handle_close_journal(int *ret_val);
+
+  void send_unlock_object_map();
+  Context *handle_unlock_object_map(int *ret_val);
+
+  void send_unlock();
+  Context *handle_unlock(int *ret_val);
+
+};
+
+} // namespace exclusive_lock
+} // namespace librbd
+
+extern template class librbd::exclusive_lock::ReleaseRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_EXCLUSIVE_LOCK_RELEASE_REQUEST_H
diff --git a/src/librbd/image/CloseRequest.cc b/src/librbd/image/CloseRequest.cc
new file mode 100644
index 0000000..57f04a7
--- /dev/null
+++ b/src/librbd/image/CloseRequest.cc
@@ -0,0 +1,243 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/image/CloseRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "common/WorkQueue.h"
+#include "librbd/AioImageRequestWQ.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ImageWatcher.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
+#include "librbd/ImageWatcher.h"
+#include "librbd/ObjectMap.h"
+#include "librbd/Utils.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::image::CloseRequest: "
+
+namespace librbd {
+namespace image {
+
+
+using util::create_async_context_callback;
+using util::create_context_callback;
+
+// Binds the request to the image being closed and to the context that
+// finish() completes; nothing is started until send() is called.
+template <typename I>
+CloseRequest<I>::CloseRequest(I *image_ctx, Context *on_finish)
+  : m_image_ctx(image_ctx),
+    m_on_finish(on_finish),
+    m_error_result(0),
+    m_exclusive_lock(nullptr) {
+  // every state dereferences the image context, so reject null up front
+  assert(m_image_ctx != nullptr);
+}
+
+// Starts the close state machine.
+// NOTE: the UNREGISTER_IMAGE_WATCHER step is currently bypassed (see the
+// TODO below); the sequence begins with shutting down the AIO work queue and
+// the watch is unregistered in finish() instead.
+template <typename I>
+void CloseRequest<I>::send() {
+  // TODO
+  send_shut_down_aio_queue();
+  //send_unregister_image_watcher();
+}
+
+// UNREGISTER_IMAGE_WATCHER step (currently a stub).
+// NOTE(review): this only logs; it neither issues a request nor schedules
+// handle_unregister_image_watcher(), and send() does not call it. If it is
+// ever wired in as-is the state machine would stop here.
+template <typename I>
+void CloseRequest<I>::send_unregister_image_watcher() {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 10) << this << " " << __func__ << dendl;
+
+  // prevent incoming requests from our peers
+
+}
+
+// Completion of UNREGISTER_IMAGE_WATCHER: logs and records a failure, then
+// continues with shutting down the AIO work queue regardless of the result.
+template <typename I>
+void CloseRequest<I>::handle_unregister_image_watcher(int r) {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 10) << this << " " << __func__ << ": r=" << r << dendl;
+
+  if (r < 0) {
+    lderr(cct) << "failed to unregister image watcher: " << cpp_strerror(r)
+               << dendl;
+  }
+  // only the first error seen by the state machine is kept
+  save_result(r);
+
+  send_shut_down_aio_queue();
+}
+
+// SHUT_DOWN_AIO_WORK_QUEUE: asks the image's AIO work queue to shut down
+// while owner_lock is held for read; handle_shut_down_aio_queue() is the
+// completion callback.
+template <typename I>
+void CloseRequest<I>::send_shut_down_aio_queue() {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 10) << this << " " << __func__ << dendl;
+
+  using klass = CloseRequest<I>;
+  RWLock::RLocker owner_locker(m_image_ctx->owner_lock);
+  Context *on_shut_down =
+    create_context_callback<klass, &klass::handle_shut_down_aio_queue>(this);
+  m_image_ctx->aio_work_queue->shut_down(on_shut_down);
+}
+
+// Completion of SHUT_DOWN_AIO_WORK_QUEUE. The result code is only logged
+// (it is not passed to save_result); the state machine always continues with
+// the exclusive lock shut down.
+template <typename I>
+void CloseRequest<I>::handle_shut_down_aio_queue(int r) {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 10) << this << " " << __func__ << ": r=" << r << dendl;
+
+  send_shut_down_exclusive_lock();
+}
+
+// SHUT_DOWN_EXCLUSIVE_LOCK: detaches the exclusive lock from the image and
+// shuts it down. When the image has no exclusive lock, the object map is
+// freed here and the state machine jumps directly to FLUSH.
+template <typename I>
+void CloseRequest<I>::send_shut_down_exclusive_lock() {
+  {
+    RWLock::WLocker owner_locker(m_image_ctx->owner_lock);
+    RWLock::WLocker snap_locker(m_image_ctx->snap_lock);
+    // m_exclusive_lock starts out as nullptr, so after the swap this request
+    // owns the lock instance and the image no longer references one
+    std::swap(m_exclusive_lock, m_image_ctx->exclusive_lock);
+
+    if (m_exclusive_lock == nullptr) {
+      // no lock shut down will run, so release the object map here
+      delete m_image_ctx->object_map;
+      m_image_ctx->object_map = nullptr;
+    }
+  }
+
+  // locks are dropped before moving to the next state
+  if (m_exclusive_lock == nullptr) {
+    send_flush();
+    return;
+  }
+
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 10) << this << " " << __func__ << dendl;
+
+  m_exclusive_lock->shut_down(create_context_callback<
+    CloseRequest<I>, &CloseRequest<I>::handle_shut_down_exclusive_lock>(this));
+}
+
+// Completion of SHUT_DOWN_EXCLUSIVE_LOCK: destroys the lock instance taken
+// over in send_shut_down_exclusive_lock(), records any failure and continues
+// with FLUSH_READAHEAD (the FLUSH step is not run on this path).
+template <typename I>
+void CloseRequest<I>::handle_shut_down_exclusive_lock(int r) {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 10) << this << " " << __func__ << ": r=" << r << dendl;
+
+  // object map and journal closed during exclusive lock shutdown
+  assert(m_image_ctx->journal == nullptr);
+  assert(m_image_ctx->object_map == nullptr);
+  delete m_exclusive_lock;
+
+  save_result(r);
+  if (r < 0) {
+    lderr(cct) << "failed to shut down exclusive lock: " << cpp_strerror(r)
+               << dendl;
+  }
+  send_flush_readahead();
+}
+
+// FLUSH: flushes the image while owner_lock is held for read. The completion
+// is wrapped in an async callback bound to this image context before it
+// reaches handle_flush().
+template <typename I>
+void CloseRequest<I>::send_flush() {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 10) << this << " " << __func__ << dendl;
+
+  using klass = CloseRequest<I>;
+  RWLock::RLocker owner_locker(m_image_ctx->owner_lock);
+  Context *on_flushed = create_async_context_callback(
+    *m_image_ctx,
+    create_context_callback<klass, &klass::handle_flush>(this));
+  m_image_ctx->flush(on_flushed);
+}
+
+// Completion of FLUSH. A failure is logged but not recorded via
+// save_result(), so it does not affect the result passed to on_finish.
+template <typename I>
+void CloseRequest<I>::handle_flush(int r) {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 10) << this << " " << __func__ << ": r=" << r << dendl;
+
+  if (r < 0) {
+    lderr(cct) << "failed to flush IO: " << cpp_strerror(r) << dendl;
+  }
+  send_flush_readahead();
+}
+
+// FLUSH_READAHEAD: waits for pending readahead on the image;
+// handle_flush_readahead() is the completion callback.
+template <typename I>
+void CloseRequest<I>::send_flush_readahead() {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 10) << this << " " << __func__ << dendl;
+
+  using klass = CloseRequest<I>;
+  Context *on_idle =
+    create_context_callback<klass, &klass::handle_flush_readahead>(this);
+  m_image_ctx->readahead.wait_for_pending(on_idle);
+}
+
+// Completion of FLUSH_READAHEAD. The result code is only logged; the state
+// machine always continues with the cache shut down.
+template <typename I>
+void CloseRequest<I>::handle_flush_readahead(int r) {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 10) << this << " " << __func__ << ": r=" << r << dendl;
+
+  send_shut_down_cache();
+}
+
+// SHUT_DOWN_CACHE: asks the image context to shut down its cache;
+// handle_shut_down_cache() is the completion callback.
+template <typename I>
+void CloseRequest<I>::send_shut_down_cache() {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 10) << this << " " << __func__ << dendl;
+
+  using klass = CloseRequest<I>;
+  Context *on_cache_down =
+    create_context_callback<klass, &klass::handle_shut_down_cache>(this);
+  m_image_ctx->shut_down_cache(on_cache_down);
+}
+
+// Completion of SHUT_DOWN_CACHE: logs and records a failure, then continues
+// with flushing the op work queue.
+template <typename I>
+void CloseRequest<I>::handle_shut_down_cache(int r) {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 10) << this << " " << __func__ << ": r=" << r << dendl;
+
+  if (r < 0) {
+    lderr(cct) << "failed to shut down cache: " << cpp_strerror(r) << dendl;
+  }
+  // only the first error seen by the state machine is kept
+  save_result(r);
+
+  send_flush_op_work_queue();
+}
+
+// FLUSH_OP_WORK_QUEUE: queues a completion (with result 0) on the image's op
+// work queue; handle_flush_op_work_queue() runs once the queue reaches it.
+template <typename I>
+void CloseRequest<I>::send_flush_op_work_queue() {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 10) << this << " " << __func__ << dendl;
+
+  using klass = CloseRequest<I>;
+  Context *on_drained =
+    create_context_callback<klass, &klass::handle_flush_op_work_queue>(this);
+  m_image_ctx->op_work_queue->queue(on_drained, 0);
+}
+
+// Completion of FLUSH_OP_WORK_QUEUE. The result code is only logged; the
+// state machine continues with closing the parent image (if any).
+template <typename I>
+void CloseRequest<I>::handle_flush_op_work_queue(int r) {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 10) << this << " " << __func__ << ": r=" << r << dendl;
+  send_close_parent();
+}
+
+// CLOSE_PARENT: closes the parent image through its own state object, or
+// goes straight to finish() when the image has no parent. The completion is
+// wrapped in an async callback bound to this (the child) image context.
+template <typename I>
+void CloseRequest<I>::send_close_parent() {
+  if (m_image_ctx->parent == nullptr) {
+    finish();
+    return;
+  }
+
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 10) << this << " " << __func__ << dendl;
+
+  using klass = CloseRequest<I>;
+  Context *on_parent_closed = create_async_context_callback(
+    *m_image_ctx,
+    create_context_callback<klass, &klass::handle_close_parent>(this));
+  m_image_ctx->parent->state->close(on_parent_closed);
+}
+
+// Completion of CLOSE_PARENT: destroys the parent image context (even when
+// closing it failed), records any failure and finishes the request.
+template <typename I>
+void CloseRequest<I>::handle_close_parent(int r) {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 10) << this << " " << __func__ << ": r=" << r << dendl;
+
+  // the image context outlives this request: do not leave a dangling
+  // pointer to the destroyed parent behind
+  delete m_image_ctx->parent;
+  m_image_ctx->parent = nullptr;
+
+  save_result(r);
+  if (r < 0) {
+    lderr(cct) << "error closing parent image: " << cpp_strerror(r) << dendl;
+  }
+  finish();
+}
+
+// Final step: unregisters the watch if the image has an image watcher,
+// completes the caller's context with the first recorded error (0 if none)
+// and destroys this request.
+template <typename I>
+void CloseRequest<I>::finish() {
+  if (m_image_ctx->image_watcher) {
+    m_image_ctx->unregister_watch();
+  }
+
+  // m_error_result is read before the request deletes itself
+  m_on_finish->complete(m_error_result);
+  delete this;
+}
+
+} // namespace image
+} // namespace librbd
+
+template class librbd::image::CloseRequest<librbd::ImageCtx>;
diff --git a/src/librbd/image/CloseRequest.h b/src/librbd/image/CloseRequest.h
new file mode 100644
index 0000000..08ace95
--- /dev/null
+++ b/src/librbd/image/CloseRequest.h
@@ -0,0 +1,110 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IMAGE_CLOSE_REQUEST_H
+#define CEPH_LIBRBD_IMAGE_CLOSE_REQUEST_H
+
+#include "include/int_types.h"
+#include "librbd/ImageCtx.h"
+
+class Context;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace image {
+
+// Asynchronous state machine that closes an image. Instances are created on
+// the heap via create() and delete themselves in finish() after completing
+// on_finish with the first error recorded by save_result() (0 if none).
+template <typename ImageCtxT = ImageCtx>
+class CloseRequest {
+public:
+  // Allocates a request; image_ctx must not be null (asserted in the ctor).
+  static CloseRequest *create(ImageCtxT *image_ctx, Context *on_finish) {
+    return new CloseRequest(image_ctx, on_finish);
+  }
+
+  // Starts the state machine.
+  void send();
+
+private:
+  /**
+   * @verbatim
+   *
+   * <start>
+   *    |
+   *    v
+   * SHUT_DOWN_AIO_WORK_QUEUE . . .
+   *    |                         .
+   *    v                         . (exclusive lock
+   * SHUT_DOWN_EXCLUSIVE_LOCK     .  disabled)
+   *    |                         v
+   *    |                       FLUSH
+   *    |                         .
+   *    v                         .
+   * FLUSH_READAHEAD  < . . . . . .
+   *    |
+   *    v
+   * SHUT_DOWN_CACHE
+   *    |
+   *    v
+   * FLUSH_OP_WORK_QUEUE  . . . . .
+   *    |                         .
+   *    v                         .
+   * CLOSE_PARENT                 . (no parent)
+   *    |                         .
+   *    v                         .
+   * <finish> < . . . . . . . . . .
+   *
+   * @endverbatim
+   *
+   * UNREGISTER_IMAGE_WATCHER is declared below but is not yet part of the
+   * sequence started by send(); the watch is unregistered in finish().
+   */
+
+  CloseRequest(ImageCtxT *image_ctx, Context *on_finish);
+
+  ImageCtxT *m_image_ctx;   // image being closed
+  Context *m_on_finish;     // completed by finish()
+
+  int m_error_result;       // first negative result passed to save_result()
+
+  // exclusive lock instance taken over from the image while it is shut down
+  decltype(m_image_ctx->exclusive_lock) m_exclusive_lock;
+
+  void send_unregister_image_watcher();
+  void handle_unregister_image_watcher(int r);
+
+  void send_shut_down_aio_queue();
+  void handle_shut_down_aio_queue(int r);
+
+  void send_shut_down_exclusive_lock();
+  void handle_shut_down_exclusive_lock(int r);
+
+  void send_flush();
+  void handle_flush(int r);
+
+  void send_flush_readahead();
+  void handle_flush_readahead(int r);
+
+  void send_shut_down_cache();
+  void handle_shut_down_cache(int r);
+
+  void send_flush_op_work_queue();
+  void handle_flush_op_work_queue(int r);
+
+  void send_close_parent();
+  void handle_close_parent(int r);
+
+  void finish();
+
+  // Remembers the first error only; later errors and successes are ignored.
+  void save_result(int result) {
+    if (m_error_result == 0 && result < 0) {
+      m_error_result = result;
+    }
+  }
+};
+
+} // namespace image
+} // namespace librbd
+
+extern template class librbd::image::CloseRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_IMAGE_CLOSE_REQUEST_H
diff --git a/src/librbd/image/OpenRequest.cc b/src/librbd/image/OpenRequest.cc
new file mode 100644
index 0000000..cb0979a
--- /dev/null
+++ b/src/librbd/image/OpenRequest.cc
@@ -0,0 +1,375 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/image/OpenRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "common/WorkQueue.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Utils.h"
+#include "librbd/image/CloseRequest.h"
+#include "librbd/image/RefreshRequest.h"
+#include "librbd/image/SetSnapRequest.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::image::OpenRequest: "
+
+namespace librbd {
+namespace image {
+
+namespace {
+
+// Context that performs the (synchronous) watch registration when it is
+// run, and forwards the registration result to on_finish.
+template <typename I>
+class C_RegisterWatch : public Context {
+public:
+  I &image_ctx;
+  Context *on_finish;
+
+  C_RegisterWatch(I &image_ctx, Context *on_finish)
+    : image_ctx(image_ctx), on_finish(on_finish) {
+  }
+
+  virtual void finish(int r) {
+    // this context is only expected to be completed with success; the value
+    // reported downstream is the return code of register_watch()
+    assert(r == 0);
+    on_finish->complete(image_ctx.register_watch());
+  }
+};
+
+} // anonymous namespace
+
+using util::create_context_callback;
+using util::create_rados_ack_callback;
+
+// Binds the request to the image context being opened and to the context
+// completed at the end of the state machine; send() starts the work.
+template <typename I>
+OpenRequest<I>::OpenRequest(I *image_ctx, Context *on_finish)
+  : m_image_ctx(image_ctx),
+    m_on_finish(on_finish),
+    m_error_result(0) {
+}
+
+// Starts the open state machine. Detection always begins with the v2
+// format; the v1 header is only probed from handle_v2_detect_header().
+template <typename I>
+void OpenRequest<I>::send() {
+  send_v2_detect_header();
+}
+
+// V1_DETECT_HEADER: stats the old-format header object for this image name.
+// No stat outputs are requested, so only the result code delivered to
+// handle_v1_detect_header() matters.
+template <typename I>
+void OpenRequest<I>::send_v1_detect_header() {
+  // log state entry like every other send_* step of this state machine
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 10) << this << " " << __func__ << dendl;
+
+  librados::ObjectReadOperation op;
+  op.stat(NULL, NULL, NULL);
+
+  using klass = OpenRequest<I>;
+  librados::AioCompletion *comp =
+    create_rados_ack_callback<klass, &klass::handle_v1_detect_header>(this);
+  m_out_bl.clear();
+  m_image_ctx->md_ctx.aio_operate(util::old_header_name(m_image_ctx->name),
+                                  comp, &op, &m_out_bl);
+  // drop our reference; the callback still fires when the op completes
+  comp->release();
+}
+
+// Completion of V1_DETECT_HEADER. On success the image is marked as
+// old-format and the watch is registered; any failure (including a missing
+// header) closes the image and fails the open with that error.
+template <typename I>
+Context *OpenRequest<I>::handle_v1_detect_header(int *result) {
+  CephContext *cct = m_image_ctx->cct;
+  // include the request pointer so concurrent opens can be told apart
+  ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl;
+
+  if (*result < 0) {
+    // a missing header just means "no such image": no error-level log
+    if (*result != -ENOENT) {
+      lderr(cct) << "failed to stat image header: " << cpp_strerror(*result)
+                 << dendl;
+    }
+    send_close_image(*result);
+  } else {
+    m_image_ctx->old_format = true;
+    m_image_ctx->header_oid = util::old_header_name(m_image_ctx->name);
+    send_register_watch();
+  }
+  // presumably nullptr tells the callback wrapper the request is still running
+  return nullptr;
+}
+
+// V2_DETECT_HEADER: stats the v2 id object for this image name. The step is
+// skipped when the image id is already set, in which case the immutable
+// metadata is fetched directly.
+template <typename I>
+void OpenRequest<I>::send_v2_detect_header() {
+  if (!m_image_ctx->id.empty()) {
+    send_v2_get_immutable_metadata();
+    return;
+  }
+
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 10) << this << " " << __func__ << dendl;
+
+  // no stat outputs requested: only the result code is of interest
+  librados::ObjectReadOperation stat_op;
+  stat_op.stat(NULL, NULL, NULL);
+
+  using klass = OpenRequest<I>;
+  librados::AioCompletion *completion =
+    create_rados_ack_callback<klass, &klass::handle_v2_detect_header>(this);
+  m_out_bl.clear();
+  m_image_ctx->md_ctx.aio_operate(util::id_obj_name(m_image_ctx->name),
+                                  completion, &stat_op, &m_out_bl);
+  completion->release();
+}
+
+// Completion of V2_DETECT_HEADER. A missing id object falls back to v1
+// detection; any other error closes the image; success marks the image as
+// new-format and continues with fetching its id.
+template <typename I>
+Context *OpenRequest<I>::handle_v2_detect_header(int *result) {
+  CephContext *cct = m_image_ctx->cct;
+  // include the request pointer so concurrent opens can be told apart
+  ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl;
+
+  if (*result == -ENOENT) {
+    send_v1_detect_header();
+  } else if (*result < 0) {
+    lderr(cct) << "failed to stat v2 image header: " << cpp_strerror(*result)
+               << dendl;
+    send_close_image(*result);
+  } else {
+    m_image_ctx->old_format = false;
+    send_v2_get_id();
+  }
+  // presumably nullptr tells the callback wrapper the request is still running
+  return nullptr;
+}
+
+// V2_GET_ID: reads the image id from the v2 id object. The step is skipped
+// when the image id is already set.
+template <typename I>
+void OpenRequest<I>::send_v2_get_id() {
+  if (!m_image_ctx->id.empty()) {
+    send_v2_get_immutable_metadata();
+    return;
+  }
+
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 10) << this << " " << __func__ << dendl;
+
+  librados::ObjectReadOperation read_op;
+  cls_client::get_id_start(&read_op);
+
+  using klass = OpenRequest<I>;
+  librados::AioCompletion *completion =
+    create_rados_ack_callback<klass, &klass::handle_v2_get_id>(this);
+  m_out_bl.clear();
+  m_image_ctx->md_ctx.aio_operate(util::id_obj_name(m_image_ctx->name),
+                                  completion, &read_op, &m_out_bl);
+  completion->release();
+}
+
+// Completion of V2_GET_ID: decodes the image id from the reply buffer into
+// the image context. A read or decode failure closes the image.
+template <typename I>
+Context *OpenRequest<I>::handle_v2_get_id(int *result) {
+  CephContext *cct = m_image_ctx->cct;
+  // include the request pointer so concurrent opens can be told apart
+  ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl;
+
+  if (*result == 0) {
+    // the decode result replaces the (successful) read result
+    bufferlist::iterator it = m_out_bl.begin();
+    *result = cls_client::get_id_finish(&it, &m_image_ctx->id);
+  }
+  if (*result < 0) {
+    lderr(cct) << "failed to retrieve image id: " << cpp_strerror(*result)
+               << dendl;
+    send_close_image(*result);
+  } else {
+    send_v2_get_immutable_metadata();
+  }
+  // presumably nullptr tells the callback wrapper the request is still running
+  return nullptr;
+}
+
+// V2_GET_IMMUTABLE_METADATA: now that the image id is known, derives the v2
+// header object name and reads the immutable metadata from it.
+template <typename I>
+void OpenRequest<I>::send_v2_get_immutable_metadata() {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 10) << this << " " << __func__ << dendl;
+
+  // this state can be entered directly (id supplied by the caller), so the
+  // format flag is set here as well
+  m_image_ctx->old_format = false;
+  m_image_ctx->header_oid = util::header_name(m_image_ctx->id);
+
+  librados::ObjectReadOperation op;
+  cls_client::get_immutable_metadata_start(&op);
+
+  using klass = OpenRequest<I>;
+  librados::AioCompletion *comp = create_rados_ack_callback<
+    klass, &klass::handle_v2_get_immutable_metadata>(this);
+  m_out_bl.clear();
+  m_image_ctx->md_ctx.aio_operate(m_image_ctx->header_oid, comp, &op,
+                                  &m_out_bl);
+  comp->release();
+}
+
+// Completion of V2_GET_IMMUTABLE_METADATA: decodes the object prefix and
+// order into the image context. A read or decode failure closes the image.
+template <typename I>
+Context *OpenRequest<I>::handle_v2_get_immutable_metadata(int *result) {
+  CephContext *cct = m_image_ctx->cct;
+  // include the request pointer so concurrent opens can be told apart
+  ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl;
+
+  if (*result == 0) {
+    // the decode result replaces the (successful) read result
+    bufferlist::iterator it = m_out_bl.begin();
+    *result = cls_client::get_immutable_metadata_finish(
+      &it, &m_image_ctx->object_prefix, &m_image_ctx->order);
+  }
+  if (*result < 0) {
+    lderr(cct) << "failed to retrieve immutable metadata: "
+               << cpp_strerror(*result) << dendl;
+    send_close_image(*result);
+  } else {
+    send_v2_get_stripe_unit_count();
+  }
+
+  // presumably nullptr tells the callback wrapper the request is still running
+  return nullptr;
+}
+
+// V2_GET_STRIPE_UNIT_COUNT: reads the striping parameters from the v2
+// header object; handle_v2_get_stripe_unit_count() receives the result.
+template <typename I>
+void OpenRequest<I>::send_v2_get_stripe_unit_count() {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 10) << this << " " << __func__ << dendl;
+
+  using klass = OpenRequest<I>;
+  librados::ObjectReadOperation read_op;
+  cls_client::get_stripe_unit_count_start(&read_op);
+
+  librados::AioCompletion *completion = create_rados_ack_callback<
+    klass, &klass::handle_v2_get_stripe_unit_count>(this);
+  m_out_bl.clear();
+  m_image_ctx->md_ctx.aio_operate(m_image_ctx->header_oid, completion,
+                                  &read_op, &m_out_bl);
+  completion->release();
+}
+
+// Completion of V2_GET_STRIPE_UNIT_COUNT: decodes the stripe unit and count
+// into the image context, then continues with the watch registration.
+template <typename I>
+Context *OpenRequest<I>::handle_v2_get_stripe_unit_count(int *result) {
+  CephContext *cct = m_image_ctx->cct;
+  // include the request pointer so concurrent opens can be told apart
+  ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl;
+
+  if (*result == 0) {
+    bufferlist::iterator it = m_out_bl.begin();
+    *result = cls_client::get_stripe_unit_count_finish(
+      &it, &m_image_ctx->stripe_unit, &m_image_ctx->stripe_count);
+  }
+
+  // these two errors are tolerated and treated as success
+  // NOTE(review): presumably they mean striping metadata is unsupported or
+  // absent for this image -- confirm against the class method
+  if (*result == -ENOEXEC || *result == -EINVAL) {
+    *result = 0;
+  }
+
+  if (*result < 0) {
+    lderr(cct) << "failed to read striping metadata: " << cpp_strerror(*result)
+               << dendl;
+    send_close_image(*result);
+  } else {
+    send_register_watch();
+  }
+  // presumably nullptr tells the callback wrapper the request is still running
+  return nullptr;
+}
+
+// REGISTER_WATCH: initializes the image context and, unless the image is
+// read-only, queues the watch registration on the op work queue. Read-only
+// images go straight to REFRESH.
+template <typename I>
+void OpenRequest<I>::send_register_watch() {
+  m_image_ctx->init();
+
+  if (m_image_ctx->read_only) {
+    send_refresh();
+    return;
+  }
+
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 10) << this << " " << __func__ << dendl;
+
+  // no librados async version of watch
+  using klass = OpenRequest<I>;
+  Context *on_registered =
+    create_context_callback<klass, &klass::handle_register_watch>(this);
+  Context *register_ctx = new C_RegisterWatch<I>(*m_image_ctx, on_registered);
+  m_image_ctx->op_work_queue->queue(register_ctx);
+}
+
+// Completion of REGISTER_WATCH: continues with REFRESH on success, closes
+// the image on failure.
+template <typename I>
+Context *OpenRequest<I>::handle_register_watch(int *result) {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl;
+
+  if (*result >= 0) {
+    send_refresh();
+    return nullptr;
+  }
+
+  lderr(cct) << "failed to register watch: " << cpp_strerror(*result)
+             << dendl;
+  send_close_image(*result);
+  return nullptr;
+}
+
+// REFRESH: runs a RefreshRequest on the image; handle_refresh() is its
+// completion callback.
+template <typename I>
+void OpenRequest<I>::send_refresh() {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 10) << this << " " << __func__ << dendl;
+
+  using klass = OpenRequest<I>;
+  Context *on_refreshed =
+    create_context_callback<klass, &klass::handle_refresh>(this);
+  RefreshRequest<I> *refresh_req =
+    RefreshRequest<I>::create(*m_image_ctx, on_refreshed);
+  refresh_req->send();
+}
+
+// Completion of REFRESH: closes the image on failure, otherwise continues
+// with SET_SNAP (which may complete the open immediately).
+template <typename I>
+Context *OpenRequest<I>::handle_refresh(int *result) {
+  CephContext *cct = m_image_ctx->cct;
+  // include the request pointer so concurrent opens can be told apart
+  ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl;
+
+  if (*result < 0) {
+    lderr(cct) << "failed to refresh image: " << cpp_strerror(*result)
+               << dendl;
+    send_close_image(*result);
+    return nullptr;
+  } else {
+    // returns m_on_finish when no snapshot has to be set
+    return send_set_snap(result);
+  }
+}
+
+// SET_SNAP: switches the image to the snapshot named in the image context.
+// When no snapshot name is set, *result is forced to 0 and m_on_finish is
+// returned (presumably so the callback wrapper completes the open request).
+template <typename I>
+Context *OpenRequest<I>::send_set_snap(int *result) {
+  if (m_image_ctx->snap_name.empty()) {
+    *result = 0;
+    return m_on_finish;
+  }
+
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 10) << this << " " << __func__ << dendl;
+
+  using klass = OpenRequest<I>;
+  SetSnapRequest<I> *ctx = SetSnapRequest<I>::create(
+    *m_image_ctx, m_image_ctx->snap_name,
+    create_context_callback<klass, &klass::handle_set_snap>(this));
+  ctx->send();
+  // request continues in handle_set_snap()
+  return nullptr;
+}
+
+// Completion of SET_SNAP: closes the image on failure; on success returns
+// m_on_finish, ending the open state machine.
+template <typename I>
+Context *OpenRequest<I>::handle_set_snap(int *result) {
+  CephContext *cct = m_image_ctx->cct;
+  // include the request pointer so concurrent opens can be told apart
+  ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl;
+
+  if (*result < 0) {
+    lderr(cct) << "failed to set image snapshot: " << cpp_strerror(*result)
+               << dendl;
+    send_close_image(*result);
+    return nullptr;
+  }
+
+  return m_on_finish;
+}
+
+// CLOSE (error path): remembers the error that aborted the open and runs a
+// CloseRequest on the partially opened image; handle_close_image() then
+// reports the remembered error.
+template <typename I>
+void OpenRequest<I>::send_close_image(int error_result) {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 10) << this << " " << __func__ << dendl;
+
+  // kept so the original failure is reported rather than the close result
+  m_error_result = error_result;
+
+  using klass = OpenRequest<I>;
+  Context *ctx = create_context_callback<klass, &klass::handle_close_image>(
+    this);
+  CloseRequest<I> *req = CloseRequest<I>::create(m_image_ctx, ctx);
+  req->send();
+}
+
+template <typename I>
+Context *OpenRequest<I>::handle_close_image(int *result) {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 10) << __func__ << ": r=" << *result << dendl;
+
+  if (*result < 0) {
+    lderr(cct) << "failed to close image: " << cpp_strerror(*result) << dendl;
+  }
+  if (m_error_result < 0) {
+    *result = m_error_result;
+  }
+  return m_on_finish;
+}
+
+} // namespace image
+} // namespace librbd
+
+template class librbd::image::OpenRequest<librbd::ImageCtx>;
diff --git a/src/librbd/image/OpenRequest.h b/src/librbd/image/OpenRequest.h
new file mode 100644
index 0000000..599fdce
--- /dev/null
+++ b/src/librbd/image/OpenRequest.h
@@ -0,0 +1,106 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IMAGE_OPEN_REQUEST_H
+#define CEPH_LIBRBD_IMAGE_OPEN_REQUEST_H
+
+#include "include/int_types.h"
+#include "include/buffer.h"
+
+class Context;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace image {
+
+// Asynchronous state machine that opens an image: detects the on-disk
+// format, loads the header metadata, registers the watch, refreshes the
+// image and optionally selects a snapshot. Every failure runs a
+// CloseRequest on the image before the error is reported via on_finish.
+template <typename ImageCtxT = ImageCtx>
+class OpenRequest {
+public:
+  // Allocates a request on the heap; send() starts it.
+  static OpenRequest *create(ImageCtxT *image_ctx, Context *on_finish) {
+    return new OpenRequest(image_ctx, on_finish);
+  }
+
+  void send();
+
+private:
+  /**
+   * @verbatim
+   *
+   * <start>
+   *    |
+   *    | (v1)                          (read only)
+   *    |-----> V1_DETECT_HEADER  . . . . . . . . . . . . . . . . .
+   *    |           |                                             .
+   *    |           \-------------------------------\             .
+   *    | (v2)                                      |             .
+   *    \-----> V2_DETECT_HEADER                    |             .
+   *                |                               |             .
+   *                v                               |             .
+   *            V2_GET_ID                           |             .
+   *                |                               |             .
+   *                v                               |             .
+   *            V2_GET_IMMUTABLE_METADATA           |             .
+   *                |                               |             .
+   *                v                               v             .
+   *            V2_GET_STRIPE_UNIT_COUNT  ----> REGISTER_WATCH    .
+   *                .                               |             .
+   *                .  (read only)                  v             .
+   *                . . . . . . . . . . . . . > REFRESH < . . . . .
+   *                                             .   |
+   *                                             .   |
+   *                                             .   \--> SET_SNAP
+   *                                   (no snap) .          |
+   *                                             .          v
+   *                                             . . . > <finish>
+   *                                                        ^
+   *     (on error)                                         |
+   *    * * * * * * > CLOSE --------------------------------/
+   *
+   * @endverbatim
+   *
+   * Notes on the implementation: send() always starts with
+   * V2_DETECT_HEADER; V1_DETECT_HEADER is only tried when the v2 id object
+   * does not exist (-ENOENT). When the image id is already set,
+   * V2_DETECT_HEADER and V2_GET_ID are skipped. REGISTER_WATCH is skipped
+   * for read-only images.
+   */
+
+  OpenRequest(ImageCtxT *image_ctx, Context *on_finish);
+
+  ImageCtxT *m_image_ctx;   // image being opened
+  Context *m_on_finish;     // returned by the final handler
+
+  bufferlist m_out_bl;      // reply buffer, cleared before each read op
+  int m_error_result;       // error that triggered the CLOSE path
+
+  void send_v1_detect_header();
+  Context *handle_v1_detect_header(int *result);
+
+  void send_v2_detect_header();
+  Context *handle_v2_detect_header(int *result);
+
+  void send_v2_get_id();
+  Context *handle_v2_get_id(int *result);
+
+  void send_v2_get_immutable_metadata();
+  Context *handle_v2_get_immutable_metadata(int *result);
+
+  void send_v2_get_stripe_unit_count();
+  Context *handle_v2_get_stripe_unit_count(int *result);
+
+  void send_register_watch();
+  Context *handle_register_watch(int *result);
+
+  void send_refresh();
+  Context *handle_refresh(int *result);
+
+  Context *send_set_snap(int *result);
+  Context *handle_set_snap(int *result);
+
+  void send_close_image(int error_result);
+  Context *handle_close_image(int *result);
+
+};
+
+} // namespace image
+} // namespace librbd
+
+extern template class librbd::image::OpenRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_IMAGE_OPEN_REQUEST_H
diff --git a/src/librbd/image/RefreshParentRequest.cc b/src/librbd/image/RefreshParentRequest.cc
new file mode 100644
index 0000000..f9ce4f1
--- /dev/null
+++ b/src/librbd/image/RefreshParentRequest.cc
@@ -0,0 +1,235 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/image/RefreshParentRequest.h"
+#include "include/rados/librados.hpp"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "common/WorkQueue.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Utils.h"
+#include "librbd/image/CloseRequest.h"
+#include "librbd/image/OpenRequest.h"
+#include "librbd/image/SetSnapRequest.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::image::RefreshParentRequest: "
+
+namespace librbd {
+namespace image {
+
+using util::create_async_context_callback;
+using util::create_context_callback;
+
+// Captures the child image, a copy of the parent metadata to apply and the
+// completion context; no parent image context exists until send() opens one.
+template <typename I>
+RefreshParentRequest<I>::RefreshParentRequest(I &child_image_ctx,
+                                              const parent_info &parent_md,
+                                              Context *on_finish)
+  : m_child_image_ctx(child_image_ctx),
+    m_parent_md(parent_md),
+    m_on_finish(on_finish),
+    m_parent_image_ctx(nullptr),
+    m_parent_snap_id(CEPH_NOSNAP),
+    m_error_result(0) {
+}
+
+// Returns true when the parent described by parent_md requires either
+// opening a (new) parent image or closing the current one. The caller must
+// hold the child's snap_lock and parent_lock.
+template <typename I>
+bool RefreshParentRequest<I>::is_refresh_required(I &child_image_ctx,
+                                                  const parent_info &parent_md) {
+  assert(child_image_ctx.snap_lock.is_locked());
+  assert(child_image_ctx.parent_lock.is_locked());
+
+  if (is_open_required(child_image_ctx, parent_md)) {
+    return true;
+  }
+  return is_close_required(child_image_ctx, parent_md);
+}
+
+// A parent must be closed when one is currently attached but the metadata
+// no longer names a parent pool (-1) or the overlap has dropped to zero.
+template <typename I>
+bool RefreshParentRequest<I>::is_close_required(I &child_image_ctx,
+                                                const parent_info &parent_md) {
+  if (child_image_ctx.parent == nullptr) {
+    return false;
+  }
+  return (parent_md.spec.pool_id == -1 || parent_md.overlap == 0);
+}
+
+// A parent must be opened when the metadata names a parent with a non-zero
+// overlap and either no parent is attached or the attached one differs in
+// pool, image id or snapshot id.
+template <typename I>
+bool RefreshParentRequest<I>::is_open_required(I &child_image_ctx,
+                                               const parent_info &parent_md) {
+  // metadata does not describe a usable parent: nothing to open
+  if (!(parent_md.spec.pool_id > -1 && parent_md.overlap > 0)) {
+    return false;
+  }
+
+  // no parent attached yet
+  if (child_image_ctx.parent == nullptr) {
+    return true;
+  }
+
+  // attached parent no longer matches the metadata
+  return (child_image_ctx.parent->md_ctx.get_id() != parent_md.spec.pool_id ||
+          child_image_ctx.parent->id != parent_md.spec.image_id ||
+          child_image_ctx.parent->snap_id != parent_md.spec.snap_id);
+}
+
+// Starts the request: opens the new parent when required, otherwise
+// completes immediately with success.
+template <typename I>
+void RefreshParentRequest<I>::send() {
+  if (!is_open_required(m_child_image_ctx, m_parent_md)) {
+    // parent will be closed (if necessary) during finalize
+    send_complete(0);
+    return;
+  }
+
+  send_open_parent();
+}
+
+// Installs the result of the refresh on the child image: the child's parent
+// pointer and the request's parent pointer are swapped, so the previous
+// parent (if any) ends up owned by this request and is closed by finalize().
+// The caller must hold the child's snap_lock and parent_lock for write.
+template <typename I>
+void RefreshParentRequest<I>::apply() {
+  // check the locking contract before touching any state it protects
+  assert(m_child_image_ctx.snap_lock.is_wlocked());
+  assert(m_child_image_ctx.parent_lock.is_wlocked());
+
+  if (m_child_image_ctx.parent != nullptr) {
+    // closing parent image
+    m_child_image_ctx.clear_nonexistence_cache();
+  }
+  std::swap(m_child_image_ctx.parent, m_parent_image_ctx);
+}
+
+// Second phase of the request: replaces the completion context with
+// on_finish and closes whatever parent image context this request still
+// holds; completes immediately with success if it holds none.
+template <typename I>
+void RefreshParentRequest<I>::finalize(Context *on_finish) {
+  CephContext *cct = m_child_image_ctx.cct;
+  ldout(cct, 10) << this << " " << __func__ << dendl;
+
+  m_on_finish = on_finish;
+  if (m_parent_image_ctx == nullptr) {
+    send_complete(0);
+    return;
+  }
+
+  send_close_parent();
+}
+
+// OPEN_PARENT: creates an image context for the parent described by
+// m_parent_md and starts an OpenRequest on it. Failing to create the
+// parent's IoCtx fails the request instead of aborting the process.
+template <typename I>
+void RefreshParentRequest<I>::send_open_parent() {
+  assert(m_parent_md.spec.pool_id >= 0);
+
+  CephContext *cct = m_child_image_ctx.cct;
+  ldout(cct, 10) << this << " " << __func__ << dendl;
+
+  librados::Rados rados(m_child_image_ctx.md_ctx);
+
+  librados::IoCtx parent_io_ctx;
+  int r = rados.ioctx_create2(m_parent_md.spec.pool_id, parent_io_ctx);
+  if (r < 0) {
+    // a runtime failure (e.g. the parent pool is gone) is reported to the
+    // caller; no parent context exists yet, so there is nothing to close
+    lderr(cct) << "failed to create IoCtx: " << cpp_strerror(r) << dendl;
+    send_complete(r);
+    return;
+  }
+
+  // since we don't know the image and snapshot name, set their ids and
+  // reset the snap_name and snap_exists fields after we read the header
+  m_parent_image_ctx = new I("", m_parent_md.spec.image_id, NULL, parent_io_ctx,
+                             true);
+
+  // set rados flags for reading the parent image
+  if (m_child_image_ctx.balance_parent_reads) {
+    m_parent_image_ctx->set_read_flag(librados::OPERATION_BALANCE_READS);
+  } else if (m_child_image_ctx.localize_parent_reads) {
+    m_parent_image_ctx->set_read_flag(librados::OPERATION_LOCALIZE_READS);
+  }
+
+  // complete through the child's context (as send_close_parent() does):
+  // the handler may destroy the parent image context on failure
+  using klass = RefreshParentRequest<I>;
+  Context *ctx = create_async_context_callback(
+    m_child_image_ctx, create_context_callback<
+      klass, &klass::handle_open_parent, false>(this));
+  OpenRequest<I> *req = OpenRequest<I>::create(m_parent_image_ctx, ctx);
+  req->send();
+}
+
+// Completion of OPEN_PARENT: on success continues with SET_PARENT_SNAP; on
+// failure frees the parent image context and returns m_on_finish with the
+// open error left in *result.
+template <typename I>
+Context *RefreshParentRequest<I>::handle_open_parent(int *result) {
+  CephContext *cct = m_child_image_ctx.cct;
+  ldout(cct, 10) << this << " " << __func__ << " r=" << *result << dendl;
+
+  save_result(result);
+  if (*result < 0) {
+    lderr(cct) << "failed to open parent image: " << cpp_strerror(*result)
+               << dendl;
+
+    // OpenRequest already ran a CloseRequest on the image before reporting
+    // the failure, so closing it again here would be a double close
+    delete m_parent_image_ctx;
+    m_parent_image_ctx = nullptr;
+    return m_on_finish;
+  }
+
+  send_set_parent_snap();
+  return nullptr;
+}
+
+// SET_PARENT_SNAP: resolves the parent snapshot id from the metadata to a
+// snapshot name and switches the freshly opened parent image to it.
+template <typename I>
+void RefreshParentRequest<I>::send_set_parent_snap() {
+  assert(m_parent_md.spec.snap_id != CEPH_NOSNAP);
+
+  CephContext *cct = m_child_image_ctx.cct;
+  ldout(cct, 10) << this << " " << __func__ << dendl;
+
+  int r;
+  std::string snap_name;
+  {
+    // snap_lock is only needed for the lookup
+    RWLock::RLocker snap_locker(m_parent_image_ctx->snap_lock);
+    r = m_parent_image_ctx->get_snap_name(m_parent_md.spec.snap_id, &snap_name);
+  }
+
+  if (r < 0) {
+    // the opened parent stays in m_parent_image_ctx; finalize() closes it
+    lderr(cct) << "failed to locate snapshot: " << cpp_strerror(r) << dendl;
+    send_complete(r);
+    return;
+  }
+
+  using klass = RefreshParentRequest<I>;
+  Context *ctx = create_context_callback<
+    klass, &klass::handle_set_parent_snap, false>(this);
+  SetSnapRequest<I> *req = SetSnapRequest<I>::create(
+    *m_parent_image_ctx, snap_name, ctx);
+  req->send();
+}
+
+// Completion of SET_PARENT_SNAP: returns m_on_finish on success; on failure
+// records the error and closes the parent image that was just opened.
+template <typename I>
+Context *RefreshParentRequest<I>::handle_set_parent_snap(int *result) {
+  CephContext *cct = m_child_image_ctx.cct;
+  ldout(cct, 10) << this << " " << __func__ << " r=" << *result << dendl;
+
+  save_result(result);
+  if (*result >= 0) {
+    return m_on_finish;
+  }
+
+  lderr(cct) << "failed to set parent snapshot: " << cpp_strerror(*result)
+             << dendl;
+  send_close_parent();
+  return nullptr;
+}
+
+// CLOSE_PARENT: runs a CloseRequest on the parent image context held by
+// this request. The completion is wrapped in an async callback bound to the
+// child image context before it reaches handle_close_parent().
+template <typename I>
+void RefreshParentRequest<I>::send_close_parent() {
+  assert(m_parent_image_ctx != nullptr);
+
+  CephContext *cct = m_child_image_ctx.cct;
+  ldout(cct, 10) << this << " " << __func__ << dendl;
+
+  using klass = RefreshParentRequest<I>;
+  Context *on_closed = create_context_callback<
+    klass, &klass::handle_close_parent, false>(this);
+  Context *async_on_closed =
+    create_async_context_callback(m_child_image_ctx, on_closed);
+
+  CloseRequest<I> *close_req =
+    CloseRequest<I>::create(m_parent_image_ctx, async_on_closed);
+  close_req->send();
+}
+
+// Completion of CLOSE_PARENT: destroys the closed parent image context and
+// returns m_on_finish. The reported result is the error recorded while
+// opening the parent (if any); close errors are only logged.
+template <typename I>
+Context *RefreshParentRequest<I>::handle_close_parent(int *result) {
+  CephContext *cct = m_child_image_ctx.cct;
+  ldout(cct, 10) << this << " " << __func__ << " r=" << *result << dendl;
+
+  // apply() and finalize() inspect this pointer, so it must not be left
+  // dangling once the context is destroyed
+  delete m_parent_image_ctx;
+  m_parent_image_ctx = nullptr;
+
+  if (*result < 0) {
+    lderr(cct) << "failed to close parent image: " << cpp_strerror(*result)
+               << dendl;
+  }
+
+  if (m_error_result < 0) {
+    // propagate errors from opening the image
+    *result = m_error_result;
+  } else {
+    // ignore errors from closing the image
+    *result = 0;
+  }
+
+  return m_on_finish;
+}
+
+// Completes the current m_on_finish context directly with r. Used on the
+// paths that finish without going through an asynchronous handler.
+template <typename I>
+void RefreshParentRequest<I>::send_complete(int r) {
+  CephContext *cct = m_child_image_ctx.cct;
+  ldout(cct, 10) << this << " " << __func__ << dendl;
+
+  m_on_finish->complete(r);
+}
+
+} // namespace image
+} // namespace librbd
+
+template class librbd::image::RefreshParentRequest<librbd::ImageCtx>;
diff --git a/src/librbd/image/RefreshParentRequest.h b/src/librbd/image/RefreshParentRequest.h
new file mode 100644
index 0000000..e51d24f
--- /dev/null
+++ b/src/librbd/image/RefreshParentRequest.h
@@ -0,0 +1,99 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IMAGE_REFRESH_PARENT_REQUEST_H
+#define CEPH_LIBRBD_IMAGE_REFRESH_PARENT_REQUEST_H
+
+#include "include/int_types.h"
+#include "librbd/parent_types.h"
+
+class Context;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace image {
+
+// State machine that refreshes the parent image of a child image; the
+// states and their ordering are shown in the diagram below.
+template <typename ImageCtxT = ImageCtx>
+class RefreshParentRequest {
+public:
+  // Heap-allocates a new request; the raw pointer is returned to the caller.
+  static RefreshParentRequest *create(ImageCtxT &child_image_ctx,
+                                      const parent_info &parent_md,
+                                      Context *on_finish) {
+    return new RefreshParentRequest(child_image_ctx, parent_md, on_finish);
+  }
+
+  // NOTE(review): defined in the .cc file; presumably reports whether the
+  // child's parent must be opened and/or closed for parent_md -- confirm.
+  static bool is_refresh_required(ImageCtxT &child_image_ctx,
+                                  const parent_info &parent_md);
+
+  // Entry points of the state machine (see diagram): send() starts it,
+  // apply() corresponds to the <apply> step and finalize() leads to <finish>.
+  void send();
+  void apply();
+  void finalize(Context *on_finish);
+
+private:
+  /**
+   * @verbatim
+   *
+   * <start>
+   *    |
+   *    | (open required)
+   *    |----------------> OPEN_PARENT * * * * * * * *
+   *    |                     |                      *
+   *    |                     v                      * (on error)
+   *    |                  SET_PARENT_SNAP * * * * * *
+   *    |                     |                      *
+   *    |                     v                      *
+   *    \----------------> <apply>                   *
+   *                          |                      *
+   *                          | (close required)     v
+   *                          |-----------------> CLOSE_PARENT
+   *                          |                      |
+   *                          |                      v
+   *                          \-----------------> <finish>
+   *
+   * @endverbatim
+   */
+
+  RefreshParentRequest(ImageCtxT &child_image_ctx, const parent_info &parent_md,
+                       Context *on_finish);
+
+  // child image whose parent is being refreshed, the parent description
+  // supplied by the caller, and the completion fired when a phase ends
+  ImageCtxT &m_child_image_ctx;
+  parent_info m_parent_md;
+  Context *m_on_finish;
+
+  // parent image context handled by this request; it is deleted in
+  // handle_close_parent()
+  ImageCtxT *m_parent_image_ctx;
+  uint64_t m_parent_snap_id;
+
+  // first negative result recorded through save_result() (0 == no error)
+  int m_error_result;
+
+  static bool is_close_required(ImageCtxT &child_image_ctx,
+                                const parent_info &parent_md);
+  static bool is_open_required(ImageCtxT &child_image_ctx,
+                               const parent_info &parent_md);
+
+  // one send_*/handle_* pair per state of the diagram above; each handler
+  // returns the completion to fire, or nullptr while the machine continues
+  void send_open_parent();
+  Context *handle_open_parent(int *result);
+
+  void send_set_parent_snap();
+  Context *handle_set_parent_snap(int *result);
+
+  void send_close_parent();
+  Context *handle_close_parent(int *result);
+
+  void send_complete(int r);
+
+  // Remembers *result only if it is negative and no error was stored before,
+  // i.e. the first failure wins.
+  void save_result(int *result) {
+    if (m_error_result == 0 && *result < 0) {
+      m_error_result = *result;
+    }
+  }
+
+};
+
+} // namespace image
+} // namespace librbd
+
+extern template class librbd::image::RefreshParentRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_IMAGE_REFRESH_PARENT_REQUEST_H
diff --git a/src/librbd/image/RefreshRequest.cc b/src/librbd/image/RefreshRequest.cc
new file mode 100644
index 0000000..c06dc14
--- /dev/null
+++ b/src/librbd/image/RefreshRequest.cc
@@ -0,0 +1,763 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/image/RefreshRequest.h"
+#include "include/stringify.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "cls/lock/cls_lock_client.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Journal.h"
+#include "librbd/ObjectMap.h"
+#include "librbd/Utils.h"
+#include "librbd/image/RefreshParentRequest.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::image::RefreshRequest: "
+
+namespace librbd {
+namespace image {
+
+using util::create_rados_ack_callback;
+using util::create_context_callback;
+
+// Builds a refresh request for image_ctx; on_finish is the completion that
+// the state machine hands back once the refresh has finished.
+template <typename I>
+RefreshRequest<I>::RefreshRequest(I &image_ctx, Context *on_finish)
+  : m_image_ctx(image_ctx), m_on_finish(on_finish), m_error_result(0),
+    m_flush_aio(false), m_exclusive_lock(nullptr), m_object_map(nullptr),
+    m_journal(nullptr), m_refresh_parent(nullptr),
+    // scalar results start from a defined value: handle_v1_get_locks()
+    // leaves m_exclusive_locked untouched on -EOPNOTSUPP, yet apply() still
+    // copies it into the image context
+    m_order(0), m_size(0), m_features(0), m_incompatible_features(0),
+    m_flags(0), m_exclusive_locked(false) {
+}
+
+// Releases the object map still owned by the request and verifies that the
+// state machine already released everything that needs an async shut down.
+template <typename I>
+RefreshRequest<I>::~RefreshRequest() {
+  // after apply() this is whatever object map was swapped out of the image
+  // context (or nullptr)
+  delete m_object_map;
+
+  // these require state machine to close
+  assert(m_exclusive_lock == nullptr);
+  assert(m_journal == nullptr);
+  assert(m_refresh_parent == nullptr);
+}
+
+// Starts the refresh: v1 (old format) and v2 images follow separate branches
+// of the state machine.
+template <typename I>
+void RefreshRequest<I>::send() {
+  if (!m_image_ctx.old_format) {
+    send_v2_get_mutable_metadata();
+    return;
+  }
+  send_v1_read_header();
+}
+
+// V1_READ_HEADER: issues an async read of the image header object; the reply
+// lands in m_out_bl and is processed by handle_v1_read_header().
+template <typename I>
+void RefreshRequest<I>::send_v1_read_header() {
+  ldout(m_image_ctx.cct, 10) << this << " " << __func__ << dendl;
+
+  librados::ObjectReadOperation read_op;
+  read_op.read(0, 0, nullptr, nullptr);
+
+  librados::AioCompletion *completion = create_rados_ack_callback<
+    RefreshRequest<I>, &RefreshRequest<I>::handle_v1_read_header>(this);
+  m_out_bl.clear();
+  int r = m_image_ctx.md_ctx.aio_operate(m_image_ctx.header_oid, completion,
+                                         &read_op, &m_out_bl);
+  assert(r == 0);
+  completion->release();
+}
+
+// Validates the v1 header read into m_out_bl and caches order, size and
+// object prefix. Returns m_on_finish on failure (with *result set), or
+// nullptr after moving on to V1_GET_SNAPSHOTS.
+template <typename I>
+Context *RefreshRequest<I>::handle_v1_read_header(int *result) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 10) << this << " " << __func__ << ": " << "r=" << *result << dendl;
+
+  // the read itself failed: pass the error straight through
+  if (*result < 0) {
+    return m_on_finish;
+  }
+
+  rbd_obj_header_ondisk v1_header;
+  if (m_out_bl.length() < sizeof(v1_header)) {
+    lderr(cct) << "v1 header too small" << dendl;
+    *result = -EIO;
+    return m_on_finish;
+  }
+
+  // the buffer must start with the v1 header signature
+  if (memcmp(RBD_HEADER_TEXT, m_out_bl.c_str(),
+             sizeof(RBD_HEADER_TEXT)) != 0) {
+    lderr(cct) << "unrecognized v1 header" << dendl;
+    *result = -ENXIO;
+    return m_on_finish;
+  }
+
+  memcpy(&v1_header, m_out_bl.c_str(), sizeof(v1_header));
+  m_order = v1_header.options.order;
+  m_size = v1_header.image_size;
+  m_object_prefix = v1_header.block_name;
+
+  send_v1_get_snapshots();
+  return nullptr;
+}
+
+// V1_GET_SNAPSHOTS: queues the v1 snapshot listing against the header
+// object; handle_v1_get_snapshots() consumes the reply from m_out_bl.
+template <typename I>
+void RefreshRequest<I>::send_v1_get_snapshots() {
+  ldout(m_image_ctx.cct, 10) << this << " " << __func__ << dendl;
+
+  librados::ObjectReadOperation list_op;
+  cls_client::old_snapshot_list_start(&list_op);
+
+  librados::AioCompletion *completion = create_rados_ack_callback<
+    RefreshRequest<I>, &RefreshRequest<I>::handle_v1_get_snapshots>(this);
+  m_out_bl.clear();
+  int r = m_image_ctx.md_ctx.aio_operate(m_image_ctx.header_oid, completion,
+                                         &list_op, &m_out_bl);
+  assert(r == 0);
+  completion->release();
+}
+
+// Decodes the v1 snapshot listing into m_snap_names / m_snap_sizes / m_snapc.
+// Returns m_on_finish on failure or an invalid snap context, otherwise
+// nullptr after moving on to V1_GET_LOCKS.
+template <typename I>
+Context *RefreshRequest<I>::handle_v1_get_snapshots(int *result) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 10) << this << " " << __func__ << ": " << "r=" << *result << dendl;
+
+  if (*result == 0) {
+    // a decode failure replaces the (successful) transport result
+    bufferlist::iterator bl_it = m_out_bl.begin();
+    *result = cls_client::old_snapshot_list_finish(
+      &bl_it, &m_snap_names, &m_snap_sizes, &m_snapc);
+  }
+
+  if (*result < 0) {
+    lderr(cct) << "failed to retrieve v1 snapshots: " << cpp_strerror(*result)
+               << dendl;
+    return m_on_finish;
+  }
+
+  if (m_snapc.is_valid()) {
+    send_v1_get_locks();
+    return nullptr;
+  }
+
+  lderr(cct) << "v1 image snap context is invalid" << dendl;
+  *result = -EIO;
+  return m_on_finish;
+}
+
+// V1_GET_LOCKS: asynchronously queries the lock info for RBD_LOCK_NAME on
+// the header object; handle_v1_get_locks() consumes the reply.
+template <typename I>
+void RefreshRequest<I>::send_v1_get_locks() {
+  ldout(m_image_ctx.cct, 10) << this << " " << __func__ << dendl;
+
+  librados::ObjectReadOperation lock_op;
+  rados::cls::lock::get_lock_info_start(&lock_op, RBD_LOCK_NAME);
+
+  librados::AioCompletion *completion = create_rados_ack_callback<
+    RefreshRequest<I>, &RefreshRequest<I>::handle_v1_get_locks>(this);
+  m_out_bl.clear();
+  int r = m_image_ctx.md_ctx.aio_operate(m_image_ctx.header_oid, completion,
+                                         &lock_op, &m_out_bl);
+  assert(r == 0);
+  completion->release();
+}
+
+// Decodes the lock info reply, applies the refreshed v1 state and continues
+// with the flush step. Returns m_on_finish on failure, otherwise whatever
+// send_flush_aio() returns.
+template <typename I>
+Context *RefreshRequest<I>::handle_v1_get_locks(int *result) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 10) << this << " " << __func__ << ": "
+                 << "r=" << *result << dendl;
+
+  if (*result == 0) {
+    ClsLockType type;
+    bufferlist::iterator bl_it = m_out_bl.begin();
+    *result = rados::cls::lock::get_lock_info_finish(&bl_it, &m_lockers,
+                                                     &type, &m_lock_tag);
+    if (*result == 0) {
+      m_exclusive_locked = (type == LOCK_EXCLUSIVE);
+    }
+  } else if (*result == -EOPNOTSUPP) {
+    // locks cannot be queried in this case, so treat the image as if it
+    // had none
+    *result = 0;
+  }
+
+  if (*result < 0) {
+    lderr(cct) << "failed to retrieve locks: " << cpp_strerror(*result)
+               << dendl;
+    return m_on_finish;
+  }
+
+  apply();
+  return send_flush_aio();
+}
+
+// V2_GET_MUTABLE_METADATA: queues the mutable-metadata query against the
+// header object; handle_v2_get_mutable_metadata() consumes the reply.
+template <typename I>
+void RefreshRequest<I>::send_v2_get_mutable_metadata() {
+  ldout(m_image_ctx.cct, 10) << this << " " << __func__ << dendl;
+
+  // sample the current snapshot id while holding snap_lock
+  uint64_t current_snap_id;
+  {
+    RWLock::RLocker snap_locker(m_image_ctx.snap_lock);
+    current_snap_id = m_image_ctx.snap_id;
+  }
+
+  // an image opened at a snapshot is queried as read-only as well
+  const bool read_only = (current_snap_id != CEPH_NOSNAP) ||
+                         m_image_ctx.read_only;
+  librados::ObjectReadOperation metadata_op;
+  cls_client::get_mutable_metadata_start(&metadata_op, read_only);
+
+  librados::AioCompletion *completion = create_rados_ack_callback<
+    RefreshRequest<I>,
+    &RefreshRequest<I>::handle_v2_get_mutable_metadata>(this);
+  m_out_bl.clear();
+  int r = m_image_ctx.md_ctx.aio_operate(m_image_ctx.header_oid, completion,
+                                         &metadata_op, &m_out_bl);
+  assert(r == 0);
+  completion->release();
+}
+
+// Decodes the mutable metadata and rejects images that use incompatible
+// features outside RBD_FEATURES_ALL. Returns m_on_finish on failure,
+// otherwise nullptr after moving on to V2_GET_FLAGS.
+template <typename I>
+Context *RefreshRequest<I>::handle_v2_get_mutable_metadata(int *result) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 10) << this << " " << __func__ << ": "
+                 << "r=" << *result << dendl;
+
+  if (*result == 0) {
+    bufferlist::iterator bl_it = m_out_bl.begin();
+    *result = cls_client::get_mutable_metadata_finish(&bl_it, &m_size,
+                                                      &m_features,
+                                                      &m_incompatible_features,
+                                                      &m_lockers,
+                                                      &m_exclusive_locked,
+                                                      &m_lock_tag, &m_snapc,
+                                                      &m_parent_md);
+  }
+  if (*result < 0) {
+    lderr(cct) << "failed to retrieve mutable metadata: "
+               << cpp_strerror(*result) << dendl;
+    return m_on_finish;
+  }
+
+  // any incompatible feature bit outside the known set is fatal
+  const uint64_t unknown_features =
+    m_incompatible_features & ~RBD_FEATURES_ALL;
+  if (unknown_features == 0ULL) {
+    send_v2_get_flags();
+    return nullptr;
+  }
+
+  lderr(cct) << "Image uses unsupported features: " << unknown_features
+             << dendl;
+  *result = -ENOSYS;
+  return m_on_finish;
+}
+
+// V2_GET_FLAGS: queues the flags query for the image and the snapshots in
+// m_snapc; handle_v2_get_flags() consumes the reply.
+template <typename I>
+void RefreshRequest<I>::send_v2_get_flags() {
+  ldout(m_image_ctx.cct, 10) << this << " " << __func__ << dendl;
+
+  librados::ObjectReadOperation flags_op;
+  cls_client::get_flags_start(&flags_op, m_snapc.snaps);
+
+  librados::AioCompletion *completion = create_rados_ack_callback<
+    RefreshRequest<I>, &RefreshRequest<I>::handle_v2_get_flags>(this);
+  m_out_bl.clear();
+  int r = m_image_ctx.md_ctx.aio_operate(m_image_ctx.header_oid, completion,
+                                         &flags_op, &m_out_bl);
+  assert(r == 0);
+  completion->release();
+}
+
+// Completion handler for V2_GET_FLAGS.
+//
+// @param result in: result of the flags query; out: error code on failure
+// @return m_on_finish on failure, nullptr when the metadata query is
+//         restarted, otherwise the result of send_v2_get_snapshots()
+template <typename I>
+Context *RefreshRequest<I>::handle_v2_get_flags(int *result) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 10) << this << " " << __func__ << ": "
+                 << "r=" << *result << dendl;
+
+  if (*result == 0) {
+    // keep the decode status: a malformed reply must not be accepted, since
+    // apply() indexes m_snap_flags in lock-step with m_snapc.snaps
+    bufferlist::iterator it = m_out_bl.begin();
+    *result = cls_client::get_flags_finish(&it, &m_flags, m_snapc.snaps,
+                                           &m_snap_flags);
+  }
+  if (*result == -EOPNOTSUPP) {
+    // Older OSD doesn't support RBD flags, need to assume the worst
+    *result = 0;
+    ldout(cct, 10) << "OSD does not support RBD flags, disabling object map "
+                   << "optimizations" << dendl;
+    m_flags = RBD_FLAG_OBJECT_MAP_INVALID;
+    if ((m_features & RBD_FEATURE_FAST_DIFF) != 0) {
+      m_flags |= RBD_FLAG_FAST_DIFF_INVALID;
+    }
+
+    // every snapshot inherits the same pessimistic flags
+    std::vector<uint64_t> default_flags(m_snapc.snaps.size(), m_flags);
+    m_snap_flags = std::move(default_flags);
+  } else if (*result == -ENOENT) {
+    // snapshot list changed underneath us: start over from the metadata
+    ldout(cct, 10) << "out-of-sync snapshot state detected" << dendl;
+    send_v2_get_mutable_metadata();
+    return nullptr;
+  } else if (*result < 0) {
+    lderr(cct) << "failed to retrieve flags: " << cpp_strerror(*result)
+               << dendl;
+    return m_on_finish;
+  }
+
+  return send_v2_get_snapshots();
+}
+
+// V2_GET_SNAPSHOTS: queues the snapshot listing for the ids in m_snapc.
+// When the image has no snapshots the cached snapshot vectors are emptied
+// and the state is skipped. Returns nullptr while the query is in flight.
+template <typename I>
+Context *RefreshRequest<I>::send_v2_get_snapshots() {
+  const bool no_snapshots = m_snapc.snaps.empty();
+  if (no_snapshots) {
+    m_snap_names.clear();
+    m_snap_sizes.clear();
+    m_snap_parents.clear();
+    m_snap_protection.clear();
+    return send_v2_refresh_parent();
+  }
+
+  ldout(m_image_ctx.cct, 10) << this << " " << __func__ << dendl;
+
+  librados::ObjectReadOperation list_op;
+  cls_client::snapshot_list_start(&list_op, m_snapc.snaps);
+
+  librados::AioCompletion *completion = create_rados_ack_callback<
+    RefreshRequest<I>, &RefreshRequest<I>::handle_v2_get_snapshots>(this);
+  m_out_bl.clear();
+  int r = m_image_ctx.md_ctx.aio_operate(m_image_ctx.header_oid, completion,
+                                         &list_op, &m_out_bl);
+  assert(r == 0);
+  completion->release();
+  return nullptr;
+}
+
+// Decodes the snapshot listing. -ENOENT restarts the refresh from the
+// metadata query; other errors and an invalid snap context finish the
+// request with m_on_finish; success continues with V2_REFRESH_PARENT.
+template <typename I>
+Context *RefreshRequest<I>::handle_v2_get_snapshots(int *result) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 10) << this << " " << __func__ << ": "
+                 << "r=" << *result << dendl;
+
+  if (*result == 0) {
+    bufferlist::iterator bl_it = m_out_bl.begin();
+    *result = cls_client::snapshot_list_finish(&bl_it, m_snapc.snaps,
+                                               &m_snap_names, &m_snap_sizes,
+                                               &m_snap_parents,
+                                               &m_snap_protection);
+  }
+
+  if (*result == -ENOENT) {
+    ldout(cct, 10) << "out-of-sync snapshot state detected" << dendl;
+    send_v2_get_mutable_metadata();
+    return nullptr;
+  }
+  if (*result < 0) {
+    lderr(cct) << "failed to retrieve snapshots: " << cpp_strerror(*result)
+               << dendl;
+    return m_on_finish;
+  }
+
+  if (m_snapc.is_valid()) {
+    return send_v2_refresh_parent();
+  }
+
+  lderr(cct) << "image snap context is invalid!" << dendl;
+  *result = -EIO;
+  return m_on_finish;
+}
+
+// V2_REFRESH_PARENT: creates and starts a RefreshParentRequest when the
+// parent info lookup fails or a refresh is reported as required; otherwise
+// the state is skipped. Returns nullptr while the parent refresh runs.
+template <typename I>
+Context *RefreshRequest<I>::send_v2_refresh_parent() {
+  {
+    // the decision is taken with snap_lock and parent_lock read-held
+    RWLock::RLocker snap_locker(m_image_ctx.snap_lock);
+    RWLock::RLocker parent_locker(m_image_ctx.parent_lock);
+
+    parent_info parent_md;
+    int r = get_parent_info(m_image_ctx.snap_id, &parent_md);
+    if (r < 0 ||
+        RefreshParentRequest<I>::is_refresh_required(m_image_ctx, parent_md)) {
+      ldout(m_image_ctx.cct, 10) << this << " " << __func__ << dendl;
+
+      Context *on_refreshed = create_context_callback<
+        RefreshRequest<I>, &RefreshRequest<I>::handle_v2_refresh_parent>(this);
+      m_refresh_parent = RefreshParentRequest<I>::create(
+        m_image_ctx, parent_md, on_refreshed);
+    }
+  }
+
+  // the request is started only after both locks have been dropped
+  if (m_refresh_parent == nullptr) {
+    return send_v2_init_exclusive_lock();
+  }
+  m_refresh_parent->send();
+  return nullptr;
+}
+
+template <typename I>
+Context *RefreshRequest<I>::handle_v2_refresh_parent(int *result) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl;
+
+  if (*result < 0) {
+    lderr(cct) << "failed to refresh parent image: " << cpp_strerror(*result)
+               << dendl;
+    save_result(result);
+    return send_v2_finalize_refresh_parent();
+  }
+
+  return send_v2_init_exclusive_lock();
+}
+
+// V2_INIT_EXCLUSIVE_LOCK: creates and initializes an exclusive lock object.
+// The state is skipped when the feature is off, the image is opened at a
+// snapshot, or the image context already owns an exclusive lock object.
+template <typename I>
+Context *RefreshRequest<I>::send_v2_init_exclusive_lock() {
+  const bool lock_feature_enabled =
+    (m_features & RBD_FEATURE_EXCLUSIVE_LOCK) != 0;
+  if (!lock_feature_enabled ||
+      !m_image_ctx.snap_name.empty() ||
+      m_image_ctx.exclusive_lock != nullptr) {
+    return send_v2_open_journal();
+  }
+
+  // implies exclusive lock dynamically enabled or image open in-progress
+  ldout(m_image_ctx.cct, 10) << this << " " << __func__ << dendl;
+
+  // TODO need safe shut down
+  m_exclusive_lock = ExclusiveLock<I>::create(m_image_ctx);
+
+  Context *on_init = create_context_callback<
+    RefreshRequest<I>,
+    &RefreshRequest<I>::handle_v2_init_exclusive_lock>(this);
+
+  // init() is invoked with owner_lock read-held
+  RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
+  m_exclusive_lock->init(on_init);
+  return nullptr;
+}
+
+template <typename I>
+Context *RefreshRequest<I>::handle_v2_init_exclusive_lock(int *result) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl;
+
+  if (*result < 0) {
+    lderr(cct) << "failed to initialize exclusive lock: "
+               << cpp_strerror(*result) << dendl;
+    save_result(result);
+    return send_v2_finalize_refresh_parent();
+  }
+
+  return send_v2_open_journal();
+}
+
+// V2_OPEN_JOURNAL: creates and opens a journal. Skipped when journaling is
+// off, the image is read-only, a journal already exists, or this client
+// does not own the exclusive lock.
+template <typename I>
+Context *RefreshRequest<I>::send_v2_open_journal() {
+  const bool journaling_enabled = (m_features & RBD_FEATURE_JOURNALING) != 0;
+  if (!journaling_enabled ||
+      m_image_ctx.read_only ||
+      m_image_ctx.journal != nullptr ||
+      m_image_ctx.exclusive_lock == nullptr ||
+      !m_image_ctx.exclusive_lock->is_lock_owner()) {
+    return send_v2_open_object_map();
+  }
+
+  // implies journal dynamically enabled since ExclusiveLock will init
+  // the journal upon acquiring the lock
+  ldout(m_image_ctx.cct, 10) << this << " " << __func__ << dendl;
+
+  Context *on_opened = create_context_callback<
+    RefreshRequest<I>, &RefreshRequest<I>::handle_v2_open_journal>(this);
+
+  // TODO need safe close
+  m_journal = new Journal(m_image_ctx);
+  m_journal->open(on_opened);
+  return nullptr;
+}
+
+// Completion handler for V2_OPEN_JOURNAL.
+//
+// @param result in: result of opening the journal
+// @return the next completion to fire, or nullptr while the state machine
+//         is still running
+template <typename I>
+Context *RefreshRequest<I>::handle_v2_open_journal(int *result) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl;
+
+  if (*result < 0) {
+    lderr(cct) << "failed to initialize journal: " << cpp_strerror(*result)
+               << dendl;
+    save_result(result);
+    return send_v2_finalize_refresh_parent();
+  }
+
+  // continue with the regular flow (object map -> apply -> finalize); going
+  // straight to the shut-down states would skip apply() and would tear down
+  // the lock/journal that were just created
+  return send_v2_open_object_map();
+}
+
+// V2_OPEN_OBJECT_MAP: opens the object map of the snapshot the image is
+// opened at. Skipped when the feature is off, an object map already exists,
+// or no snapshot name is set. A snapshot name that cannot be located is
+// logged and the state is skipped as well.
+template <typename I>
+Context *RefreshRequest<I>::send_v2_open_object_map() {
+  const bool object_map_enabled = (m_features & RBD_FEATURE_OBJECT_MAP) != 0;
+  if (!object_map_enabled ||
+      m_image_ctx.object_map != nullptr || m_image_ctx.snap_name.empty()) {
+    return send_v2_finalize_refresh_parent();
+  }
+
+  // implies object map dynamically enabled or image open in-progress
+  // since SetSnapRequest loads the object map for a snapshot and
+  // ExclusiveLock loads the object map for HEAD
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 10) << this << " " << __func__ << dendl;
+
+  // locate the first refreshed snapshot carrying the requested name
+  size_t snap_idx = 0;
+  while (snap_idx < m_snap_names.size() &&
+         m_snap_names[snap_idx] != m_image_ctx.snap_name) {
+    ++snap_idx;
+  }
+
+  if (snap_idx == m_snap_names.size()) {
+    lderr(cct) << "failed to locate snapshot: " << m_image_ctx.snap_name
+               << dendl;
+    return send_v2_finalize_refresh_parent();
+  }
+
+  Context *on_opened = create_context_callback<
+    RefreshRequest<I>, &RefreshRequest<I>::handle_v2_open_object_map>(this);
+
+  m_object_map = m_image_ctx.create_object_map(m_snapc.snaps[snap_idx].val);
+  m_object_map->open(on_opened);
+  return nullptr;
+}
+
+// Completion of the object map open; the result is asserted to be 0 before
+// the machine continues with V2_FINALIZE_REFRESH_PARENT.
+template <typename I>
+Context *RefreshRequest<I>::handle_v2_open_object_map(int *result) {
+  ldout(m_image_ctx.cct, 10) << this << " " << __func__ << ": r=" << *result
+                             << dendl;
+
+  assert(*result == 0);
+  return send_v2_finalize_refresh_parent();
+}
+
+// Applies the refreshed state to the image context and, if a parent refresh
+// was started, finalizes it (V2_FINALIZE_REFRESH_PARENT). Returns nullptr
+// while the finalize is in flight.
+template <typename I>
+Context *RefreshRequest<I>::send_v2_finalize_refresh_parent() {
+  apply();
+
+  if (m_refresh_parent != nullptr) {
+    ldout(m_image_ctx.cct, 10) << this << " " << __func__ << dendl;
+
+    Context *on_finalized = create_context_callback<
+      RefreshRequest<I>,
+      &RefreshRequest<I>::handle_v2_finalize_refresh_parent>(this);
+    m_refresh_parent->finalize(on_finalized);
+    return nullptr;
+  }
+
+  // no parent refresh was started: continue with the clean-up states
+  return send_v2_shut_down_exclusive_lock();
+}
+
+// Completion of the parent finalize: destroys the parent refresh request
+// and continues with V2_SHUT_DOWN_EXCLUSIVE_LOCK.
+template <typename I>
+Context *RefreshRequest<I>::handle_v2_finalize_refresh_parent(int *result) {
+  ldout(m_image_ctx.cct, 10) << this << " " << __func__ << ": r=" << *result
+                             << dendl;
+
+  // the request object is owned by this state machine
+  assert(m_refresh_parent != nullptr);
+  delete m_refresh_parent;
+  m_refresh_parent = nullptr;
+
+  return send_v2_shut_down_exclusive_lock();
+}
+
+// V2_SHUT_DOWN_EXCLUSIVE_LOCK: shuts down the exclusive lock object held by
+// this request, if any; otherwise the state is skipped.
+template <typename I>
+Context *RefreshRequest<I>::send_v2_shut_down_exclusive_lock() {
+  if (m_exclusive_lock != nullptr) {
+    ldout(m_image_ctx.cct, 10) << this << " " << __func__ << dendl;
+
+    // exclusive lock feature was dynamically disabled
+    Context *on_shut_down = create_context_callback<
+      RefreshRequest<I>,
+      &RefreshRequest<I>::handle_v2_shut_down_exclusive_lock>(this);
+    m_exclusive_lock->shut_down(on_shut_down);
+    return nullptr;
+  }
+
+  return send_v2_close_journal();
+}
+
+// Completion of the exclusive lock shut down: a failure is logged and
+// saved, the lock object is destroyed either way and the machine continues
+// with V2_CLOSE_JOURNAL.
+template <typename I>
+Context *RefreshRequest<I>::handle_v2_shut_down_exclusive_lock(int *result) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl;
+
+  const bool failed = (*result < 0);
+  if (failed) {
+    lderr(cct) << "failed to shut down exclusive lock: "
+               << cpp_strerror(*result) << dendl;
+    save_result(result);
+  }
+
+  // the lock object is released regardless of the shut down result
+  assert(m_exclusive_lock != nullptr);
+  delete m_exclusive_lock;
+  m_exclusive_lock = nullptr;
+
+  return send_v2_close_journal();
+}
+
+// V2_CLOSE_JOURNAL: closes the journal object held by this request, if any;
+// otherwise the machine continues with the flush step.
+template <typename I>
+Context *RefreshRequest<I>::send_v2_close_journal() {
+  if (m_journal != nullptr) {
+    ldout(m_image_ctx.cct, 10) << this << " " << __func__ << dendl;
+
+    // journal feature was dynamically disabled
+    Context *on_closed = create_context_callback<
+      RefreshRequest<I>, &RefreshRequest<I>::handle_v2_close_journal>(this);
+    m_journal->close(on_closed);
+    return nullptr;
+  }
+
+  return send_flush_aio();
+}
+
+// Completion handler for V2_CLOSE_JOURNAL.
+//
+// @param result in: result of closing the journal
+// @return the next completion to fire, or nullptr while the state machine
+//         is still running
+template <typename I>
+Context *RefreshRequest<I>::handle_v2_close_journal(int *result) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl;
+
+  if (*result < 0) {
+    save_result(result);
+    lderr(cct) << "failed to close journal: " << cpp_strerror(*result)
+               << dendl;
+  }
+
+  // release the closed journal regardless of the close result (mirrors the
+  // exclusive lock shut down handler); the destructor asserts that
+  // m_journal has been reset
+  assert(m_journal != nullptr);
+  delete m_journal;
+  m_journal = nullptr;
+
+  return send_flush_aio();
+}
+
+// FLUSH: flushes the image when apply() flagged new snapshots
+// (m_flush_aio); otherwise the request finishes right away. Returns nullptr
+// while the flush is in flight, m_on_finish otherwise.
+template <typename I>
+Context *RefreshRequest<I>::send_flush_aio() {
+  if (!m_flush_aio) {
+    return m_on_finish;
+  }
+
+  ldout(m_image_ctx.cct, 10) << this << " " << __func__ << dendl;
+
+  // the flush is issued with owner_lock read-held
+  RWLock::RLocker owner_lock(m_image_ctx.owner_lock);
+  Context *on_flushed = create_context_callback<
+    RefreshRequest<I>, &RefreshRequest<I>::handle_flush_aio>(this);
+  m_image_ctx.flush(on_flushed);
+  return nullptr;
+}
+
+template <typename I>
+Context *RefreshRequest<I>::handle_flush_aio(int *result) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl;
+
+  if (*result < 0) {
+    lderr(cct) << "failed to flush pending AIO: " << cpp_strerror(*result)
+               << dendl;
+  }
+
+  if (m_error_result < 0) {
+    *result = m_error_result;
+  }
+  return m_on_finish;
+}
+
+// Publishes the metadata gathered by the state machine into the image
+// context: size, lock info, features/flags, parent info and the snapshot
+// tables. Also flags a flush when unknown snapshot ids show up and swaps the
+// exclusive lock / journal / object map objects between the request and the
+// image context.
+template <typename I>
+void RefreshRequest<I>::apply() {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << this << " " << __func__ << dendl;
+
+  // locks are taken in this order: owner_lock (read), md_lock (write), then
+  // cache_lock, snap_lock (write) and parent_lock (write) in the inner scope
+  RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
+  RWLock::WLocker md_locker(m_image_ctx.md_lock);
+
+  {
+    Mutex::Locker cache_locker(m_image_ctx.cache_lock);
+    RWLock::WLocker snap_locker(m_image_ctx.snap_lock);
+    RWLock::WLocker parent_locker(m_image_ctx.parent_lock);
+
+    m_image_ctx.size = m_size;
+    m_image_ctx.lockers = m_lockers;
+    m_image_ctx.lock_tag = m_lock_tag;
+    m_image_ctx.exclusive_locked = m_exclusive_locked;
+
+    if (m_image_ctx.old_format) {
+      // v1 images carry no features/flags; order and prefix come from the
+      // header read by handle_v1_read_header()
+      m_image_ctx.order = m_order;
+      m_image_ctx.features = 0;
+      m_image_ctx.flags = 0;
+      m_image_ctx.object_prefix = std::move(m_object_prefix);
+      m_image_ctx.init_layout();
+    } else {
+      m_image_ctx.features = m_features;
+      m_image_ctx.flags = m_flags;
+      m_image_ctx.parent_md = m_parent_md;
+    }
+
+    // any snapshot id not yet known to the image context requests a flush
+    // (see send_flush_aio())
+    for (size_t i = 0; i < m_snapc.snaps.size(); ++i) {
+      std::vector<librados::snap_t>::const_iterator it = std::find(
+        m_image_ctx.snaps.begin(), m_image_ctx.snaps.end(),
+        m_snapc.snaps[i].val);
+      if (it == m_image_ctx.snaps.end()) {
+        m_flush_aio = true;
+        ldout(cct, 20) << "new snapshot id=" << m_snapc.snaps[i].val
+                       << " name=" << m_snap_names[i]
+                       << " size=" << m_snap_sizes[i]
+                       << dendl;
+      }
+    }
+
+    // rebuild the snapshot tables from scratch; the per-snapshot vectors are
+    // indexed in lock-step with m_snapc.snaps
+    m_image_ctx.snaps.clear();
+    m_image_ctx.snap_info.clear();
+    m_image_ctx.snap_ids.clear();
+    for (size_t i = 0; i < m_snapc.snaps.size(); ++i) {
+      // v1 images have no per-snapshot flags, protection status or parent
+      uint64_t flags = m_image_ctx.old_format ? 0 : m_snap_flags[i];
+      uint8_t protection_status = m_image_ctx.old_format ?
+        static_cast<uint8_t>(RBD_PROTECTION_STATUS_UNPROTECTED) :
+        m_snap_protection[i];
+      parent_info parent;
+      if (!m_image_ctx.old_format) {
+        parent = m_snap_parents[i];
+      }
+
+      m_image_ctx.add_snap(m_snap_names[i], m_snapc.snaps[i].val,
+                           m_snap_sizes[i], parent, protection_status, flags);
+    }
+    m_image_ctx.snapc = m_snapc;
+
+    // the snapshot the image is opened at no longer maps to the same id
+    if (m_image_ctx.snap_id != CEPH_NOSNAP &&
+        m_image_ctx.get_snap_id(m_image_ctx.snap_name) != m_image_ctx.snap_id) {
+      lderr(cct) << "tried to read from a snapshot that no longer exists: "
+                 << m_image_ctx.snap_name << dendl;
+      m_image_ctx.snap_exists = false;
+    }
+
+    if (m_refresh_parent != nullptr) {
+      m_refresh_parent->apply();
+    }
+    m_image_ctx.data_ctx.selfmanaged_snap_set_write_ctx(m_image_ctx.snapc.seq,
+                                                        m_image_ctx.snaps);
+
+    // handle dynamically enabled / disabled features
+    // each swap fires when the feature is now off or when this request holds
+    // a new object; afterwards the m_* member holds whatever the image
+    // context owned before, for the later states / destructor to release
+    if (!m_image_ctx.test_features(RBD_FEATURE_EXCLUSIVE_LOCK,
+                                   m_image_ctx.snap_lock) ||
+        m_exclusive_lock != nullptr) {
+      std::swap(m_exclusive_lock, m_image_ctx.exclusive_lock);
+    }
+    if (!m_image_ctx.test_features(RBD_FEATURE_JOURNALING,
+                                   m_image_ctx.snap_lock) ||
+        m_journal != nullptr) {
+      std::swap(m_journal, m_image_ctx.journal);
+    }
+    if (!m_image_ctx.test_features(RBD_FEATURE_OBJECT_MAP,
+                                   m_image_ctx.snap_lock) ||
+        m_object_map != nullptr) {
+      std::swap(m_object_map, m_image_ctx.object_map);
+    }
+  }
+}
+
+// Looks up the parent info for snap_id in the refreshed metadata: HEAD
+// (CEPH_NOSNAP) maps to m_parent_md, a snapshot id to its entry in
+// m_snap_parents. Returns 0 on success, -ENOENT for an unknown snapshot id.
+template <typename I>
+int RefreshRequest<I>::get_parent_info(uint64_t snap_id,
+                                       parent_info *parent_md) {
+  if (snap_id == CEPH_NOSNAP) {
+    *parent_md = m_parent_md;
+    return 0;
+  }
+
+  for (size_t idx = 0; idx < m_snapc.snaps.size(); ++idx) {
+    if (m_snapc.snaps[idx].val != snap_id) {
+      continue;
+    }
+    *parent_md = m_snap_parents[idx];
+    return 0;
+  }
+  return -ENOENT;
+}
+
+} // namespace image
+} // namespace librbd
+
+template class librbd::image::RefreshRequest<librbd::ImageCtx>;
diff --git a/src/librbd/image/RefreshRequest.h b/src/librbd/image/RefreshRequest.h
new file mode 100644
index 0000000..25eda66
--- /dev/null
+++ b/src/librbd/image/RefreshRequest.h
@@ -0,0 +1,189 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IMAGE_REFRESH_REQUEST_H
+#define CEPH_LIBRBD_IMAGE_REFRESH_REQUEST_H
+
+#include "include/int_types.h"
+#include "include/buffer.h"
+#include "include/rbd_types.h"
+#include "common/snap_types.h"
+#include "cls/lock/cls_lock_types.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/parent_types.h"
+#include <string>
+#include <vector>
+
+class Context;
+
+namespace librbd {
+
+template <typename> class ExclusiveLock;
+class ImageCtx;
+class Journal;
+class ObjectMap;
+
+namespace image {
+
+template<typename> class RefreshParentRequest;
+
+// State machine that re-reads the metadata of an image (v1 or v2 format)
+// and applies it to the image context; see the diagram below for the states.
+template<typename ImageCtxT = ImageCtx>
+class RefreshRequest {
+public:
+  // Heap-allocates a new request; the raw pointer is returned to the caller.
+  static  RefreshRequest *create(ImageCtxT &image_ctx, Context *on_finish) {
+    return new RefreshRequest(image_ctx, on_finish);
+  }
+
+  ~RefreshRequest();
+
+  // Starts the state machine (v1 or v2 branch depending on the image format).
+  void send();
+
+private:
+  /**
+   * @verbatim
+   *
+   * <start>
+   *    |
+   *    | (v1)
+   *    |-----> V1_READ_HEADER ---> V1_GET_SNAPSHOTS ---> V1_GET_LOCKS
+   *    |                                                     |
+   *    | (v2)                                                v
+   *    \-----> V2_GET_MUTABLE_METADATA                    <apply>
+   *                |                                         |
+   *                v                                         |
+   *            V2_GET_FLAGS                                  |
+   *                |                                         |
+   *                v                                         |
+   *            V2_GET_SNAPSHOTS (skip if no snaps)           |
+   *                |                                         |
+   *                v                                         |
+   *            V2_REFRESH_PARENT (skip if no parent or       |
+   *                |              refresh not needed)        |
+   *                v                                         |
+   *            V2_INIT_EXCLUSIVE_LOCK (skip if lock          |
+   *                |                   active or disabled)   |
+   *                v                                         |
+   *            V2_OPEN_JOURNAL (skip if journal              |
+   *                |            active or disabled)          |
+   *                v                                         |
+   *            V2_OPEN_OBJECT_MAP (skip if map               |
+   *                |               active or disabled)       |
+   *                v                                         |
+   *             <apply>                                      |
+   *                |                                         |
+   *                v                                         |
+   *            V2_FINALIZE_REFRESH_PARENT (skip if refresh   |
+   *                |                       not needed)       |
+   *  (error)       v                                         |
+   *  * * * * > V2_SHUT_DOWN_EXCLUSIVE_LOCK (skip if lock     |
+   *                |                      active or enabled) |
+   *                v                                         |
+   *            V2_CLOSE_JOURNAL (skip if journal inactive    |
+   *                |             or enabled)                 |
+   *                v                                         |
+   *            V2_CLOSE_OBJECT_MAP (skip if map inactive     |
+   *                |                or enabled)              |
+   *                |                                         |
+   *                \-------------------\/--------------------/
+   *                                    |
+   *                                    v
+   *                                  FLUSH (skip if no new
+   *                                    |    snapshots)
+   *                                    v
+   *                                 <finish>
+   *
+   * @endverbatim
+   */
+
+  // image being refreshed and the completion returned when the machine ends
+  ImageCtxT &m_image_ctx;
+  Context *m_on_finish;
+
+  // first negative result recorded through save_result() (0 == no error)
+  int m_error_result;
+  // set by apply() when a snapshot id unknown to the image context is seen
+  bool m_flush_aio;
+  // objects created by this request until apply() swaps them into the image
+  // context; after the swap they hold whatever the image context owned
+  decltype(m_image_ctx.exclusive_lock) m_exclusive_lock;
+  decltype(m_image_ctx.object_map) m_object_map;
+  decltype(m_image_ctx.journal) m_journal;
+  RefreshParentRequest<ImageCtxT> *m_refresh_parent;
+
+  // reply buffer shared by all rados read operations of the state machine
+  bufferlist m_out_bl;
+
+  // image metadata gathered by the handlers and published by apply()
+  uint8_t m_order;
+  uint64_t m_size;
+  uint64_t m_features;
+  uint64_t m_incompatible_features;
+  uint64_t m_flags;
+  std::string m_object_prefix;
+  parent_info m_parent_md;
+
+  // snapshot metadata; the vectors are indexed in lock-step with
+  // m_snapc.snaps
+  ::SnapContext m_snapc;
+  std::vector<std::string> m_snap_names;
+  std::vector<uint64_t> m_snap_sizes;
+  std::vector<parent_info> m_snap_parents;
+  std::vector<uint8_t> m_snap_protection;
+  std::vector<uint64_t> m_snap_flags;
+
+  // lock information read from the image header object
+  std::map<rados::cls::lock::locker_id_t,
+           rados::cls::lock::locker_info_t> m_lockers;
+  std::string m_lock_tag;
+  bool m_exclusive_locked;
+
+  RefreshRequest(ImageCtxT &image_ctx, Context *on_finish);
+
+  // one send_*/handle_* pair per state of the diagram above; functions
+  // returning Context* yield the completion to fire, or nullptr while the
+  // state machine is still running
+  void send_v1_read_header();
+  Context *handle_v1_read_header(int *result);
+
+  void send_v1_get_snapshots();
+  Context *handle_v1_get_snapshots(int *result);
+
+  void send_v1_get_locks();
+  Context *handle_v1_get_locks(int *result);
+
+  void send_v2_get_mutable_metadata();
+  Context *handle_v2_get_mutable_metadata(int *result);
+
+  void send_v2_get_flags();
+  Context *handle_v2_get_flags(int *result);
+
+  Context *send_v2_get_snapshots();
+  Context *handle_v2_get_snapshots(int *result);
+
+  Context *send_v2_refresh_parent();
+  Context *handle_v2_refresh_parent(int *result);
+
+  Context *send_v2_init_exclusive_lock();
+  Context *handle_v2_init_exclusive_lock(int *result);
+
+  Context *send_v2_open_journal();
+  Context *handle_v2_open_journal(int *result);
+
+  Context *send_v2_open_object_map();
+  Context *handle_v2_open_object_map(int *result);
+
+  Context *send_v2_finalize_refresh_parent();
+  Context *handle_v2_finalize_refresh_parent(int *result);
+
+  Context *send_v2_shut_down_exclusive_lock();
+  Context *handle_v2_shut_down_exclusive_lock(int *result);
+
+  Context *send_v2_close_journal();
+  Context *handle_v2_close_journal(int *result);
+
+  Context *send_flush_aio();
+  Context *handle_flush_aio(int *result);
+
+  // Remembers *result only if it is negative and no error was stored before,
+  // i.e. the first failure wins.
+  void save_result(int *result) {
+    if (m_error_result == 0 && *result < 0) {
+      m_error_result = *result;
+    }
+  }
+
+  // apply() publishes the gathered metadata into the image context;
+  // get_parent_info() looks up the parent of HEAD or of a snapshot id
+  void apply();
+  int get_parent_info(uint64_t snap_id, parent_info *parent_md);
+};
+
+} // namespace image
+} // namespace librbd
+
+extern template class librbd::image::RefreshRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_IMAGE_REFRESH_REQUEST_H
diff --git a/src/librbd/image/SetSnapRequest.cc b/src/librbd/image/SetSnapRequest.cc
new file mode 100644
index 0000000..9e175b2
--- /dev/null
+++ b/src/librbd/image/SetSnapRequest.cc
@@ -0,0 +1,342 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/image/SetSnapRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "librbd/AioImageRequestWQ.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ObjectMap.h"
+#include "librbd/Utils.h"
+#include "librbd/image/RefreshParentRequest.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::image::SetSnapRequest: "
+
+namespace librbd {
+namespace image {
+
+using util::create_context_callback;
+
+template <typename I>
+SetSnapRequest<I>::SetSnapRequest(I &image_ctx, const std::string &snap_name,
+                                  Context *on_finish)
+  : m_image_ctx(image_ctx), m_snap_name(snap_name), m_on_finish(on_finish),
+    m_snap_id(CEPH_NOSNAP), m_exclusive_lock(nullptr), m_object_map(nullptr),
+    m_refresh_parent(nullptr), m_writes_blocked(false) {
+}  // only stores arguments and defaults; send() starts the actual work
+
+template <typename I>
+SetSnapRequest<I>::~SetSnapRequest() {  // releases whatever this request still owns
+  delete m_refresh_parent;
+  delete m_object_map;  // after apply() this is whatever was swapped out of the image ctx
+  delete m_exclusive_lock;  // likewise swapped with m_image_ctx.exclusive_lock in apply()
+  if (m_writes_blocked) {  // set by send_block_writes(); undone when the request dies
+    m_image_ctx.aio_work_queue->unblock_writes();
+  }
+}
+
+template <typename I>
+void SetSnapRequest<I>::send() {  // entry point: choose the state machine branch
+  if (!m_snap_name.empty()) {
+    send_block_writes();         // a snapshot name was given: "set snap" branch
+  } else {
+    send_init_exclusive_lock();  // empty name: "unset snap" branch
+  }
+}
+
+template <typename I>
+void SetSnapRequest<I>::send_init_exclusive_lock() {  // first state of the "unset snap" branch
+  {
+    RWLock::RLocker snap_locker(m_image_ctx.snap_lock);
+    if (m_image_ctx.exclusive_lock != nullptr) {  // a lock object already exists: done
+      assert(m_image_ctx.snap_id == CEPH_NOSNAP);
+      send_complete();
+      return;
+    }
+  }
+
+  if (!m_image_ctx.test_features(RBD_FEATURE_EXCLUSIVE_LOCK)) {
+    int r = 0;
+    if (send_refresh_parent(&r) != nullptr) {  // non-null: finished synchronously
+      send_complete();
+    }
+    return;  // feature disabled: never fall through and create a lock (was missing)
+  }
+
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 10) << __func__ << dendl;
+
+  m_exclusive_lock = ExclusiveLock<I>::create(m_image_ctx);  // installed later by apply()
+
+  using klass = SetSnapRequest<I>;
+  Context *ctx = create_context_callback<
+    klass, &klass::handle_init_exclusive_lock>(this);
+
+  RWLock::RLocker owner_locker(m_image_ctx.owner_lock);  // held across init()
+  m_exclusive_lock->init(ctx);
+}
+
+template <typename I>
+Context *SetSnapRequest<I>::handle_init_exclusive_lock(int *result) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 10) << __func__ << ": r=" << *result << dendl;
+
+  if (*result >= 0) {  // lock initialised: continue with the parent refresh
+    return send_refresh_parent(result);
+  }
+  lderr(cct) << "failed to initialize exclusive lock: "
+             << cpp_strerror(*result) << dendl;
+  return m_on_finish;  // hand the error in *result to the caller's context
+}
+
+template <typename I>
+void SetSnapRequest<I>::send_block_writes() {  // first state of the "set snap" branch
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 10) << __func__ << dendl;
+
+  m_writes_blocked = true;  // makes the destructor call unblock_writes()
+
+  using klass = SetSnapRequest<I>;
+  Context *ctx = create_context_callback<
+    klass, &klass::handle_block_writes>(this);
+
+  RWLock::RLocker owner_locker(m_image_ctx.owner_lock);  // held across block_writes()
+  m_image_ctx.aio_work_queue->block_writes(ctx);
+}
+
+template <typename I>
+Context *SetSnapRequest<I>::handle_block_writes(int *result) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 10) << __func__ << ": r=" << *result << dendl;
+
+  if (*result < 0) {
+    lderr(cct) << "failed to block writes: " << cpp_strerror(*result)
+               << dendl;
+    return m_on_finish;
+  }
+
+  {
+    RWLock::RLocker snap_locker(m_image_ctx.snap_lock);  // guards the name -> id lookup
+    m_snap_id = m_image_ctx.get_snap_id(m_snap_name);
+    if (m_snap_id == CEPH_NOSNAP) {  // lookup found no snapshot with that name
+      ldout(cct, 5) << "failed to locate snapshot '" << m_snap_name << "'"
+                    << dendl;
+
+      *result = -ENOENT;
+      return m_on_finish;
+    }
+  }
+
+  return send_shut_down_exclusive_lock(result);  // snap_lock already released here
+}
+
+template <typename I>
+Context *SetSnapRequest<I>::send_shut_down_exclusive_lock(int *result) {
+  ExclusiveLock<I> *exclusive_lock;
+  {
+    RWLock::RLocker snap_locker(m_image_ctx.snap_lock);  // only the pointer read is locked
+    exclusive_lock = m_image_ctx.exclusive_lock;
+  }
+
+  if (exclusive_lock == nullptr) {  // no lock object: skip straight to the next state
+    return send_refresh_parent(result);
+  }
+
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 10) << __func__ << dendl;
+
+  using klass = SetSnapRequest<I>;
+  Context *ctx = create_context_callback<
+    klass, &klass::handle_shut_down_exclusive_lock>(this);
+  exclusive_lock->shut_down(ctx);
+  return nullptr;  // async: handle_shut_down_exclusive_lock() continues
+}
+
+template <typename I>
+Context *SetSnapRequest<I>::handle_shut_down_exclusive_lock(int *result) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 10) << __func__ << ": r=" << *result << dendl;
+
+  if (*result >= 0) {  // lock shut down cleanly: continue with the parent refresh
+    return send_refresh_parent(result);
+  }
+
+  lderr(cct) << "failed to shut down exclusive lock: "
+             << cpp_strerror(*result) << dendl;
+  return m_on_finish;  // hand the error in *result to the caller's context
+}
+
+template <typename I>
+Context *SetSnapRequest<I>::send_refresh_parent(int *result) {
+  CephContext *cct = m_image_ctx.cct;
+
+  parent_info parent_md;  // copy of the parent info, usable after the locks drop
+  bool refresh_parent;
+  {
+    RWLock::RLocker snap_locker(m_image_ctx.snap_lock);
+    RWLock::RLocker parent_locker(m_image_ctx.parent_lock);
+
+    const parent_info *parent_info = m_image_ctx.get_parent_info(m_snap_id);  // local shadows the type name
+    if (parent_info == nullptr) {
+      *result = -ENOENT;
+      lderr(cct) << "failed to retrieve snapshot parent info" << dendl;
+      return m_on_finish;
+    }
+
+    parent_md = *parent_info;
+    refresh_parent = RefreshParentRequest<I>::is_refresh_required(m_image_ctx,
+                                                                  parent_md);
+  }
+
+  if (!refresh_parent) {  // no parent refresh needed: skip REFRESH_PARENT
+    if (m_snap_id == CEPH_NOSNAP) {
+      // object map is loaded when exclusive lock is acquired
+      *result = apply();
+      return m_on_finish;
+    } else {
+      // load snapshot object map
+      return send_open_object_map(result);
+    }
+  }
+
+  ldout(cct, 10) << __func__ << dendl;
+
+  using klass = SetSnapRequest<I>;
+  Context *ctx = create_context_callback<
+    klass, &klass::handle_refresh_parent>(this);
+  m_refresh_parent = RefreshParentRequest<I>::create(m_image_ctx, parent_md,
+                                                     ctx);
+  m_refresh_parent->send();
+  return nullptr;  // async: handle_refresh_parent() continues
+}
+
+template <typename I>
+Context *SetSnapRequest<I>::handle_refresh_parent(int *result) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 10) << __func__ << ": r=" << *result << dendl;
+
+  if (*result < 0) {
+    lderr(cct) << "failed to refresh snapshot parent: " << cpp_strerror(*result)
+               << dendl;
+    return m_on_finish;
+  }
+
+  if (m_snap_id != CEPH_NOSNAP) {
+    // a snapshot is being set: load the snapshot object map first
+    return send_open_object_map(result);
+  }
+
+  // no snapshot: object map is loaded when exclusive lock is acquired
+  *result = apply();
+  if (*result < 0) {
+    return m_on_finish;
+  }
+
+  return send_finalize_refresh_parent(result);
+}
+
+template <typename I>
+Context *SetSnapRequest<I>::send_open_object_map(int *result) {
+  if (!m_image_ctx.test_features(RBD_FEATURE_OBJECT_MAP)) {  // feature off: skip to <apply>
+    *result = apply();
+    if (*result < 0) {
+      return m_on_finish;
+    }
+
+    return send_finalize_refresh_parent(result);
+  }
+
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 10) << __func__ << dendl;
+
+  using klass = SetSnapRequest<I>;
+  Context *ctx = create_context_callback<
+    klass, &klass::handle_open_object_map>(this);
+  m_object_map = new ObjectMap(m_image_ctx, m_snap_id);  // swapped into the image ctx by apply()
+  m_object_map->open(ctx);
+  return nullptr;  // async: handle_open_object_map() continues
+}
+
+template <typename I>
+Context *SetSnapRequest<I>::handle_open_object_map(int *result) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 10) << __func__ << ": r=" << *result << dendl;
+
+  // object map should never report errors
+  assert(*result == 0);
+
+  *result = apply();  // commit the new snapshot state to the image context
+  if (*result < 0) {
+    return m_on_finish;
+  }
+
+  return send_finalize_refresh_parent(result);
+}
+
+template <typename I>
+Context *SetSnapRequest<I>::send_finalize_refresh_parent(int *result) {
+  if (m_refresh_parent == nullptr) {  // no parent refresh was started: nothing to finalize
+    return m_on_finish;
+  }
+
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 10) << this << " " << __func__ << dendl;
+
+  using klass = SetSnapRequest<I>;
+  Context *ctx = create_context_callback<
+    klass, &klass::handle_finalize_refresh_parent>(this);
+  m_refresh_parent->finalize(ctx);
+  return nullptr;  // async: handle_finalize_refresh_parent() continues
+}
+
+template <typename I>
+Context *SetSnapRequest<I>::handle_finalize_refresh_parent(int *result) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl;
+
+  if (*result < 0) {  // only logged; *result is passed on unchanged
+    lderr(cct) << "failed to close parent image: " << cpp_strerror(*result)
+               << dendl;
+  }
+  return m_on_finish;  // last state: always returns the caller's context
+}
+
+template <typename I>
+int SetSnapRequest<I>::apply() {  // commits the staged state; returns 0 or snap_set()'s error
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 10) << __func__ << dendl;
+
+  RWLock::WLocker snap_locker(m_image_ctx.snap_lock);
+  RWLock::WLocker parent_locker(m_image_ctx.parent_lock);
+
+  if (m_snap_id == CEPH_NOSNAP) {
+    m_image_ctx.snap_unset();
+  } else {
+    const int r = m_image_ctx.snap_set(m_snap_name);
+    if (r < 0) {
+      return r;  // nothing below runs, so no swap has happened yet
+    }
+  }
+
+  if (m_refresh_parent != nullptr) {
+    m_refresh_parent->apply();
+  }
+
+  std::swap(m_exclusive_lock, m_image_ctx.exclusive_lock);  // old values end up in this
+  std::swap(m_object_map, m_image_ctx.object_map);          // request; the dtor deletes them
+  return 0;
+}
+
+template <typename I>
+void SetSnapRequest<I>::send_complete() {  // synchronous finish used by send_init_exclusive_lock()
+  m_on_finish->complete(0);  // always reports success
+  delete this;  // the request destroys itself; no member may be touched afterwards
+}
+
+} // namespace image
+} // namespace librbd
+
+template class librbd::image::SetSnapRequest<librbd::ImageCtx>;
diff --git a/src/librbd/image/SetSnapRequest.h b/src/librbd/image/SetSnapRequest.h
new file mode 100644
index 0000000..815614e
--- /dev/null
+++ b/src/librbd/image/SetSnapRequest.h
@@ -0,0 +1,121 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IMAGE_SNAP_SET_REQUEST_H
+#define CEPH_LIBRBD_IMAGE_SNAP_SET_REQUEST_H
+
+#include "include/int_types.h"
+#include "librbd/parent_types.h"
+#include <string>
+
+class Context;
+
+namespace librbd {
+
+template <typename> class ExclusiveLock;
+class ImageCtx;
+class ObjectMap;
+
+namespace image {
+
+template <typename> class RefreshParentRequest;
+
+template <typename ImageCtxT = ImageCtx>
+class SetSnapRequest {  // async state machine that sets or unsets an image's snapshot
+public:
+  static SetSnapRequest *create(ImageCtxT &image_ctx,
+                                const std::string &snap_name,
+                                Context *on_finish) {
+    return new SetSnapRequest(image_ctx, snap_name, on_finish);  // ctor is private: heap only
+  }
+
+  ~SetSnapRequest();
+
+  void send();  // starts the request; an empty snap_name selects the lower branch below
+
+private:
+  /**
+   * @verbatim
+   *
+   * <start>
+   *    |
+   *    | (set snap)
+   *    |-----------> BLOCK_WRITES
+   *    |                 |
+   *    |                 v
+   *    |             SHUTDOWN_EXCLUSIVE_LOCK (skip if lock inactive
+   *    |                 |                    or disabled)
+   *    |                 v
+   *    |             REFRESH_PARENT (skip if no parent
+   *    |                 |           or refresh not needed)
+   *    |                 v
+   *    |             OPEN_OBJECT_MAP (skip if map disabled)
+   *    |                 |
+   *    |                 v
+   *    |              <apply>
+   *    |                 |
+   *    |                 v
+   *    |             FINALIZE_REFRESH_PARENT (skip if no parent
+   *    |                 |                    or refresh not needed)
+   *    |                 v
+   *    |             <finish>
+   *    |
+   *    \-----------> INIT_EXCLUSIVE_LOCK (skip if active or
+   *                      |                disabled)
+   *                      v
+   *                  REFRESH_PARENT (skip if no parent
+   *                      |           or refresh not needed)
+   *                      v
+   *                   <apply>
+   *                      |
+   *                      v
+   *                  FINALIZE_REFRESH_PARENT (skip if no parent
+   *                      |                    or refresh not needed)
+   *                      v
+   *                  <finish>
+   *
+   * @endverbatim
+   */
+
+  SetSnapRequest(ImageCtxT &image_ctx, const std::string &snap_name,
+                Context *on_finish);
+
+  ImageCtxT &m_image_ctx;
+  std::string m_snap_name;  // empty means "unset the snapshot"
+  Context *m_on_finish;  // caller's completion context
+
+  uint64_t m_snap_id;  // CEPH_NOSNAP until resolved from m_snap_name
+  ExclusiveLock<ImageCtxT> *m_exclusive_lock;  // staged here, swapped with the image ctx in apply()
+  ObjectMap *m_object_map;  // staged here, swapped with the image ctx in apply()
+  RefreshParentRequest<ImageCtxT> *m_refresh_parent;  // null unless a parent refresh was started
+
+  bool m_writes_blocked;  // true once block_writes() was issued; dtor unblocks
+
+  void send_block_writes();
+  Context *handle_block_writes(int *result);
+
+  void send_init_exclusive_lock();
+  Context *handle_init_exclusive_lock(int *result);
+
+  Context *send_shut_down_exclusive_lock(int *result);  // Context* senders: nullptr = async op pending
+  Context *handle_shut_down_exclusive_lock(int *result);
+
+  Context *send_refresh_parent(int *result);
+  Context *handle_refresh_parent(int *result);
+
+  Context *send_open_object_map(int *result);
+  Context *handle_open_object_map(int *result);
+
+  Context *send_finalize_refresh_parent(int *result);
+  Context *handle_finalize_refresh_parent(int *result);
+
+  int apply();  // commits snapshot, parent, lock and object map to the image ctx
+  void send_complete();  // completes m_on_finish with 0 and deletes this
+};
+
+} // namespace image
+} // namespace librbd
+
+extern template class librbd::image::SetSnapRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_IMAGE_SNAP_SET_REQUEST_H
diff --git a/src/librbd/internal.cc b/src/librbd/internal.cc
index d925b42..4172f45 100644
--- a/src/librbd/internal.cc
+++ b/src/librbd/internal.cc
@@ -5,12 +5,14 @@
 #include <errno.h>
 #include <limits.h>
 
+#include "include/types.h"
 #include "common/ceph_context.h"
 #include "common/dout.h"
 #include "common/errno.h"
 #include "common/ContextCompletion.h"
 #include "common/Throttle.h"
 #include "common/WorkQueue.h"
+#include "common/event_socket.h"
 #include "cls/lock/cls_lock_client.h"
 #include "include/stringify.h"
 
@@ -21,18 +23,28 @@
 #include "librbd/AioImageRequest.h"
 #include "librbd/AioImageRequestWQ.h"
 #include "librbd/AioObjectRequest.h"
-#include "librbd/AsyncFlattenRequest.h"
-#include "librbd/AsyncResizeRequest.h"
-#include "librbd/AsyncTrimRequest.h"
 #include "librbd/CopyupRequest.h"
 #include "librbd/DiffIterate.h"
+#include "librbd/ExclusiveLock.h"
 #include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
 #include "librbd/ImageWatcher.h"
 #include "librbd/internal.h"
 #include "librbd/Journal.h"
 #include "librbd/ObjectMap.h"
 #include "librbd/parent_types.h"
-#include "librbd/RebuildObjectMapRequest.h"
+#include "librbd/Utils.h"
+#include "librbd/operation/FlattenRequest.h"
+#include "librbd/operation/RebuildObjectMapRequest.h"
+#include "librbd/operation/RenameRequest.h"
+#include "librbd/operation/ResizeRequest.h"
+#include "librbd/operation/SnapshotCreateRequest.h"
+#include "librbd/operation/SnapshotProtectRequest.h"
+#include "librbd/operation/SnapshotRemoveRequest.h"
+#include "librbd/operation/SnapshotRenameRequest.h"
+#include "librbd/operation/SnapshotRollbackRequest.h"
+#include "librbd/operation/SnapshotUnprotectRequest.h"
+#include "librbd/operation/TrimRequest.h"
 #include "include/util.h"
 
 #include <boost/bind.hpp>
@@ -143,34 +155,33 @@ int prepare_image_update(ImageCtx *ictx) {
   assert(ictx->owner_lock.is_locked() && !ictx->owner_lock.is_wlocked());
   if (ictx->image_watcher == NULL) {
     return -EROFS;
-  } else if (!ictx->image_watcher->is_lock_supported() ||
-             ictx->image_watcher->is_lock_owner()) {
-    return 0;
   }
 
   // need to upgrade to a write lock
   int r = 0;
-  bool acquired_lock = false;
+  bool trying_lock = false;
+  C_SaferCond ctx;
   ictx->owner_lock.put_read();
   {
-    RWLock::WLocker l(ictx->owner_lock);
-    if (!ictx->image_watcher->is_lock_owner()) {
-      r = ictx->image_watcher->try_lock();
-      acquired_lock = ictx->image_watcher->is_lock_owner();
+    RWLock::WLocker owner_locker(ictx->owner_lock);
+    if (ictx->exclusive_lock != nullptr &&
+        !ictx->exclusive_lock->is_lock_owner()) {
+      ictx->exclusive_lock->try_lock(&ctx);
+      trying_lock = true;
     }
   }
-  if (acquired_lock) {
-    // finish any AIO that was previously waiting on acquiring the
-    // exclusive lock
-    ictx->flush_async_operations();
+
+  if (trying_lock) {
+    r = ctx.wait();
   }
   ictx->owner_lock.get_read();
+
   return r;
 }
 
 int invoke_async_request(ImageCtx *ictx, const std::string& request_type,
                          bool permit_snapshot,
-                         const boost::function<int(Context*)>& local_request,
+                         const boost::function<void(Context*)>& local_request,
                          const boost::function<int()>& remote_request) {
   int r;
   do {
@@ -185,11 +196,11 @@ int invoke_async_request(ImageCtx *ictx, const std::string& request_type,
         }
       }
 
-      while (ictx->image_watcher->is_lock_supported()) {
+      while (ictx->exclusive_lock != nullptr) {
         r = prepare_image_update(ictx);
         if (r < 0) {
           return -EROFS;
-        } else if (ictx->image_watcher->is_lock_owner()) {
+        } else if (ictx->exclusive_lock->is_lock_owner()) {
           break;
         }
 
@@ -201,10 +212,7 @@ int invoke_async_request(ImageCtx *ictx, const std::string& request_type,
                             << dendl;
       }
 
-      r = local_request(&ctx);
-      if (r < 0) {
-        return r;
-      }
+      local_request(&ctx);
     }
 
     r = ctx.wait();
@@ -216,26 +224,42 @@ int invoke_async_request(ImageCtx *ictx, const std::string& request_type,
   return r;
 }
 
-} // anonymous namespace
-
-  const string id_obj_name(const string &name)
-  {
-    return RBD_ID_PREFIX + name;
+int validate_pool(IoCtx &io_ctx, CephContext *cct) {
+  if (!cct->_conf->rbd_validate_pool) {
+    return 0;
   }
 
-  const string header_name(const string &image_id)
-  {
-    return RBD_HEADER_PREFIX + image_id;
+  int r = io_ctx.stat(RBD_DIRECTORY, NULL, NULL);
+  if (r == 0) {
+    return 0;
+  } else if (r < 0 && r != -ENOENT) {
+    lderr(cct) << "failed to stat RBD directory: " << cpp_strerror(r) << dendl;
+    return r;
   }
 
-  const string old_header_name(const string &image_name)
-  {
-    return image_name + RBD_SUFFIX;
+  // allocate a self-managed snapshot id if this is a new pool to force
+  // self-managed snapshot mode
+  uint64_t snap_id;
+  r = io_ctx.selfmanaged_snap_create(&snap_id);
+  if (r == -EINVAL) {
+    lderr(cct) << "pool not configured for self-managed RBD snapshot support"
+               << dendl;
+    return r;
+  } else if (r < 0) {
+    lderr(cct) << "failed to allocate self-managed snapshot: "
+               << cpp_strerror(r) << dendl;
+    return r;
   }
 
-  std::string unique_lock_name(const std::string &name, void *address) {
-    return name + " (" + stringify(address) + ")";
+  r = io_ctx.selfmanaged_snap_remove(snap_id);
+  if (r < 0) {
+    lderr(cct) << "failed to release self-managed snapshot " << snap_id
+               << ": " << cpp_strerror(r) << dendl;
   }
+  return 0;
+}
+
+} // anonymous namespace
 
   int detect_format(IoCtx &io_ctx, const string &name,
 		    bool *old_format, uint64_t *size)
@@ -243,11 +267,11 @@ int invoke_async_request(ImageCtx *ictx, const std::string& request_type,
     CephContext *cct = (CephContext *)io_ctx.cct();
     if (old_format)
       *old_format = true;
-    int r = io_ctx.stat(old_header_name(name), size, NULL);
+    int r = io_ctx.stat(util::old_header_name(name), size, NULL);
     if (r == -ENOENT) {
       if (old_format)
 	*old_format = false;
-      r = io_ctx.stat(id_obj_name(name), size, NULL);
+      r = io_ctx.stat(util::id_obj_name(name), size, NULL);
       if (r < 0)
 	return r;
     } else if (r < 0) {
@@ -321,13 +345,13 @@ int invoke_async_request(ImageCtx *ictx, const std::string& request_type,
   void trim_image(ImageCtx *ictx, uint64_t newsize, ProgressContext& prog_ctx)
   {
     assert(ictx->owner_lock.is_locked());
-    assert(!ictx->image_watcher->is_lock_supported() ||
-	   ictx->image_watcher->is_lock_owner());
+    assert(ictx->exclusive_lock == nullptr ||
+	   ictx->exclusive_lock->is_lock_owner());
 
     C_SaferCond ctx;
     ictx->snap_lock.get_read();
-    AsyncTrimRequest *req = new AsyncTrimRequest(*ictx, &ctx, ictx->size,
-						 newsize, prog_ctx);
+    operation::TrimRequest<> *req = new operation::TrimRequest<>(
+      *ictx, &ctx, ictx->size, newsize, prog_ctx);
     ictx->snap_lock.put_read();
     req->send();
 
@@ -368,11 +392,7 @@ int invoke_async_request(ImageCtx *ictx, const std::string& request_type,
   int notify_change(IoCtx& io_ctx, const string& oid, ImageCtx *ictx)
   {
     if (ictx) {
-      ictx->refresh_lock.Lock();
-      ldout(ictx->cct, 20) << "notify_change refresh_seq = " << ictx->refresh_seq
-			   << " last_refresh = " << ictx->last_refresh << dendl;
-      ++ictx->refresh_seq;
-      ictx->refresh_lock.Unlock();
+      ictx->state->handle_update_notification();
     }
 
     ImageWatcher::notify_header_update(io_ctx, oid);
@@ -427,8 +447,75 @@ int invoke_async_request(ImageCtx *ictx, const std::string& request_type,
     {RBD_IMAGE_OPTION_ORDER, UINT64},
     {RBD_IMAGE_OPTION_STRIPE_UNIT, UINT64},
     {RBD_IMAGE_OPTION_STRIPE_COUNT, UINT64},
+    {RBD_IMAGE_OPTION_JOURNAL_ORDER, UINT64},
+    {RBD_IMAGE_OPTION_JOURNAL_SPLAY_WIDTH, UINT64},
+    {RBD_IMAGE_OPTION_JOURNAL_POOL, STR},
   };
 
+  std::string image_option_name(int optname) {  // RBD_IMAGE_OPTION_* id -> printable name
+    switch (optname) {
+    case RBD_IMAGE_OPTION_FORMAT:
+      return "format";
+    case RBD_IMAGE_OPTION_FEATURES:
+      return "features";
+    case RBD_IMAGE_OPTION_ORDER:
+      return "order";
+    case RBD_IMAGE_OPTION_STRIPE_UNIT:
+      return "stripe_unit";
+    case RBD_IMAGE_OPTION_STRIPE_COUNT:
+      return "stripe_count";
+    case RBD_IMAGE_OPTION_JOURNAL_ORDER:
+      return "journal_order";
+    case RBD_IMAGE_OPTION_JOURNAL_SPLAY_WIDTH:
+      return "journal_splay_width";
+    case RBD_IMAGE_OPTION_JOURNAL_POOL:
+      return "journal_pool";
+    default:  // unrecognised ids still print, with their numeric value
+      return "unknown (" + stringify(optname) + ")";
+    }
+  }
+
+  std::ostream &operator<<(std::ostream &os, rbd_image_options_t &opts) {
+    // prints every stored option as "[name=value, name=value]"
+    image_options_ref* ref = static_cast<image_options_ref*>(opts);
+    const char *separator = "";  // becomes ", " after the first entry
+
+    os << "[";
+    for (image_options_t::const_iterator it = (*ref)->begin();
+         it != (*ref)->end(); ++it) {
+      os << separator << image_option_name(it->first) << "=" << it->second;
+      separator = ", ";
+    }
+    os << "]";
+
+    return os;
+  }
+
+  std::ostream &operator<<(std::ostream &os, ImageOptions &opts) {
+    const char *separator = "";  // becomes ", " after the first printed option
+    os << "[";
+    for (auto &entry : IMAGE_OPTIONS_TYPE_MAPPING) {
+      if (entry.second == STR) {
+        std::string str_val;
+        if (opts.get(entry.first, &str_val) != 0) {  // get() failed: print nothing
+          continue;
+        }
+        os << separator << image_option_name(entry.first) << "=" << str_val;
+      } else if (entry.second == UINT64) {
+        uint64_t int_val;
+        if (opts.get(entry.first, &int_val) != 0) {  // get() failed: print nothing
+          continue;
+        }
+        os << separator << image_option_name(entry.first) << "=" << int_val;
+      } else {
+        continue;  // any other type tag is ignored
+      }
+      separator = ", ";
+    }
+    os << "]";
+    return os;
+  }
+
   void image_options_create(rbd_image_options_t* opts)
   {
     image_options_ref* opts_ = new image_options_ref(new image_options_t());
@@ -562,54 +649,6 @@ int invoke_async_request(ImageCtx *ictx, const std::string& request_type,
     return (*opts_)->empty();
   }
 
-  void rollback_object(ImageCtx *ictx, uint64_t snap_id, const string& oid,
-		       SimpleThrottle& throttle)
-  {
-    Context *req_comp = new C_SimpleThrottle(&throttle);
-    librados::AioCompletion *rados_completion =
-      librados::Rados::aio_create_completion(req_comp, NULL, rados_ctx_cb);
-    librados::ObjectWriteOperation op;
-    op.selfmanaged_snap_rollback(snap_id);
-    ictx->data_ctx.aio_operate(oid, rados_completion, &op);
-    ldout(ictx->cct, 10) << "scheduling selfmanaged_snap_rollback on "
-                         << oid << " to " << snap_id << dendl;
-    rados_completion->release();
-  }
-
-  int rollback_image(ImageCtx *ictx, uint64_t snap_id,
-		     ProgressContext& prog_ctx)
-  {
-    uint64_t bsize = ictx->get_object_size();
-    uint64_t numseg;
-    {
-      RWLock::RLocker l(ictx->snap_lock);
-      numseg = Striper::get_num_objects(ictx->layout, ictx->get_current_size());
-    }
-
-    int r;
-    CephContext *cct = ictx->cct;
-    SimpleThrottle throttle(ictx->concurrent_management_ops, true);
-
-    for (uint64_t i = 0; i < numseg; i++) {
-      string oid = ictx->get_object_name(i);
-      rollback_object(ictx, snap_id, ictx->get_object_name(i), throttle);
-      prog_ctx.update_progress(i * bsize, numseg * bsize);
-    }
-
-    r = throttle.wait_for_ret();
-    if (r < 0) {
-      ldout(cct, 10) << "failed to rollback at least one object: "
-		     << cpp_strerror(r) << dendl;
-      return r;
-    }
-
-    {
-      RWLock::WLocker l(ictx->snap_lock);
-      ictx->object_map.rollback(snap_id);
-    }
-    return 0;
-  }
-
   int list(IoCtx& io_ctx, vector<string>& names)
   {
     CephContext *cct = (CephContext *)io_ctx.cct();
@@ -657,7 +696,7 @@ int invoke_async_request(ImageCtx *ictx, const std::string& request_type,
     CephContext *cct = ictx->cct;
     ldout(cct, 20) << "children list " << ictx->name << dendl;
 
-    int r = ictx_check(ictx);
+    int r = ictx->state->refresh_if_required();
     if (r < 0)
       return r;
 
@@ -735,7 +774,7 @@ int invoke_async_request(ImageCtx *ictx, const std::string& request_type,
       return -EROFS;
     }
 
-    int r = ictx_check(ictx);
+    int r = ictx->state->refresh_if_required();
     if (r < 0)
       return r;
 
@@ -760,38 +799,104 @@ int invoke_async_request(ImageCtx *ictx, const std::string& request_type,
     return 0;
   }
 
-  int snap_create_helper(ImageCtx* ictx, Context* ctx,
-                         const char* snap_name) {
+  void snap_create_helper(ImageCtx* ictx, Context* ctx, const char* snap_name) {
     assert(ictx->owner_lock.is_locked());
-    assert(!ictx->image_watcher->is_lock_supported() ||
-	   ictx->image_watcher->is_lock_owner());
+    assert(ictx->exclusive_lock == nullptr ||
+	   ictx->exclusive_lock->is_lock_owner());
 
     ldout(ictx->cct, 20) << "snap_create_helper " << ictx << " " << snap_name
                          << dendl;
 
-    int r = ictx_check(ictx, ictx->owner_lock);
-    if (r < 0) {
+    operation::SnapshotCreateRequest<> *req =
+      new operation::SnapshotCreateRequest<>(*ictx, ctx, snap_name);
+    req->send();
+  }
+
+  int snap_remove(ImageCtx *ictx, const char *snap_name)
+  {
+    ldout(ictx->cct, 20) << "snap_remove " << ictx << " " << snap_name << dendl;
+
+    if (ictx->read_only)
+      return -EROFS;
+
+    int r = ictx->state->refresh_if_required();
+    if (r < 0)
       return r;
+
+    bool proxy_op = false;
+    {
+      RWLock::RLocker snap_locker(ictx->snap_lock);
+      if (ictx->get_snap_id(snap_name) == CEPH_NOSNAP) {
+        return -ENOENT;
+      }
+      proxy_op = ((ictx->features & RBD_FEATURE_FAST_DIFF) != 0 ||
+                  (ictx->features & RBD_FEATURE_JOURNALING) != 0);
     }
 
-    RWLock::WLocker md_locker(ictx->md_lock);
-    r = ictx->flush();
-    if (r < 0) {
-      return r;
+    if (proxy_op) {
+      r = invoke_async_request(ictx, "snap_remove", true,
+                               boost::bind(&snap_remove_helper, ictx, _1,
+                                           snap_name),
+                               boost::bind(&ImageWatcher::notify_snap_remove,
+                                           ictx->image_watcher, snap_name));
+      if (r < 0 && r != -ENOENT) {
+        return r;
+      }
+    } else {
+      RWLock::RLocker owner_lock(ictx->owner_lock);
+      C_SaferCond cond_ctx;
+      snap_remove_helper(ictx, &cond_ctx, snap_name);
+
+      r = cond_ctx.wait();
+      if (r < 0) {
+        return r;
+      }
     }
 
-    do {
-      r = add_snap(ictx, snap_name);
-    } while (r == -ESTALE);
+    notify_change(ictx->md_ctx, ictx->header_oid, ictx);
 
-    if (r < 0) {
-      return r;
+    ictx->perfcounter->inc(l_librbd_snap_remove);
+    return 0;
+  }
+
+  void snap_remove_helper(ImageCtx *ictx, Context *ctx, const char *snap_name)
+  {
+    assert(ictx->owner_lock.is_locked());
+    {
+      if ((ictx->features & RBD_FEATURE_FAST_DIFF) != 0) {
+        assert(ictx->exclusive_lock == nullptr ||
+	       ictx->exclusive_lock->is_lock_owner());
+      }
     }
 
-    if (ctx != NULL) {
-      ctx->complete(0);
+    ldout(ictx->cct, 20) << "snap_remove_helper " << ictx << " " << snap_name
+                         << dendl;
+
+    uint64_t snap_id;
+    {
+      RWLock::RLocker snap_locker(ictx->snap_lock);
+      snap_id = ictx->get_snap_id(snap_name);
+      if (snap_id == CEPH_NOSNAP) {
+        lderr(ictx->cct) << "No such snapshot found." << dendl;
+        ctx->complete(-ENOENT);
+        return;
+      }
+
+      bool is_protected;
+      int r = ictx->is_snap_protected(snap_id, &is_protected);
+      if (r < 0) {
+        ctx->complete(r);
+        return;
+      } else if (is_protected) {
+        lderr(ictx->cct) << "snapshot is protected" << dendl;
+        ctx->complete(-EBUSY);
+        return;
+      }
     }
-    return 0;
+
+    operation::SnapshotRemoveRequest<> *req =
+      new operation::SnapshotRemoveRequest<>(*ictx, ctx, snap_name, snap_id);
+    req->send();
   }
 
   int snap_rename(ImageCtx *ictx, const char *srcname, const char *dstname)
@@ -803,7 +908,7 @@ int invoke_async_request(ImageCtx *ictx, const std::string& request_type,
       return -EROFS;
     }
 
-    int r = ictx_check(ictx);
+    int r = ictx->state->refresh_if_required();
     if (r < 0)
       return r;
 
@@ -818,14 +923,25 @@ int invoke_async_request(ImageCtx *ictx, const std::string& request_type,
       }
     }
 
-    r = invoke_async_request(ictx, "snap_rename", true,
-                             boost::bind(&snap_rename_helper, ictx, _1,
-                                         snap_id, dstname),
-                             boost::bind(&ImageWatcher::notify_snap_rename,
-                                         ictx->image_watcher, snap_id,
-					 dstname));
-    if (r < 0 && r != -EEXIST) {
-      return r;
+    if (ictx->test_features(RBD_FEATURE_JOURNALING)) {
+      r = invoke_async_request(ictx, "snap_rename", true,
+                               boost::bind(&snap_rename_helper, ictx, _1,
+                                           snap_id, dstname),
+                               boost::bind(&ImageWatcher::notify_snap_rename,
+                                           ictx->image_watcher, snap_id,
+                                           dstname));
+      if (r < 0 && r != -EEXIST) {
+        return r;
+      }
+    } else {
+      RWLock::RLocker owner_lock(ictx->owner_lock);
+      C_SaferCond cond_ctx;
+      snap_rename_helper(ictx, &cond_ctx, snap_id, dstname);
+
+      r = cond_ctx.wait();
+      if (r < 0) {
+        return r;
+      }
     }
 
     ictx->perfcounter->inc(l_librbd_snap_rename);
@@ -833,339 +949,164 @@ int invoke_async_request(ImageCtx *ictx, const std::string& request_type,
     return 0;
   }
 
-  int snap_rename_helper(ImageCtx* ictx, Context* ctx,
-                         const uint64_t src_snap_id,
-                         const char* dst_name) {
+  void snap_rename_helper(ImageCtx* ictx, Context* ctx,
+                          const uint64_t src_snap_id, const char* dst_name) {
     assert(ictx->owner_lock.is_locked());
-    assert(!ictx->image_watcher->is_lock_supported() ||
-	   ictx->image_watcher->is_lock_owner());
-
-    ldout(ictx->cct, 20) << __func__ << " " << ictx << " from " 
-			 << src_snap_id << " to " << dst_name << dendl;
-
-    int r = ictx_check(ictx, ictx->owner_lock);
-    if (r < 0) {
-      return r;
-    }
-    r = rename_snap(ictx, src_snap_id, dst_name);
-
-    if (r < 0) {
-      return r;
-    }
-
-    if (ctx != NULL) {
-      ctx->complete(0);
+    if ((ictx->features & RBD_FEATURE_JOURNALING) != 0) {
+      assert(ictx->exclusive_lock == nullptr ||
+	     ictx->exclusive_lock->is_lock_owner());
     }
-    return 0;
-  }
+    ldout(ictx->cct, 20) << __func__ << " " << ictx << " from "
+                         << src_snap_id << " to " << dst_name << dendl;
 
-  static int scan_for_parents(ImageCtx *ictx, parent_spec &pspec,
-			      snapid_t oursnap_id)
-  {
-    if (pspec.pool_id != -1) {
-      map<snap_t, SnapInfo>::iterator it;
-      for (it = ictx->snap_info.begin();
-	   it != ictx->snap_info.end(); ++it) {
-	// skip our snap id (if checking base image, CEPH_NOSNAP won't match)
-	if (it->first == oursnap_id)
-	  continue;
-	if (it->second.parent.spec == pspec)
-	  break;
-      }
-      if (it == ictx->snap_info.end())
-	return -ENOENT;
-    }
-    return 0;
+    operation::SnapshotRenameRequest<> *req =
+      new operation::SnapshotRenameRequest<>(*ictx, ctx, src_snap_id, dst_name);
+    req->send();
   }
 
-  int snap_remove(ImageCtx *ictx, const char *snap_name)
+  int snap_protect(ImageCtx *ictx, const char *snap_name)
   {
-    ldout(ictx->cct, 20) << "snap_remove " << ictx << " " << snap_name << dendl;
+    ldout(ictx->cct, 20) << "snap_protect " << ictx << " " << snap_name
+			 << dendl;
 
-    if (ictx->read_only)
+    if (ictx->read_only) {
       return -EROFS;
+    }
 
-    int r = ictx_check(ictx);
-    if (r < 0)
+    int r = ictx->state->refresh_if_required();
+    if (r < 0) {
       return r;
+    }
 
-    bool fast_diff_enabled = false;
     {
       RWLock::RLocker snap_locker(ictx->snap_lock);
-      if (ictx->get_snap_id(snap_name) == CEPH_NOSNAP) {
-        return -ENOENT;
+      bool is_protected;
+      r = ictx->is_snap_protected(ictx->get_snap_id(snap_name), &is_protected);
+      if (r < 0) {
+        return r;
+      }
+
+      if (is_protected) {
+        return -EBUSY;
       }
-      fast_diff_enabled = ((ictx->features & RBD_FEATURE_FAST_DIFF) != 0);
     }
 
-    if (fast_diff_enabled) {
-      r = invoke_async_request(ictx, "snap_remove", true,
-                               boost::bind(&snap_remove_helper, ictx, _1,
+    if (ictx->test_features(RBD_FEATURE_JOURNALING)) {
+      r = invoke_async_request(ictx, "snap_protect", true,
+                               boost::bind(&snap_protect_helper, ictx, _1,
                                            snap_name),
-                               boost::bind(&ImageWatcher::notify_snap_remove,
+                               boost::bind(&ImageWatcher::notify_snap_protect,
                                            ictx->image_watcher, snap_name));
-      if (r < 0 && r != -EEXIST) {
+      if (r < 0 && r != -EBUSY) {
         return r;
       }
     } else {
       RWLock::RLocker owner_lock(ictx->owner_lock);
-      r = snap_remove_helper(ictx, NULL, snap_name);
+      C_SaferCond cond_ctx;
+      snap_protect_helper(ictx, &cond_ctx, snap_name);
+
+      r = cond_ctx.wait();
       if (r < 0) {
         return r;
       }
     }
 
     notify_change(ictx->md_ctx, ictx->header_oid, ictx);
-
-    ictx->perfcounter->inc(l_librbd_snap_remove);
     return 0;
   }
 
-  int snap_remove_helper(ImageCtx *ictx, Context *ctx, const char *snap_name)
+  void snap_protect_helper(ImageCtx *ictx, Context* ctx, const char *snap_name)
   {
     assert(ictx->owner_lock.is_locked());
-    {
-      if ((ictx->features & RBD_FEATURE_FAST_DIFF) != 0) {
-        assert(!ictx->image_watcher->is_lock_supported() ||
-               ictx->image_watcher->is_lock_owner());
-      }
+    if (ictx->test_features(RBD_FEATURE_JOURNALING)) {
+      assert(ictx->exclusive_lock == nullptr ||
+	     ictx->exclusive_lock->is_lock_owner());
     }
 
-    ldout(ictx->cct, 20) << "snap_remove_helper " << ictx << " " << snap_name
+    ldout(ictx->cct, 20) << "snap_protect_helper " << ictx << " " << snap_name
                          << dendl;
 
-    int r = ictx_check(ictx, ictx->owner_lock);
+    operation::SnapshotProtectRequest<> *request =
+      new operation::SnapshotProtectRequest<>(*ictx, ctx, snap_name);
+    request->send();
+  }
+
+  int snap_unprotect(ImageCtx *ictx, const char *snap_name)
+  {
+    ldout(ictx->cct, 20) << "snap_unprotect " << ictx << " " << snap_name
+			 << dendl;
+
+    if (ictx->read_only) {
+      return -EROFS;
+    }
+
+    int r = ictx->state->refresh_if_required();
     if (r < 0) {
       return r;
     }
 
-    RWLock::RLocker md_locker(ictx->md_lock);
-    snap_t snap_id;
     {
-      RWLock::WLocker snap_locker(ictx->snap_lock);
-      snap_id = ictx->get_snap_id(snap_name);
-      if (snap_id == CEPH_NOSNAP) {
-        return -ENOENT;
-      }
-
-      r = ictx->object_map.snapshot_remove(snap_id);
+      RWLock::RLocker snap_locker(ictx->snap_lock);
+      bool is_unprotected;
+      r = ictx->is_snap_unprotected(ictx->get_snap_id(snap_name),
+                                    &is_unprotected);
       if (r < 0) {
-        lderr(ictx->cct) << "snap_remove: failed to remove snapshot object map"
-		         << dendl;
         return r;
       }
 
-      {
-        parent_spec our_pspec;
-        RWLock::RLocker parent_locker(ictx->parent_lock);
-        r = ictx->get_parent_spec(snap_id, &our_pspec);
-        if (r < 0) {
-	  lderr(ictx->cct) << "snap_remove: can't get parent spec" << dendl;
-	  return r;
-        }
+      if (is_unprotected) {
+        return -EINVAL;
+      }
+    }
 
-        if (ictx->parent_md.spec != our_pspec &&
-	    (scan_for_parents(ictx, our_pspec, snap_id) == -ENOENT)) {
-          r = cls_client::remove_child(&ictx->md_ctx, RBD_CHILDREN,
-				       our_pspec, ictx->id);
-	  if (r < 0 && r != -ENOENT) {
-            lderr(ictx->cct) << "snap_remove: failed to deregister from parent "
-                             << "image" << dendl;
-	    return r;
-          }
-        }
+    if (ictx->test_features(RBD_FEATURE_JOURNALING)) {
+      r = invoke_async_request(ictx, "snap_unprotect", true,
+                               boost::bind(&snap_unprotect_helper, ictx, _1,
+                                           snap_name),
+                               boost::bind(&ImageWatcher::notify_snap_unprotect,
+                                           ictx->image_watcher, snap_name));
+      if (r < 0 && r != -EINVAL) {
+        return r;
       }
+    } else {
+      RWLock::RLocker owner_lock(ictx->owner_lock);
+      C_SaferCond cond_ctx;
+      snap_unprotect_helper(ictx, &cond_ctx, snap_name);
 
-      r = rm_snap(ictx, snap_name, snap_id);
+      r = cond_ctx.wait();
       if (r < 0) {
         return r;
       }
     }
 
-    r = ictx->data_ctx.selfmanaged_snap_remove(snap_id);
-    if (r < 0) {
-      lderr(ictx->cct) << "snap_remove: failed to remove RADOS snapshot"
-                       << dendl;
-      return r;
-    }
-
-    if (ctx != NULL) {
-      ctx->complete(0);
-    }
+    notify_change(ictx->md_ctx, ictx->header_oid, ictx);
     return 0;
   }
 
-  int snap_protect(ImageCtx *ictx, const char *snap_name)
+  void snap_unprotect_helper(ImageCtx *ictx, Context* ctx,
+                             const char *snap_name)
   {
-    ldout(ictx->cct, 20) << "snap_protect " << ictx << " " << snap_name
-			 << dendl;
-
-    if (ictx->read_only)
-      return -EROFS;
-
-    int r = ictx_check(ictx);
-    if (r < 0)
-      return r;
-
-    RWLock::RLocker l(ictx->md_lock);
-    RWLock::RLocker l2(ictx->snap_lock);
-    if ((ictx->features & RBD_FEATURE_LAYERING) == 0) {
-      lderr(ictx->cct) << "snap_protect: image must support layering"
-		       << dendl;
-      return -ENOSYS;
+    assert(ictx->owner_lock.is_locked());
+    if (ictx->test_features(RBD_FEATURE_JOURNALING)) {
+      assert(ictx->exclusive_lock == nullptr ||
+	     ictx->exclusive_lock->is_lock_owner());
     }
-    snap_t snap_id = ictx->get_snap_id(snap_name);
-    if (snap_id == CEPH_NOSNAP)
-      return -ENOENT;
 
-    bool is_protected;
-    r = ictx->is_snap_protected(snap_id, &is_protected);
-    if (r < 0)
-      return r;
-
-    if (is_protected)
-      return -EBUSY;
+    ldout(ictx->cct, 20) << "snap_unprotect_helper " << ictx << " " << snap_name
+                         << dendl;
 
-    r = cls_client::set_protection_status(&ictx->md_ctx,
-					  ictx->header_oid,
-					  snap_id,
-					  RBD_PROTECTION_STATUS_PROTECTED);
-    if (r < 0)
-      return r;
-    notify_change(ictx->md_ctx, ictx->header_oid, ictx);
-    return 0;
+    operation::SnapshotUnprotectRequest<> *request =
+      new operation::SnapshotUnprotectRequest<>(*ictx, ctx, snap_name);
+    request->send();
   }
 
-  int snap_unprotect(ImageCtx *ictx, const char *snap_name)
+  int snap_is_protected(ImageCtx *ictx, const char *snap_name,
+			bool *is_protected)
   {
-    ldout(ictx->cct, 20) << "snap_unprotect " << ictx << " " << snap_name
+    ldout(ictx->cct, 20) << "snap_is_protected " << ictx << " " << snap_name
 			 << dendl;
 
-    if (ictx->read_only)
-      return -EROFS;
-
-    int r = ictx_check(ictx);
-    if (r < 0)
-      return r;
-
-    RWLock::RLocker l(ictx->md_lock);
-    RWLock::RLocker l2(ictx->snap_lock);
-    if ((ictx->features & RBD_FEATURE_LAYERING) == 0) {
-      lderr(ictx->cct) << "snap_unprotect: image must support layering"
-		       << dendl;
-      return -ENOSYS;
-    }
-    snap_t snap_id = ictx->get_snap_id(snap_name);
-    if (snap_id == CEPH_NOSNAP)
-      return -ENOENT;
-
-    bool is_unprotected;
-    r = ictx->is_snap_unprotected(snap_id, &is_unprotected);
-    if (r < 0)
-      return r;
-
-    if (is_unprotected) {
-      lderr(ictx->cct) << "snap_unprotect: snapshot is already unprotected"
-		       << dendl;
-      return -EINVAL;
-    }
-
-    r = cls_client::set_protection_status(&ictx->md_ctx,
-					  ictx->header_oid,
-					  snap_id,
-					  RBD_PROTECTION_STATUS_UNPROTECTING);
-    if (r < 0)
-      return r;
-    notify_change(ictx->md_ctx, ictx->header_oid, ictx);
-
-    parent_spec pspec(ictx->md_ctx.get_id(), ictx->id, snap_id);
-    // search all pools for children depending on this snapshot
-    Rados rados(ictx->md_ctx);
-    rados.wait_for_latest_osdmap();
-
-    // protect against pools being renamed/deleted
-    std::list<std::pair<int64_t, std::string> > pools;
-    rados.pool_list2(pools);
-
-    for (std::list<std::pair<int64_t, std::string> >::const_iterator it =
-         pools.begin(); it != pools.end(); ++it) {
-      int64_t base_tier;
-      r = rados.pool_get_base_tier(it->first, &base_tier);
-      if (r == -ENOENT) {
-        ldout(ictx->cct, 1) << "pool " << it->second << " no longer exists"
-                            << dendl;
-        continue;
-      } else if (r < 0) {
-        lderr(ictx->cct) << "snap_unprotect: error retrieving base tier for "
-                         << "pool " << it->second << dendl;
-        goto reprotect_and_return_err;
-      }
-      if (it->first != base_tier) {
-	// pool is a cache; skip it
-	continue;
-      }
-
-      IoCtx pool_ioctx;
-      r = rados.ioctx_create2(it->first, pool_ioctx);
-      if (r == -ENOENT) {
-        ldout(ictx->cct, 1) << "pool " << it->second << " no longer exists"
-                            << dendl;
-        continue;
-      } else if (r < 0) {
-        lderr(ictx->cct) << "snap_unprotect: can't create ioctx for pool "
-        		 << it->second << dendl;
-        goto reprotect_and_return_err;
-      }
-
-      std::set<std::string> children;
-      r = cls_client::get_children(&pool_ioctx, RBD_CHILDREN, pspec, children);
-      // key should not exist for this parent if there is no entry
-      if (((r < 0) && (r != -ENOENT))) {
-        lderr(ictx->cct) << "can't get children for pool " << it->second
-                         << dendl;
-        goto reprotect_and_return_err;
-      }
-      // if we found a child, can't unprotect
-      if (r == 0) {
-        lderr(ictx->cct) << "snap_unprotect: can't unprotect; at least "
-          << children.size() << " child(ren) in pool " << it->second << dendl;
-        r = -EBUSY;
-        goto reprotect_and_return_err;
-      }
-    }
-
-    // didn't find any child in any pool, go ahead with unprotect
-    r = cls_client::set_protection_status(&ictx->md_ctx,
-					  ictx->header_oid,
-					  snap_id,
-					  RBD_PROTECTION_STATUS_UNPROTECTED);
-    if (r < 0) {
-      lderr(ictx->cct) << "snap_unprotect: error setting unprotected status"
-		       << dendl;
-      goto reprotect_and_return_err;
-    }
-    notify_change(ictx->md_ctx, ictx->header_oid, ictx);
-    return 0;
-
-reprotect_and_return_err:
-    int proterr = cls_client::set_protection_status(&ictx->md_ctx,
-						    ictx->header_oid,
-						    snap_id,
-					      RBD_PROTECTION_STATUS_PROTECTED);
-    if (proterr < 0) {
-      lderr(ictx->cct) << "snap_unprotect: can't reprotect image" << dendl;
-    }
-    notify_change(ictx->md_ctx, ictx->header_oid, ictx);
-    return r;
-  }
-
-  int snap_is_protected(ImageCtx *ictx, const char *snap_name,
-			bool *is_protected)
-  {
-    ldout(ictx->cct, 20) << "snap_is_protected " << ictx << " " << snap_name
-			 << dendl;
-
-    int r = ictx_check(ictx);
+    int r = ictx->state->refresh_if_required();
     if (r < 0)
       return r;
 
@@ -1185,8 +1126,14 @@ reprotect_and_return_err:
 		uint64_t size, int order)
   {
     CephContext *cct = (CephContext *)io_ctx.cct();
+
+    int r = validate_pool(io_ctx, cct);
+    if (r < 0) {
+      return r;
+    }
+
     ldout(cct, 2) << "adding rbd image to directory..." << dendl;
-    int r = tmap_set(io_ctx, imgname);
+    r = tmap_set(io_ctx, imgname);
     if (r < 0) {
       lderr(cct) << "error adding image to directory: " << cpp_strerror(r)
 		 << dendl;
@@ -1200,7 +1147,7 @@ reprotect_and_return_err:
     bufferlist bl;
     bl.append((const char *)&header, sizeof(header));
 
-    string header_oid = old_header_name(imgname);
+    string header_oid = util::old_header_name(imgname);
     r = io_ctx.write(header_oid, bl, bl.length(), 0);
     if (r < 0) {
       lderr(cct) << "Error writing image header: " << cpp_strerror(r)
@@ -1220,7 +1167,9 @@ reprotect_and_return_err:
 
   int create_v2(IoCtx& io_ctx, const char *imgname, uint64_t bid, uint64_t size,
 		int order, uint64_t features, uint64_t stripe_unit,
-		uint64_t stripe_count)
+		uint64_t stripe_count, uint8_t journal_order,
+		uint8_t journal_splay_width,
+		const std::string &journal_pool)
   {
     ostringstream bid_ss;
     uint32_t extra;
@@ -1231,9 +1180,14 @@ reprotect_and_return_err:
 
     ceph_file_layout layout;
 
-    id_obj = id_obj_name(imgname);
+    int r = validate_pool(io_ctx, cct);
+    if (r < 0) {
+      return r;
+    }
+
+    id_obj = util::id_obj_name(imgname);
 
-    int r = io_ctx.create(id_obj, true);
+    r = io_ctx.create(id_obj, true);
     if (r < 0) {
       lderr(cct) << "error creating rbd id object: " << cpp_strerror(r)
 		 << dendl;
@@ -1258,7 +1212,7 @@ reprotect_and_return_err:
     }
 
     oss << RBD_DATA_PREFIX << id;
-    header_oid = header_name(id);
+    header_oid = util::header_name(id);
     r = cls_client::create_image(&io_ctx, header_oid, size, order,
 				 features, oss.str());
     if (r < 0) {
@@ -1314,7 +1268,8 @@ reprotect_and_return_err:
         goto err_remove_object_map;
       }
 
-      r = Journal::create(io_ctx, id);
+      r = Journal::create(io_ctx, id, journal_order, journal_splay_width,
+			  journal_pool);
       if (r < 0) {
         lderr(cct) << "error creating journal: " << cpp_strerror(r) << dendl;
         goto err_remove_object_map;
@@ -1484,8 +1439,16 @@ reprotect_and_return_err:
 
       r = create_v1(io_ctx, imgname, bid, size, order);
     } else {
+      uint64_t journal_order = cct->_conf->rbd_journal_order;
+      uint64_t journal_splay_width = cct->_conf->rbd_journal_splay_width;
+      std::string journal_pool = cct->_conf->rbd_journal_pool;
+
+      opts.get(RBD_IMAGE_OPTION_JOURNAL_ORDER, &journal_order);
+      opts.get(RBD_IMAGE_OPTION_JOURNAL_SPLAY_WIDTH, &journal_splay_width);
+      opts.get(RBD_IMAGE_OPTION_JOURNAL_POOL, &journal_pool);
+
       r = create_v2(io_ctx, imgname, bid, size, order, features, stripe_unit,
-		    stripe_count);
+		    stripe_count, journal_order, journal_splay_width, journal_pool);
     }
 
     int r1 = opts.set(RBD_IMAGE_OPTION_ORDER, order);
@@ -1498,39 +1461,46 @@ reprotect_and_return_err:
    * Parent may be in different pool, hence different IoCtx
    */
   int clone(IoCtx& p_ioctx, const char *p_name, const char *p_snap_name,
-	    IoCtx& c_ioctx, const char *c_name, ImageOptions& c_opts)
+	    IoCtx& c_ioctx, const char *c_name,
+	    uint64_t features, int *c_order,
+	    uint64_t stripe_unit, int stripe_count)
   {
-    int order = 0;
-    uint64_t features = 0;
-    uint64_t stripe_unit = 0;
-    uint64_t stripe_count = 0;
-    c_opts.get(RBD_IMAGE_OPTION_FEATURES, &features);
-    c_opts.get(RBD_IMAGE_OPTION_STRIPE_UNIT, &stripe_unit);
-    c_opts.get(RBD_IMAGE_OPTION_STRIPE_COUNT, &stripe_count);
+    uint64_t order = *c_order;
+
+    ImageOptions opts;
+    opts.set(RBD_IMAGE_OPTION_FORMAT, static_cast<uint64_t>(2));
+    opts.set(RBD_IMAGE_OPTION_FEATURES, features);
+    opts.set(RBD_IMAGE_OPTION_ORDER, order);
+    opts.set(RBD_IMAGE_OPTION_STRIPE_UNIT, stripe_unit);
+    opts.set(RBD_IMAGE_OPTION_STRIPE_COUNT, stripe_count);
 
-    int r = clone(p_ioctx, p_name, p_snap_name, c_ioctx, c_name, features,
-		  &order, stripe_unit, stripe_count);
-    c_opts.set(RBD_IMAGE_OPTION_ORDER, static_cast<uint64_t>(order));
+    int r = clone(p_ioctx, p_name, p_snap_name, c_ioctx, c_name, opts);
+    opts.get(RBD_IMAGE_OPTION_ORDER, &order);
+    *c_order = order;
     return r;
   }
 
   int clone(IoCtx& p_ioctx, const char *p_name, const char *p_snap_name,
-	    IoCtx& c_ioctx, const char *c_name,
-	    uint64_t features, int *c_order,
-	    uint64_t stripe_unit, int stripe_count)
+	    IoCtx& c_ioctx, const char *c_name, ImageOptions& c_opts)
   {
     CephContext *cct = (CephContext *)p_ioctx.cct();
     ldout(cct, 20) << "clone " << &p_ioctx << " name " << p_name << " snap "
 		   << p_snap_name << "to child " << &c_ioctx << " name "
-		   << c_name << " features = " << features << " order = "
-		   << *c_order
-		   << " stripe_unit = " << stripe_unit
-		   << " stripe_count = " << stripe_count
-		   << dendl;
+		   << c_name << " opts = " << c_opts << dendl;
 
-    if (features & ~RBD_FEATURES_ALL) {
-      lderr(cct) << "librbd does not support requested features" << dendl;
-      return -ENOSYS;
+    uint64_t format = cct->_conf->rbd_default_format;
+    c_opts.get(RBD_IMAGE_OPTION_FORMAT, &format);
+    if (format < 2) {
+      lderr(cct) << "format 2 or later required for clone" << dendl;
+      return -EINVAL;
+    }
+
+    uint64_t features;
+    if (c_opts.get(RBD_IMAGE_OPTION_FEATURES, &features) != 0) {
+      if (features & ~RBD_FEATURES_ALL) {
+	lderr(cct) << "librbd does not support requested features" << dendl;
+	return -ENOSYS;
+      }
     }
 
     // make sure child doesn't already exist, in either format
@@ -1546,7 +1516,7 @@ reprotect_and_return_err:
     }
 
     bool snap_protected;
-    int order;
+    uint64_t order;
     uint64_t size;
     uint64_t p_features;
     int partial_r;
@@ -1555,7 +1525,7 @@ reprotect_and_return_err:
     map<string, bufferlist> pairs;
     // make sure parent snapshot exists
     ImageCtx *p_imctx = new ImageCtx(p_name, "", p_snap_name, p_ioctx, true);
-    r = open_image(p_imctx);
+    r = p_imctx->state->open();
     if (r < 0) {
       lderr(cct) << "error opening parent image: "
 		 << cpp_strerror(-r) << dendl;
@@ -1589,19 +1559,19 @@ reprotect_and_return_err:
       goto err_close_parent;
     }
 
-    order = *c_order;
-    if (!order)
-      order = p_imctx->order;
+    order = p_imctx->order;
+    if (c_opts.get(RBD_IMAGE_OPTION_ORDER, &order) != 0) {
+      c_opts.set(RBD_IMAGE_OPTION_ORDER, order);
+    }
 
-    r = create(c_ioctx, c_name, size, false, features, &order,
-	       stripe_unit, stripe_count);
+    r = create(c_ioctx, c_name, size, c_opts);
     if (r < 0) {
       lderr(cct) << "error creating child: " << cpp_strerror(r) << dendl;
       goto err_close_parent;
     }
 
     c_imctx = new ImageCtx(c_name, "", NULL, c_ioctx, false);
-    r = open_image(c_imctx);
+    r = c_imctx->state->open();
     if (r < 0) {
       lderr(cct) << "Error opening new image: " << cpp_strerror(r) << dendl;
       goto err_remove;
@@ -1609,32 +1579,17 @@ reprotect_and_return_err:
 
     r = cls_client::set_parent(&c_ioctx, c_imctx->header_oid, pspec, size);
     if (r < 0) {
-      lderr(cct) << "couldn't set parent: " << r << dendl;
+      lderr(cct) << "couldn't set parent: " << cpp_strerror(r) << dendl;
       goto err_close_child;
     }
 
     r = cls_client::add_child(&c_ioctx, RBD_CHILDREN, pspec, c_imctx->id);
     if (r < 0) {
-      lderr(cct) << "couldn't add child: " << r << dendl;
-      goto err_close_child;
-    }
-
-    r = cls_client::metadata_list(&p_ioctx, p_imctx->header_oid, "", 0, &pairs);
-    if (r < 0 && r != -EOPNOTSUPP && r != -EIO) {
-      lderr(cct) << "couldn't list metadata: " << r << dendl;
+      lderr(cct) << "couldn't add child: " << cpp_strerror(r) << dendl;
       goto err_close_child;
-    } else if (r == 0 && !pairs.empty()) {
-      r = cls_client::metadata_set(&c_ioctx, c_imctx->header_oid, pairs);
-      if (r < 0) {
-        lderr(cct) << "couldn't set metadata: " << r << dendl;
-        goto err_close_child;
-      }
     }
 
-    {
-      RWLock::RLocker owner_locker(p_imctx->owner_lock);
-      r = ictx_refresh(p_imctx);
-    }
+    r = p_imctx->state->refresh();
     if (r == 0) {
       p_imctx->snap_lock.get_read();
       r = p_imctx->is_snap_protected(p_imctx->snap_id, &snap_protected);
@@ -1646,9 +1601,22 @@ reprotect_and_return_err:
       goto err_remove_child;
     }
 
+    r = cls_client::metadata_list(&p_ioctx, p_imctx->header_oid, "", 0, &pairs);
+    if (r < 0 && r != -EOPNOTSUPP && r != -EIO) {
+      lderr(cct) << "couldn't list metadata: " << r << dendl;
+      goto err_remove_child;
+    } else if (r == 0 && !pairs.empty()) {
+      r = cls_client::metadata_set(&c_ioctx, c_imctx->header_oid, pairs);
+      if (r < 0) {
+        lderr(cct) << "couldn't set metadata: " << cpp_strerror(r) << dendl;
+        goto err_remove_child;
+      }
+    }
+
     ldout(cct, 2) << "done." << dendl;
-    r = close_image(c_imctx);
-    partial_r = close_image(p_imctx);
+    r = c_imctx->state->close();
+    partial_r = p_imctx->state->close();
+
     if (r == 0 && partial_r < 0) {
       r = partial_r;
     }
@@ -1662,7 +1630,7 @@ reprotect_and_return_err:
                 << cpp_strerror(partial_r) << dendl;
     }
   err_close_child:
-    close_image(c_imctx);
+    c_imctx->state->close();
   err_remove:
     partial_r = remove(c_ioctx, c_name, no_op);
     if (partial_r < 0) {
@@ -1670,7 +1638,7 @@ reprotect_and_return_err:
 		 << cpp_strerror(partial_r) << dendl;
     }
   err_close_parent:
-    close_image(p_imctx);
+    p_imctx->state->close();
     return r;
   }
 
@@ -1680,13 +1648,16 @@ reprotect_and_return_err:
     ldout(cct, 20) << "rename " << &io_ctx << " " << srcname << " -> "
 		   << dstname << dendl;
 
-    bool old_format;
-    uint64_t src_size;
-    int r = detect_format(io_ctx, srcname, &old_format, &src_size);
+    ImageCtx *ictx = new ImageCtx(srcname, "", "", io_ctx, false);
+    int r = ictx->state->open();
     if (r < 0) {
-      lderr(cct) << "error finding source object: " << cpp_strerror(r) << dendl;
+      lderr(ictx->cct) << "error opening source image: " << cpp_strerror(r)
+		       << dendl;
       return r;
     }
+    BOOST_SCOPE_EXIT((ictx)) {
+      ictx->state->close();
+    } BOOST_SCOPE_EXIT_END
 
     r = detect_format(io_ctx, dstname, NULL, NULL);
     if (r < 0 && r != -ENOENT) {
@@ -1699,97 +1670,53 @@ reprotect_and_return_err:
       return -EEXIST;
     }
 
-    string src_oid =
-      old_format ? old_header_name(srcname) : id_obj_name(srcname);
-    string dst_oid =
-      old_format ? old_header_name(dstname) : id_obj_name(dstname);
-
-    string id;
-    if (!old_format) {
-      r = cls_client::get_id(&io_ctx, src_oid, &id);
-      if (r < 0) {
-	lderr(cct) << "error reading image id: " << cpp_strerror(r) << dendl;
-	return r;
+    if (ictx->test_features(RBD_FEATURE_JOURNALING)) {
+      r = invoke_async_request(ictx, "rename", true,
+                               boost::bind(&rename_helper, ictx, _1,
+                                           dstname),
+                               boost::bind(&ImageWatcher::notify_rename,
+                                           ictx->image_watcher, dstname));
+      if (r < 0 && r != -EEXIST) {
+        return r;
       }
-    }
-
-    bufferlist databl;
-    map<string, bufferlist> omap_values;
-    r = io_ctx.read(src_oid, databl, src_size, 0);
-    if (r < 0) {
-      lderr(cct) << "error reading source object: " << src_oid << ": "
-		 << cpp_strerror(r) << dendl;
-      return r;
-    }
+    } else {
+      RWLock::RLocker owner_lock(ictx->owner_lock);
+      C_SaferCond cond_ctx;
+      rename_helper(ictx, &cond_ctx, dstname);
 
-    int MAX_READ = 1024;
-    string last_read = "";
-    do {
-      map<string, bufferlist> outbl;
-      r = io_ctx.omap_get_vals(src_oid, last_read, MAX_READ, &outbl);
+      r = cond_ctx.wait();
       if (r < 0) {
-	lderr(cct) << "error reading source object omap values: "
-		   << cpp_strerror(r) << dendl;
-	return r;
+        return r;
       }
-      omap_values.insert(outbl.begin(), outbl.end());
-      if (!outbl.empty())
-	last_read = outbl.rbegin()->first;
-    } while (r == MAX_READ);
-
-    librados::ObjectWriteOperation op;
-    op.create(true);
-    op.write_full(databl);
-    if (!omap_values.empty())
-      op.omap_set(omap_values);
-    r = io_ctx.operate(dst_oid, &op);
-    if (r < 0) {
-      lderr(cct) << "error writing destination object: " << dst_oid << ": "
-		 << cpp_strerror(r) << dendl;
-      return r;
     }
 
-    if (old_format) {
-      r = tmap_set(io_ctx, dstname);
-      if (r < 0) {
-	io_ctx.remove(dst_oid);
-	lderr(cct) << "couldn't add " << dstname << " to directory: "
-		   << cpp_strerror(r) << dendl;
-	return r;
-      }
-      r = tmap_rm(io_ctx, srcname);
-      if (r < 0) {
-	lderr(cct) << "warning: couldn't remove old entry from directory ("
-		   << srcname << ")" << dendl;
-      }
-    } else {
-      r = cls_client::dir_rename_image(&io_ctx, RBD_DIRECTORY,
-				       srcname, dstname, id);
-      if (r < 0) {
-	lderr(cct) << "error updating directory: " << cpp_strerror(r) << dendl;
-	return r;
-      }
+    if (ictx->old_format) {
+      notify_change(ictx->md_ctx, ictx->header_oid, ictx);
     }
+    return 0;
+  }
 
-    r = io_ctx.remove(src_oid);
-    if (r < 0 && r != -ENOENT) {
-      lderr(cct) << "warning: couldn't remove old source object ("
-		 << src_oid << ")" << dendl;
+  void rename_helper(ImageCtx *ictx, Context *ctx, const char *dstname)
+  {
+    assert(ictx->owner_lock.is_locked());
+    if (ictx->test_features(RBD_FEATURE_JOURNALING)) {
+      assert(ictx->exclusive_lock == nullptr ||
+	     ictx->exclusive_lock->is_lock_owner());
     }
 
-    if (old_format) {
-      notify_change(io_ctx, old_header_name(srcname), NULL);
-    }
+    ldout(ictx->cct, 20) << "rename_helper " << ictx << " " << dstname
+                         << dendl;
 
-    return 0;
+    operation::RenameRequest<> *req =
+      new operation::RenameRequest<>(*ictx, ctx, dstname);
+    req->send();
   }
 
-
   int info(ImageCtx *ictx, image_info_t& info, size_t infosize)
   {
     ldout(ictx->cct, 20) << "info " << ictx << dendl;
 
-    int r = ictx_check(ictx);
+    int r = ictx->state->refresh_if_required();
     if (r < 0)
       return r;
 
@@ -1799,7 +1726,7 @@ reprotect_and_return_err:
 
   int get_old_format(ImageCtx *ictx, uint8_t *old)
   {
-    int r = ictx_check(ictx);
+    int r = ictx->state->refresh_if_required();
     if (r < 0)
       return r;
     *old = ictx->old_format;
@@ -1808,7 +1735,7 @@ reprotect_and_return_err:
 
   int get_size(ImageCtx *ictx, uint64_t *size)
   {
-    int r = ictx_check(ictx);
+    int r = ictx->state->refresh_if_required();
     if (r < 0)
       return r;
     RWLock::RLocker l2(ictx->snap_lock);
@@ -1818,7 +1745,7 @@ reprotect_and_return_err:
 
   int get_features(ImageCtx *ictx, uint64_t *features)
   {
-    int r = ictx_check(ictx);
+    int r = ictx->state->refresh_if_required();
     if (r < 0)
       return r;
     RWLock::RLocker l(ictx->snap_lock);
@@ -1828,7 +1755,7 @@ reprotect_and_return_err:
 
   int update_features(ImageCtx *ictx, uint64_t features, bool enabled)
   {
-    int r = ictx_check(ictx);
+    int r = ictx->state->refresh_if_required();
     if (r < 0) {
       return r;
     }
@@ -1841,144 +1768,148 @@ reprotect_and_return_err:
       return -EINVAL;
     }
 
-    RWLock::RLocker owner_locker(ictx->owner_lock);
-    RWLock::WLocker md_locker(ictx->md_lock);
-    r = ictx->flush();
-    if (r < 0) {
-      return r;
-    }
+    {
+      RWLock::RLocker owner_locker(ictx->owner_lock);
+      RWLock::WLocker md_locker(ictx->md_lock);
+      r = ictx->flush();
+      if (r < 0) {
+        return r;
+      }
 
-    if ((features & RBD_FEATURES_MUTABLE) != features) {
-      lderr(cct) << "cannot update immutable features" << dendl;
-      return -EINVAL;
-    } else if (features == 0) {
-      lderr(cct) << "update requires at least one feature" << dendl;
-      return -EINVAL;
-    }
+      if ((features & RBD_FEATURES_MUTABLE) != features) {
+        lderr(cct) << "cannot update immutable features" << dendl;
+        return -EINVAL;
+      } else if (features == 0) {
+        lderr(cct) << "update requires at least one feature" << dendl;
+        return -EINVAL;
+      }
 
-    RWLock::WLocker snap_locker(ictx->snap_lock);
-    uint64_t new_features;
-    if (enabled) {
-      features &= ~ictx->features;
-      new_features = ictx->features | features;
-    } else {
-      features &= ictx->features;
-      new_features = ictx->features & ~features;
-    }
+      RWLock::WLocker snap_locker(ictx->snap_lock);
+      uint64_t new_features;
+      if (enabled) {
+        features &= ~ictx->features;
+        new_features = ictx->features | features;
+      } else {
+        features &= ictx->features;
+        new_features = ictx->features & ~features;
+      }
 
-    if (features == 0) {
-      return 0;
-    }
+      if (features == 0) {
+        return 0;
+      }
 
-    uint64_t features_mask = features;
-    uint64_t disable_flags = 0;
-    if (enabled) {
-      uint64_t enable_flags = 0;
+      uint64_t features_mask = features;
+      uint64_t disable_flags = 0;
+      if (enabled) {
+        uint64_t enable_flags = 0;
 
-      if ((features & RBD_FEATURE_OBJECT_MAP) != 0) {
-        if ((new_features & RBD_FEATURE_EXCLUSIVE_LOCK) == 0) {
-          lderr(cct) << "cannot enable object map" << dendl;
-          return -EINVAL;
+        if ((features & RBD_FEATURE_OBJECT_MAP) != 0) {
+          if ((new_features & RBD_FEATURE_EXCLUSIVE_LOCK) == 0) {
+            lderr(cct) << "cannot enable object map" << dendl;
+            return -EINVAL;
+          }
+          enable_flags |= RBD_FLAG_OBJECT_MAP_INVALID;
+          features_mask |= RBD_FEATURE_EXCLUSIVE_LOCK;
         }
-        enable_flags |= RBD_FLAG_OBJECT_MAP_INVALID;
-        features_mask |= RBD_FEATURE_EXCLUSIVE_LOCK;
-      }
-      if ((features & RBD_FEATURE_FAST_DIFF) != 0) {
-        if ((new_features & RBD_FEATURE_OBJECT_MAP) == 0) {
-          lderr(cct) << "cannot enable fast diff" << dendl;
-          return -EINVAL;
+        if ((features & RBD_FEATURE_FAST_DIFF) != 0) {
+          if ((new_features & RBD_FEATURE_OBJECT_MAP) == 0) {
+            lderr(cct) << "cannot enable fast diff" << dendl;
+            return -EINVAL;
+          }
+          enable_flags |= RBD_FLAG_FAST_DIFF_INVALID;
+          features_mask |= (RBD_FEATURE_OBJECT_MAP | RBD_FEATURE_EXCLUSIVE_LOCK);
         }
-        enable_flags |= RBD_FLAG_FAST_DIFF_INVALID;
-        features_mask |= (RBD_FEATURE_OBJECT_MAP | RBD_FEATURE_EXCLUSIVE_LOCK);
-      }
-      if ((features & RBD_FEATURE_JOURNALING) != 0) {
-        if ((new_features & RBD_FEATURE_EXCLUSIVE_LOCK) == 0) {
-          lderr(cct) << "cannot enable journaling" << dendl;
-          return -EINVAL;
+        if ((features & RBD_FEATURE_JOURNALING) != 0) {
+          if ((new_features & RBD_FEATURE_EXCLUSIVE_LOCK) == 0) {
+            lderr(cct) << "cannot enable journaling" << dendl;
+            return -EINVAL;
+          }
+          features_mask |= RBD_FEATURE_EXCLUSIVE_LOCK;
+
+          r = Journal::create(ictx->md_ctx, ictx->id, ictx->journal_order,
+  			    ictx->journal_splay_width,
+  			    ictx->journal_pool);
+          if (r < 0) {
+            lderr(cct) << "error creating image journal: " << cpp_strerror(r)
+                       << dendl;
+            return r;
+          }
         }
-        features_mask |= RBD_FEATURE_EXCLUSIVE_LOCK;
 
-        r = Journal::create(ictx->md_ctx, ictx->id);
-        if (r < 0) {
-          lderr(cct) << "error creating image journal: " << cpp_strerror(r)
-                     << dendl;
-          return r;
+        if (enable_flags != 0) {
+          r = update_all_flags(ictx, enable_flags, enable_flags);
+          if (r < 0) {
+            return r;
+          }
         }
-      }
+      } else {
+        if ((features & RBD_FEATURE_EXCLUSIVE_LOCK) != 0) {
+          if ((new_features & RBD_FEATURE_OBJECT_MAP) != 0 ||
+              (new_features & RBD_FEATURE_JOURNALING) != 0) {
+            lderr(cct) << "cannot disable exclusive lock" << dendl;
+            return -EINVAL;
+          }
+          features_mask |= RBD_FEATURE_OBJECT_MAP;
+        }
+        if ((features & RBD_FEATURE_OBJECT_MAP) != 0) {
+          if ((new_features & RBD_FEATURE_FAST_DIFF) != 0) {
+            lderr(cct) << "cannot disable object map" << dendl;
+            return -EINVAL;
+          }
 
-      if (enable_flags != 0) {
-        r = update_all_flags(ictx, enable_flags, enable_flags);
-        if (r < 0) {
-          return r;
+          disable_flags = RBD_FLAG_OBJECT_MAP_INVALID;
+          r = remove_object_map(ictx);
+          if (r < 0) {
+            lderr(cct) << "failed to remove object map" << dendl;
+            return r;
+          }
         }
-      }
-    } else {
-      if ((features & RBD_FEATURE_EXCLUSIVE_LOCK) != 0) {
-        if ((new_features & RBD_FEATURE_OBJECT_MAP) != 0 ||
-            (new_features & RBD_FEATURE_JOURNALING) != 0) {
-          lderr(cct) << "cannot disable exclusive lock" << dendl;
-          return -EINVAL;
+        if ((features & RBD_FEATURE_FAST_DIFF) != 0) {
+          disable_flags = RBD_FLAG_FAST_DIFF_INVALID;
         }
-        features_mask |= RBD_FEATURE_OBJECT_MAP;
-      }
-      if ((features & RBD_FEATURE_OBJECT_MAP) != 0) {
-        if ((new_features & RBD_FEATURE_FAST_DIFF) != 0) {
-          lderr(cct) << "cannot disable object map" << dendl;
-          return -EINVAL;
+        if ((features & RBD_FEATURE_JOURNALING) != 0) {
+          r = Journal::remove(ictx->md_ctx, ictx->id);
+          if (r < 0) {
+            lderr(cct) << "error removing image journal: " << cpp_strerror(r)
+                       << dendl;
+            return r;
+          }
         }
+      }
 
-        disable_flags = RBD_FLAG_OBJECT_MAP_INVALID;
-        r = remove_object_map(ictx);
+      ldout(cct, 10) << "update_features: features=" << new_features << ", "
+                     << "mask=" << features_mask << dendl;
+      r = librbd::cls_client::set_features(&ictx->md_ctx, ictx->header_oid,
+                                           new_features, features_mask);
+      if (r < 0) {
+        lderr(cct) << "failed to update features: " << cpp_strerror(r)
+                   << dendl;
+        return r;
+      }
+      if (((ictx->features & RBD_FEATURE_OBJECT_MAP) == 0) &&
+        ((features & RBD_FEATURE_OBJECT_MAP) != 0)) {
+        r = create_object_map(ictx);
         if (r < 0) {
-          lderr(cct) << "failed to remove object map" << dendl;
+          lderr(cct) << "failed to create object map" << dendl;
           return r;
         }
       }
-      if ((features & RBD_FEATURE_FAST_DIFF) != 0) {
-        disable_flags = RBD_FLAG_FAST_DIFF_INVALID;
-      }
-      if ((features & RBD_FEATURE_JOURNALING) != 0) {
-        r = Journal::remove(ictx->md_ctx, ictx->id);
+
+      if (disable_flags != 0) {
+        r = update_all_flags(ictx, 0, disable_flags);
         if (r < 0) {
-          lderr(cct) << "error removing image journal: " << cpp_strerror(r)
-                     << dendl;
           return r;
         }
       }
     }
 
-    ldout(cct, 10) << "update_features: features=" << new_features << ", mask="
-                   << features_mask << dendl;
-    r = librbd::cls_client::set_features(&ictx->md_ctx, ictx->header_oid,
-                                         new_features, features_mask);
-    if (r < 0) {
-      lderr(cct) << "failed to update features: " << cpp_strerror(r)
-                 << dendl;
-      return r;
-    }
-    if (((ictx->features & RBD_FEATURE_OBJECT_MAP) == 0) &&
-      ((features & RBD_FEATURE_OBJECT_MAP) != 0)) {
-      r = create_object_map(ictx);
-      if (r < 0) {
-        lderr(cct) << "failed to create object map" << dendl;
-        return r;
-      }
-    }
-
-    if (disable_flags != 0) {
-      r = update_all_flags(ictx, 0, disable_flags);
-      if (r < 0) {
-        return r;
-      }
-    }
-
     notify_change(ictx->md_ctx, ictx->header_oid, ictx);
     return 0;
   }
 
   int get_overlap(ImageCtx *ictx, uint64_t *overlap)
   {
-    int r = ictx_check(ictx);
+    int r = ictx->state->refresh_if_required();
     if (r < 0)
       return r;
     RWLock::RLocker l(ictx->snap_lock);
@@ -1986,89 +1917,10 @@ reprotect_and_return_err:
     return ictx->get_parent_overlap(ictx->snap_id, overlap);
   }
 
-  int open_parent(ImageCtx *ictx)
-  {
-    assert(ictx->cache_lock.is_locked());
-    assert(ictx->snap_lock.is_wlocked());
-    assert(ictx->parent_lock.is_wlocked());
-
-    string pool_name;
-    Rados rados(ictx->md_ctx);
-
-    int64_t pool_id = ictx->get_parent_pool_id(ictx->snap_id);
-    string parent_image_id = ictx->get_parent_image_id(ictx->snap_id);
-    snap_t parent_snap_id = ictx->get_parent_snap_id(ictx->snap_id);
-    assert(parent_snap_id != CEPH_NOSNAP);
-
-    if (pool_id < 0)
-      return -ENOENT;
-    int r = rados.pool_reverse_lookup(pool_id, &pool_name);
-    if (r < 0) {
-      lderr(ictx->cct) << "error looking up name for pool id " << pool_id
-		       << ": " << cpp_strerror(r) << dendl;
-      return r;
-    }
-
-    IoCtx p_ioctx;
-    r = rados.ioctx_create(pool_name.c_str(), p_ioctx);
-    if (r < 0) {
-      lderr(ictx->cct) << "error opening pool " << pool_name << ": "
-		       << cpp_strerror(r) << dendl;
-      return r;
-    }
-
-    // since we don't know the image and snapshot name, set their ids and
-    // reset the snap_name and snap_exists fields after we read the header
-    ictx->parent = new ImageCtx("", parent_image_id, NULL, p_ioctx, true);
-
-    // set rados flags for reading the parent image
-    if (ictx->balance_parent_reads)
-      ictx->parent->set_read_flag(librados::OPERATION_BALANCE_READS);
-    else if (ictx->localize_parent_reads)
-      ictx->parent->set_read_flag(librados::OPERATION_LOCALIZE_READS);
-
-    r = open_image(ictx->parent);
-    if (r < 0) {
-      lderr(ictx->cct) << "error opening parent image: " << cpp_strerror(r)
-		       << dendl;
-      ictx->parent = NULL;
-      return r;
-    }
-
-    ictx->parent->cache_lock.Lock();
-    ictx->parent->snap_lock.get_write();
-    r = ictx->parent->get_snap_name(parent_snap_id, &ictx->parent->snap_name);
-    if (r < 0) {
-      lderr(ictx->cct) << "parent snapshot does not exist" << dendl;
-      ictx->parent->snap_lock.put_write();
-      ictx->parent->cache_lock.Unlock();
-      close_parent(ictx);
-      return r;
-    }
-    ictx->parent->snap_set(ictx->parent->snap_name);
-    ictx->parent->parent_lock.get_write();
-    r = refresh_parent(ictx->parent);
-    if (r < 0) {
-      lderr(ictx->cct) << "error refreshing parent snapshot "
-		       << ictx->parent->id << " "
-		       << ictx->parent->snap_name << dendl;
-      ictx->parent->parent_lock.put_write();
-      ictx->parent->snap_lock.put_write();
-      ictx->parent->cache_lock.Unlock();
-      close_parent(ictx);
-      return r;
-    }
-    ictx->parent->parent_lock.put_write();
-    ictx->parent->snap_lock.put_write();
-    ictx->parent->cache_lock.Unlock();
-
-    return 0;
-  }
-
   int get_parent_info(ImageCtx *ictx, string *parent_pool_name,
 		      string *parent_name, string *parent_snap_name)
   {
-    int r = ictx_check(ictx);
+    int r = ictx->state->refresh_if_required();
     if (r < 0)
       return r;
 
@@ -2130,7 +1982,7 @@ reprotect_and_return_err:
 
   int get_flags(ImageCtx *ictx, uint64_t *flags)
   {
-    int r = ictx_check(ictx);
+    int r = ictx->state->refresh_if_required();
     if (r < 0) {
       return r;
     }
@@ -2139,11 +1991,26 @@ reprotect_and_return_err:
     return ictx->get_flags(ictx->snap_id, flags);
   }
 
+  int set_image_notification(ImageCtx *ictx, int fd, int type)
+  {
+    CephContext *cct = ictx->cct;
+    ldout(cct, 20) << __func__ << " " << ictx << " fd " << fd << " type" << type << dendl;
+
+    int r = ictx->state->refresh_if_required();
+    if (r < 0) {
+      return r;
+    }
+
+    if (ictx->event_socket.is_valid())
+      return -EINVAL;
+    return ictx->event_socket.init(fd, type);
+  }
+
   int is_exclusive_lock_owner(ImageCtx *ictx, bool *is_owner)
   {
     RWLock::RLocker l(ictx->owner_lock);
-    *is_owner = (ictx->image_watcher != NULL &&
-		 ictx->image_watcher->is_lock_owner());
+    *is_owner = (ictx->exclusive_lock != nullptr &&
+		 ictx->exclusive_lock->is_lock_owner());
     return 0;
   }
 
@@ -2156,7 +2023,7 @@ reprotect_and_return_err:
     bool old_format = false;
     bool unknown_format = true;
     ImageCtx *ictx = new ImageCtx(imgname, "", NULL, io_ctx, false);
-    int r = open_image(ictx);
+    int r = ictx->state->open();
     if (r < 0) {
       ldout(cct, 2) << "error opening image: " << cpp_strerror(-r) << dendl;
     } else {
@@ -2166,12 +2033,12 @@ reprotect_and_return_err:
       id = ictx->id;
 
       ictx->owner_lock.get_read();
-      if (ictx->image_watcher->is_lock_supported()) {
+      if (ictx->exclusive_lock != nullptr) {
         r = prepare_image_update(ictx);
-        if (r < 0 || !ictx->image_watcher->is_lock_owner()) {
+        if (r < 0 || !ictx->exclusive_lock->is_lock_owner()) {
 	  lderr(cct) << "cannot obtain exclusive lock - not removing" << dendl;
 	  ictx->owner_lock.put_read();
-	  close_image(ictx);
+	  ictx->state->close();
           return -EBUSY;
         }
       }
@@ -2179,7 +2046,7 @@ reprotect_and_return_err:
       if (ictx->snaps.size()) {
 	lderr(cct) << "image has snapshots - not removing" << dendl;
 	ictx->owner_lock.put_read();
-	close_image(ictx);
+	ictx->state->close();
 	return -ENOTEMPTY;
       }
 
@@ -2188,13 +2055,13 @@ reprotect_and_return_err:
       if (r < 0) {
         lderr(cct) << "error listing watchers" << dendl;
 	ictx->owner_lock.put_read();
-        close_image(ictx);
+        ictx->state->close();
         return r;
       }
       if (watchers.size() > 1) {
         lderr(cct) << "image has watchers - not removing" << dendl;
 	ictx->owner_lock.put_read();
-        close_image(ictx);
+        ictx->state->close();
         return -EBUSY;
       }
 
@@ -2210,12 +2077,12 @@ reprotect_and_return_err:
       if (r < 0 && r != -ENOENT) {
 	lderr(cct) << "error removing child from children list" << dendl;
 	ictx->owner_lock.put_read();
-        close_image(ictx);
+        ictx->state->close();
 	return r;
       }
 
       ictx->owner_lock.put_read();
-      close_image(ictx);
+      ictx->state->close();
 
       ldout(cct, 2) << "removing header..." << dendl;
       r = io_ctx.remove(header_oid);
@@ -2230,8 +2097,10 @@ reprotect_and_return_err:
       r = tmap_rm(io_ctx, imgname);
       old_format = (r == 0);
       if (r < 0 && !unknown_format) {
-	lderr(cct) << "error removing img from old-style directory: "
-		   << cpp_strerror(-r) << dendl;
+        if (r != -ENOENT) {
+	  lderr(cct) << "error removing img from old-style directory: "
+		     << cpp_strerror(-r) << dendl;
+        }
 	return r;
       }
     }
@@ -2249,7 +2118,7 @@ reprotect_and_return_err:
       }
 
       ldout(cct, 2) << "removing id object..." << dendl;
-      r = io_ctx.remove(id_obj_name(imgname));
+      r = io_ctx.remove(util::id_obj_name(imgname));
       if (r < 0 && r != -ENOENT) {
 	lderr(cct) << "error removing id object: " << cpp_strerror(r) << dendl;
 	return r;
@@ -2264,11 +2133,13 @@ reprotect_and_return_err:
       ldout(cct, 2) << "removing rbd image from directory..." << dendl;
       r = cls_client::dir_remove_image(&io_ctx, RBD_DIRECTORY, imgname, id);
       if (r < 0) {
-	lderr(cct) << "error removing img from new-style directory: "
-		   << cpp_strerror(-r) << dendl;
+        if (r != -ENOENT) {
+	  lderr(cct) << "error removing img from new-style directory: "
+		     << cpp_strerror(-r) << dendl;
+        }
 	return r;
       }
-    } 
+    }
 
     ldout(cct, 2) << "done." << dendl;
     return 0;
@@ -2283,7 +2154,7 @@ reprotect_and_return_err:
 		   << size << dendl;
     ictx->snap_lock.put_read();
 
-    int r = ictx_check(ictx);
+    int r = ictx->state->refresh_if_required();
     if (r < 0) {
       return r;
     }
@@ -2302,12 +2173,12 @@ reprotect_and_return_err:
     return r;
   }
 
-  int async_resize(ImageCtx *ictx, Context *ctx, uint64_t size,
-		   ProgressContext &prog_ctx)
+  void async_resize(ImageCtx *ictx, Context *ctx, uint64_t size,
+                    ProgressContext &prog_ctx)
   {
     assert(ictx->owner_lock.is_locked());
-    assert(!ictx->image_watcher->is_lock_supported() ||
-	   ictx->image_watcher->is_lock_owner());
+    assert(ictx->exclusive_lock == nullptr ||
+	   ictx->exclusive_lock->is_lock_owner());
 
     CephContext *cct = ictx->cct;
     ictx->snap_lock.get_read();
@@ -2315,28 +2186,16 @@ reprotect_and_return_err:
 		   << size << dendl;
     ictx->snap_lock.put_read();
 
-    int r = ictx_check(ictx, ictx->owner_lock);
-    if (r < 0) {
-      return r;
-    }
-
     {
       RWLock::RLocker snap_locker(ictx->snap_lock);
       if (ictx->snap_id != CEPH_NOSNAP || ictx->read_only) {
-        return -EROFS;
+        ctx->complete(-EROFS);
+        return;
       }
     }
 
-    async_resize_helper(ictx, ctx, size, prog_ctx);
-    return 0;
-  }
-
-  void async_resize_helper(ImageCtx *ictx, Context *ctx, uint64_t new_size,
-                           ProgressContext& prog_ctx)
-  {
-    assert(ictx->owner_lock.is_locked());
-    AsyncResizeRequest *req = new AsyncResizeRequest(*ictx, ctx, new_size,
-                                                     prog_ctx);
+    operation::ResizeRequest<> *req = new operation::ResizeRequest<>(
+      *ictx, ctx, size, prog_ctx);
     req->send();
   }
 
@@ -2344,7 +2203,7 @@ reprotect_and_return_err:
   {
     ldout(ictx->cct, 20) << "snap_list " << ictx << dendl;
 
-    int r = ictx_check(ictx);
+    int r = ictx->state->refresh_if_required();
     if (r < 0)
       return r;
 
@@ -2365,7 +2224,7 @@ reprotect_and_return_err:
   {
     ldout(ictx->cct, 20) << "snap_exists " << ictx << " " << snap_name << dendl;
 
-    int r = ictx_check(ictx);
+    int r = ictx->state->refresh_if_required();
     if (r < 0)
       return r;
 
@@ -2373,417 +2232,6 @@ reprotect_and_return_err:
     return ictx->get_snap_id(snap_name) != CEPH_NOSNAP;
   }
 
-
-  int add_snap(ImageCtx *ictx, const char *snap_name)
-  {
-    assert(ictx->owner_lock.is_locked());
-    assert(ictx->md_lock.is_wlocked());
-
-    bool lock_owner = ictx->image_watcher->is_lock_owner();
-    if (ictx->image_watcher->is_lock_supported()) {
-      assert(lock_owner);
-    }
-
-    uint64_t snap_id;
-    int r = ictx->md_ctx.selfmanaged_snap_create(&snap_id);
-    if (r < 0) {
-      lderr(ictx->cct) << "failed to create snap id: " << cpp_strerror(-r)
-		       << dendl;
-      return r;
-    }
-
-    if (ictx->old_format) {
-      r = cls_client::old_snapshot_add(&ictx->md_ctx, ictx->header_oid,
-				       snap_id, snap_name);
-    } else {
-      librados::ObjectWriteOperation op;
-      if (lock_owner) {
-	ictx->image_watcher->assert_header_locked(&op);
-      }
-      cls_client::snapshot_add(&op, snap_id, snap_name);
-      r = ictx->md_ctx.operate(ictx->header_oid, &op);
-    }
-
-    if (r < 0) {
-      lderr(ictx->cct) << "adding snapshot to header failed: "
-		       << cpp_strerror(r) << dendl;
-      ictx->data_ctx.selfmanaged_snap_remove(snap_id);
-      return r;
-    }
-
-    RWLock::WLocker l(ictx->snap_lock);
-    if (!ictx->old_format) {
-      ictx->object_map.snapshot_add(snap_id);
-      if (lock_owner) {
-	// immediately start using the new snap context if we
-	// own the exclusive lock
-	std::vector<snapid_t> snaps;
-	snaps.push_back(snap_id);
-	snaps.insert(snaps.end(), ictx->snapc.snaps.begin(),
-		     ictx->snapc.snaps.end());
-
-	ictx->snapc.seq = snap_id;
-	ictx->snapc.snaps.swap(snaps);
-	ictx->data_ctx.selfmanaged_snap_set_write_ctx(ictx->snapc.seq,
-						      ictx->snaps);
-      }
-    }
-    return 0;
-  }
-
-  int rm_snap(ImageCtx *ictx, const char *snap_name, uint64_t snap_id)
-  {
-    assert(ictx->snap_lock.is_wlocked());
-
-    int r;
-    if (ictx->old_format) {
-      r = cls_client::old_snapshot_remove(&ictx->md_ctx,
-					  ictx->header_oid, snap_name);
-    } else {
-      r = cls_client::snapshot_remove(&ictx->md_ctx, ictx->header_oid, snap_id);
-      if (r == 0) {
-        ictx->rm_snap(snap_name, snap_id);
-      }
-    }
-
-    if (r < 0) {
-      lderr(ictx->cct) << "removing snapshot from header failed: "
-		       << cpp_strerror(r) << dendl;
-      return r;
-    }
-
-    return 0;
-  }
-  int rename_snap(ImageCtx *ictx, uint64_t src_snap_id, const char *dst_name)
-  {
-    assert(ictx->owner_lock.is_locked());
-
-    int r;
-    map<snap_t, SnapInfo>::iterator it;
-    {
-      RWLock::RLocker(ictx->snap_lock);
-      it = ictx->snap_info.find(src_snap_id);
-      if (it == ictx->snap_info.end()) {
-        ldout(ictx->cct, 20) << __func__ << " can not find snap with snap id "
-                             << src_snap_id << dendl;
-        return -ENOENT;
-      }
-    }
-    bool lock_owner = ictx->image_watcher->is_lock_owner();
-    if (ictx->image_watcher->is_lock_supported()) {
-      assert(lock_owner);
-    }
-
-
-    if (ictx->old_format) {
-      r = cls_client::old_snapshot_rename(&ictx->md_ctx, ictx->header_oid,
-				       src_snap_id, dst_name);
-    } else {
-      librados::ObjectWriteOperation op;
-      if (lock_owner) {
-	ictx->image_watcher->assert_header_locked(&op);
-      }
-      cls_client::snapshot_rename(&op, src_snap_id, dst_name);
-      r = ictx->md_ctx.operate(ictx->header_oid, &op);
-    }
-
-    if (r < 0) {
-      lderr(ictx->cct) << "rename snapshot name failed: "
-		       << cpp_strerror(r) << dendl;
-      return r;
-    }
-
-    RWLock::WLocker snap_locker(ictx->snap_lock);
-    if (!ictx->old_format) {
-      if (lock_owner) {
-        it = ictx->snap_info.find(src_snap_id);
-        if (it == ictx->snap_info.end())
-          return -ENOENT;
-        ictx->snap_ids.erase(it->second.name);
-        it->second.name = dst_name;
-        ictx->snap_ids.insert(make_pair(dst_name,it->first));
-        if (ictx->snap_id == src_snap_id)
-          ictx->snap_name = it->second.name;
-      }
-    }
-    return 0;
-  }
-
-  int ictx_check(ImageCtx *ictx) {
-    RWLock::RLocker owner_locker(ictx->owner_lock);
-    return ictx_check(ictx, ictx->owner_lock);
-  }
-
-  int ictx_check(ImageCtx *ictx, const RWLock &owner_lock)
-  {
-    assert(ictx->owner_lock.is_locked());
-    CephContext *cct = ictx->cct;
-    ldout(cct, 20) << "ictx_check " << ictx << dendl;
-
-    bool needs_refresh = false;
-    int refresh_seq;
-    {
-      Mutex::Locker refresh_locker(ictx->refresh_lock);
-      while (ictx->refresh_in_progress) {
-        ictx->refresh_cond.Wait(ictx->refresh_lock);
-      }
-
-      if (ictx->last_refresh != ictx->refresh_seq) {
-        ictx->refresh_in_progress = true;
-        needs_refresh = true;
-        refresh_seq = ictx->refresh_seq;
-      }
-    }
-
-    if (needs_refresh) {
-      int r = ictx_refresh(ictx);
-
-      Mutex::Locker refresh_locker(ictx->refresh_lock);
-      ictx->refresh_in_progress = false;
-      ictx->refresh_cond.Signal();
-
-      if (r < 0) {
-	lderr(cct) << "Error re-reading rbd header: " << cpp_strerror(-r)
-		   << dendl;
-        return r;
-      }
-      ictx->last_refresh = refresh_seq;
-    }
-    return 0;
-  }
-
-  int refresh_parent(ImageCtx *ictx) {
-    assert(ictx->cache_lock.is_locked());
-    assert(ictx->snap_lock.is_wlocked());
-    assert(ictx->parent_lock.is_wlocked());
-
-    // close the parent if it changed or this image no longer needs
-    // to read from it
-    int r;
-    if (ictx->parent) {
-      uint64_t overlap;
-      r = ictx->get_parent_overlap(ictx->snap_id, &overlap);
-      if (r < 0 && r != -ENOENT) {
-	return r;
-      }
-      if (r == -ENOENT || overlap == 0 ||
-	  ictx->parent->md_ctx.get_id() !=
-            ictx->get_parent_pool_id(ictx->snap_id) ||
-	  ictx->parent->id != ictx->get_parent_image_id(ictx->snap_id) ||
-	  ictx->parent->snap_id != ictx->get_parent_snap_id(ictx->snap_id)) {
-	ictx->clear_nonexistence_cache();
-	close_parent(ictx);
-      }
-    }
-
-    if (ictx->get_parent_pool_id(ictx->snap_id) > -1 && !ictx->parent) {
-      r = open_parent(ictx);
-      if (r < 0) {
-	lderr(ictx->cct) << "error opening parent snapshot: "
-			 << cpp_strerror(r) << dendl;
-	return r;
-      }
-    }
-
-    return 0;
-  }
-
-  int ictx_refresh(ImageCtx *ictx)
-  {
-    assert(ictx->owner_lock.is_locked());
-    RWLock::WLocker md_locker(ictx->md_lock);
-
-    CephContext *cct = ictx->cct;
-
-    ldout(cct, 20) << "ictx_refresh " << ictx << dendl;
-
-    ::SnapContext new_snapc;
-    bool new_snap = false;
-    vector<string> snap_names;
-    vector<uint64_t> snap_sizes;
-    vector<parent_info> snap_parents;
-    vector<uint8_t> snap_protection;
-    vector<uint64_t> snap_flags;
-    {
-      Mutex::Locker cache_locker(ictx->cache_lock);
-      RWLock::WLocker snap_locker(ictx->snap_lock);
-
-      {
-	int r;
-	RWLock::WLocker parent_locker(ictx->parent_lock);
-	ictx->lockers.clear();
-	if (ictx->old_format) {
-	  r = read_header(ictx->md_ctx, ictx->header_oid, &ictx->header, NULL);
-	  if (r < 0) {
-	    lderr(cct) << "Error reading header: " << cpp_strerror(r) << dendl;
-	    return r;
-	  }
-	  r = cls_client::old_snapshot_list(&ictx->md_ctx, ictx->header_oid,
-					    &snap_names, &snap_sizes, &new_snapc);
-	  if (r < 0) {
-	    lderr(cct) << "Error listing snapshots: " << cpp_strerror(r)
-		       << dendl;
-	    return r;
-	  }
-	  ClsLockType lock_type = LOCK_NONE;
-	  r = rados::cls::lock::get_lock_info(&ictx->md_ctx, ictx->header_oid,
-					      RBD_LOCK_NAME, &ictx->lockers,
-					      &lock_type, &ictx->lock_tag);
-
-	  // If EOPNOTSUPP, treat image as if there are no locks (we can't
-	  // query them).
-
-	  // Ugly: OSDs prior to eed28daaf8927339c2ecae1b1b06c1b63678ab03
-	  // return EIO when the class isn't present; should be EOPNOTSUPP.
-	  // Treat EIO or EOPNOTSUPP the same for now, as LOCK_NONE.  Blech.
-
-	  if (r < 0 && ((r != -EOPNOTSUPP) && (r != -EIO))) {
-	    lderr(cct) << "Error getting lock info: " << cpp_strerror(r)
-		       << dendl;
-	    return r;
-	  }
-	  ictx->exclusive_locked = (lock_type == LOCK_EXCLUSIVE);
-	  ictx->order = ictx->header.options.order;
-	  ictx->size = ictx->header.image_size;
-	  ictx->object_prefix = ictx->header.block_name;
-	  ictx->init_layout();
-	} else {
-	  do {
-	    uint64_t incompatible_features;
-	    bool read_only = ictx->read_only || ictx->snap_id != CEPH_NOSNAP;
-	    r = cls_client::get_mutable_metadata(&ictx->md_ctx, ictx->header_oid,
-						 read_only,
-						 &ictx->size, &ictx->features,
-						 &incompatible_features,
-						 &ictx->lockers,
-						 &ictx->exclusive_locked,
-						 &ictx->lock_tag,
-						 &new_snapc,
-						 &ictx->parent_md);
-	    if (r < 0) {
-	      lderr(cct) << "Error reading mutable metadata: " << cpp_strerror(r)
-			 << dendl;
-	      return r;
-	    }
-
-	    uint64_t unsupported = incompatible_features & ~RBD_FEATURES_ALL;
-	    if (unsupported) {
-	      lderr(ictx->cct) << "Image uses unsupported features: "
-			       << unsupported << dendl;
-	      return -ENOSYS;
-	    }
-
-	    r = cls_client::get_flags(&ictx->md_ctx, ictx->header_oid,
-				      &ictx->flags, new_snapc.snaps,
-				      &snap_flags);
-	    if (r == -EOPNOTSUPP || r == -EIO) {
-	      // Older OSD doesn't support RBD flags, need to assume the worst
-	      ldout(ictx->cct, 10) << "OSD does not support RBD flags"
-				   << "disabling object map optimizations"
-				   << dendl;
-	      ictx->flags = RBD_FLAG_OBJECT_MAP_INVALID;
-              if ((ictx->features & RBD_FEATURE_FAST_DIFF) != 0) {
-                ictx->flags |= RBD_FLAG_FAST_DIFF_INVALID;
-              }
-
-	      vector<uint64_t> default_flags(new_snapc.snaps.size(), ictx->flags);
-	      snap_flags.swap(default_flags);
-            } else if (r == -ENOENT) {
-              ldout(ictx->cct, 10) << "Image at invalid snapshot" << dendl;
-	      continue;
-            } else if (r < 0) {
-              lderr(cct) << "Error reading flags: " << cpp_strerror(r) << dendl;
-              return r;
-            }
-
-	    r = cls_client::snapshot_list(&(ictx->md_ctx), ictx->header_oid,
-					  new_snapc.snaps, &snap_names,
-                                          &snap_sizes, &snap_parents,
-                                          &snap_protection);
-	    // -ENOENT here means we raced with snapshot deletion
-	    if (r < 0 && r != -ENOENT) {
-	      lderr(ictx->cct) << "snapc = " << new_snapc << dendl;
-	      lderr(ictx->cct) << "Error listing snapshots: " << cpp_strerror(r)
-			       << dendl;
-	      return r;
-	    }
-	  } while (r == -ENOENT);
-	}
-
-	for (size_t i = 0; i < new_snapc.snaps.size(); ++i) {
-	  parent_info parent;
-	  if (!ictx->old_format)
-	    parent = snap_parents[i];
-	  vector<snap_t>::const_iterator it =
-	    find(ictx->snaps.begin(), ictx->snaps.end(), new_snapc.snaps[i].val);
-	  if (it == ictx->snaps.end()) {
-	    new_snap = true;
-	    ldout(cct, 20) << "new snapshot id=" << new_snapc.snaps[i].val
-			   << " name=" << snap_names[i]
-			   << " size=" << snap_sizes[i]
-			   << dendl;
-	  }
-	}
-
-	ictx->snaps.clear();
-	ictx->snap_info.clear();
-	ictx->snap_ids.clear();
-	for (size_t i = 0; i < new_snapc.snaps.size(); ++i) {
-	  uint64_t flags = ictx->old_format ? 0 : snap_flags[i];
-	  uint8_t protection_status = ictx->old_format ?
-	    (uint8_t)RBD_PROTECTION_STATUS_UNPROTECTED : snap_protection[i];
-	  parent_info parent;
-	  if (!ictx->old_format)
-	    parent = snap_parents[i];
-	  ictx->add_snap(snap_names[i], new_snapc.snaps[i].val, snap_sizes[i],
-			 parent, protection_status, flags);
-	}
-
-	r = refresh_parent(ictx);
-	if (r < 0)
-	  return r;
-      } // release parent_lock
-
-      if (!new_snapc.is_valid()) {
-	lderr(cct) << "image snap context is invalid!" << dendl;
-	return -EIO;
-      }
-
-      ictx->snapc = new_snapc;
-
-      if (ictx->snap_id != CEPH_NOSNAP &&
-	  ictx->get_snap_id(ictx->snap_name) != ictx->snap_id) {
-	lderr(cct) << "tried to read from a snapshot that no longer exists: "
-		   << ictx->snap_name << dendl;
-	ictx->snap_exists = false;
-      }
-
-      ictx->object_map.refresh(ictx->snap_id);
-
-      ictx->data_ctx.selfmanaged_snap_set_write_ctx(ictx->snapc.seq, ictx->snaps);
-
-      // dynamically enable/disable journaling support
-      if ((ictx->features & RBD_FEATURE_JOURNALING) != 0 &&
-          ictx->image_watcher != NULL && ictx->journal == NULL &&
-          ictx->snap_name.empty()) {
-        ictx->open_journal();
-      } else if ((ictx->features & RBD_FEATURE_JOURNALING) == 0 &&
-                 ictx->journal != NULL) {
-        // TODO journal needs to be disabled via proxied request to avoid race
-        //      between deleting journal and appending journal events
-      }
-    } // release snap_lock and cache_lock
-
-    if (ictx->image_watcher != NULL) {
-      ictx->image_watcher->refresh();
-    }
-
-    if (new_snap) {
-      ictx->flush();
-    }
-    return 0;
-  }
-
   int snap_rollback(ImageCtx *ictx, const char *snap_name,
 		    ProgressContext& prog_ctx)
   {
@@ -2791,7 +2239,7 @@ reprotect_and_return_err:
     ldout(cct, 20) << "snap_rollback " << ictx << " snap = " << snap_name
 		   << dendl;
 
-    int r = ictx_check(ictx);
+    int r = ictx->state->refresh_if_required();
     if (r < 0)
       return r;
 
@@ -2821,40 +2269,39 @@ reprotect_and_return_err:
       if (r < 0) {
 	return -EROFS;
       }
-      if (ictx->image_watcher->is_lock_supported() &&
-	  !ictx->image_watcher->is_lock_owner()) {
+      if (ictx->exclusive_lock != nullptr &&
+	  !ictx->exclusive_lock->is_lock_owner()) {
 	return -EROFS;
       }
 
-      ictx->snap_lock.get_read();
+      ictx->snap_lock.get_read();
+      if (ictx->journal != NULL) {
+        C_SaferCond journal_ctx;
+        ictx->journal->wait_for_journal_ready(&journal_ctx);
+
+        ictx->snap_lock.put_read();
+        r = journal_ctx.wait();
+        if (r < 0) {
+          lderr(cct) << "Failed to initialize journal: " << cpp_strerror(r)
+                     << dendl;
+          return r;
+        }
+
+        ictx->snap_lock.get_read();
+      }
+
       new_size = ictx->get_image_size(snap_id);
       ictx->snap_lock.put_read();
-
-      // need to flush any pending writes before resizing and rolling back -
-      // writes might create new snapshots. Rolling back will replace
-      // the current version, so we have to invalidate that too.
-      RWLock::WLocker md_locker(ictx->md_lock);
-      r = ictx->invalidate_cache();
-      if (r < 0) {
-	return r;
-      }
-    }
-
-    ldout(cct, 2) << "resizing to snapshot size..." << dendl;
-    NoOpProgressContext no_op;
-    C_SaferCond ctx;
-    async_resize_helper(ictx, &ctx, new_size, no_op);
-
-    r = ctx.wait();
-    if (r < 0) {
-      lderr(cct) << "Error resizing to snapshot size: "
-		 << cpp_strerror(r) << dendl;
-      return r;
     }
 
-    r = rollback_image(ictx, snap_id, prog_ctx);
+    // TODO need to wait for journal replay to complete (if enabled)
+    C_SaferCond cond_ctx;
+    operation::SnapshotRollbackRequest<> *request =
+      new operation::SnapshotRollbackRequest<>(*ictx, &cond_ctx, snap_name,
+                                               snap_id, new_size, prog_ctx);
+    request->send();
+    r = cond_ctx.wait();
     if (r < 0) {
-      lderr(cct) << "Error rolling back image: " << cpp_strerror(-r) << dendl;
       return r;
     }
 
@@ -2880,30 +2327,37 @@ reprotect_and_return_err:
     CephContext *cct = (CephContext *)dest_md_ctx.cct();
     ldout(cct, 20) << "copy " << src->name
 		   << (src->snap_name.length() ? "@" + src->snap_name : "")
-		   << " -> " << destname << dendl;
+		   << " -> " << destname << " opts = " << opts << dendl;
 
     src->snap_lock.get_read();
     uint64_t features = src->features;
     uint64_t src_size = src->get_image_size(src->snap_id);
     src->snap_lock.put_read();
-    uint64_t stripe_unit = src->stripe_unit;
-    uint64_t stripe_count = src->stripe_count;
-    opts.get(RBD_IMAGE_OPTION_FEATURES, &features);
-    opts.get(RBD_IMAGE_OPTION_STRIPE_UNIT, &stripe_unit);
-    opts.get(RBD_IMAGE_OPTION_STRIPE_COUNT, &stripe_count);
-    int order = src->order;
-    uint64_t opt_order = 0;
-    if (opts.get(RBD_IMAGE_OPTION_ORDER, &opt_order)) {
-      order = opt_order;
+    if (opts.get(RBD_IMAGE_OPTION_FEATURES, &features) != 0) {
+      opts.set(RBD_IMAGE_OPTION_FEATURES, features);
     }
-
     if (features & ~RBD_FEATURES_ALL) {
       lderr(cct) << "librbd does not support requested features" << dendl;
       return -ENOSYS;
     }
+    uint64_t format = src->old_format ? 1 : 2;
+    if (opts.get(RBD_IMAGE_OPTION_FORMAT, &format) != 0) {
+      opts.set(RBD_IMAGE_OPTION_FORMAT, format);
+    }
+    uint64_t stripe_unit = src->stripe_unit;
+    if (opts.get(RBD_IMAGE_OPTION_STRIPE_UNIT, &stripe_unit) != 0) {
+      opts.set(RBD_IMAGE_OPTION_STRIPE_UNIT, stripe_unit);
+    }
+    uint64_t stripe_count = src->stripe_count;
+    if (opts.get(RBD_IMAGE_OPTION_STRIPE_COUNT, &stripe_count) != 0) {
+      opts.set(RBD_IMAGE_OPTION_STRIPE_COUNT, stripe_count);
+    }
+    uint64_t order = src->order;
+    if (opts.get(RBD_IMAGE_OPTION_ORDER, &order) != 0) {
+      opts.set(RBD_IMAGE_OPTION_ORDER, order);
+    }
 
-    int r = create(dest_md_ctx, destname, src_size, src->old_format,
-		   features, &order, stripe_unit, stripe_count);
+    int r = create(dest_md_ctx, destname, src_size, opts);
     if (r < 0) {
       lderr(cct) << "header creation failed" << dendl;
       return r;
@@ -2912,14 +2366,14 @@ reprotect_and_return_err:
 
     ImageCtx *dest = new librbd::ImageCtx(destname, "", NULL,
 					  dest_md_ctx, false);
-    r = open_image(dest);
+    r = dest->state->open();
     if (r < 0) {
       lderr(cct) << "failed to read newly created header" << dendl;
       return r;
     }
 
     r = copy(src, dest, prog_ctx);
-    int close_r = close_image(dest);
+    int close_r = dest->state->close();
     if (r == 0 && close_r < 0) {
       r = close_r;
     }
@@ -2963,7 +2417,7 @@ reprotect_and_return_err:
       }
 
       Context *ctx = new C_CopyWrite(m_throttle, m_bl);
-      AioCompletion *comp = aio_create_completion_internal(ctx, rbd_ctx_cb);
+      AioCompletion *comp = AioCompletion::create(ctx);
 
       // coordinate through AIO WQ to ensure lock is acquired if needed
       m_dest->aio_work_queue->aio_write(comp, m_offset, m_bl->length(),
@@ -3021,7 +2475,7 @@ reprotect_and_return_err:
       uint64_t len = min(period, src_size - offset);
       bufferlist *bl = new bufferlist();
       Context *ctx = new C_CopyRead(&throttle, dest, offset, bl);
-      AioCompletion *comp = aio_create_completion_internal(ctx, rbd_ctx_cb);
+      AioCompletion *comp = AioCompletion::create(ctx);
       AioImageRequest::aio_read(src, comp, offset, len, NULL, bl,
                                 fadvise_flags);
       prog_ctx.update_progress(offset, src_size);
@@ -3033,29 +2487,6 @@ reprotect_and_return_err:
     return r;
   }
 
-  // common snap_set functionality for snap_set and open_image
-
-  int _snap_set(ImageCtx *ictx, const char *snap_name)
-  {
-    RWLock::WLocker owner_locker(ictx->owner_lock);
-    RWLock::RLocker md_locker(ictx->md_lock);
-    Mutex::Locker cache_locker(ictx->cache_lock);
-    RWLock::WLocker snap_locker(ictx->snap_lock);
-    RWLock::WLocker parent_locker(ictx->parent_lock);
-    int r;
-    if ((snap_name != NULL) && (strlen(snap_name) != 0)) {
-      r = ictx->snap_set(snap_name);
-    } else {
-      ictx->snap_unset();
-      r = 0;
-    }
-    if (r < 0) {
-      return r;
-    }
-    refresh_parent(ictx);
-    return 0;
-  }
-
   int snap_set(ImageCtx *ictx, const char *snap_name)
   {
     ldout(ictx->cct, 20) << "snap_set " << ictx << " snap = "
@@ -3063,191 +2494,22 @@ reprotect_and_return_err:
 
     // ignore return value, since we may be set to a non-existent
     // snapshot and the user is trying to fix that
-    ictx_check(ictx);
-
-    int r;
-    bool snapshot_mode = (snap_name != NULL && strlen(snap_name) != 0);
-    if (snapshot_mode) {
-      {
-        RWLock::WLocker owner_locker(ictx->owner_lock);
-        if (ictx->image_watcher != NULL &&
-            ictx->image_watcher->is_lock_owner()) {
-          r = ictx->image_watcher->release_lock();
-          if (r < 0) {
-            return r;
-          }
-        }
-      }
-
-      ictx->cancel_async_requests();
-      {
-        RWLock::RLocker owner_locker(ictx->owner_lock);
-        r = ictx->flush();
-      }
+    ictx->state->refresh_if_required();
 
-      {
-        RWLock::WLocker snap_locker(ictx->snap_lock);
-        if (ictx->journal != NULL) {
-          r = ictx->close_journal(false);
-          if (r < 0) {
-            return r;
-          }
-        }
-      }
-    }
+    C_SaferCond ctx;
+    std::string name(snap_name == nullptr ? "" : snap_name);
+    ictx->state->snap_set(name, &ctx);
 
-    r = _snap_set(ictx, snap_name);
+    int r = ctx.wait();
     if (r < 0) {
-      return r;
-    }
-
-    {
-      RWLock::WLocker snap_locker(ictx->snap_lock);
-      if ((ictx->features & RBD_FEATURE_JOURNALING) != 0 &&
-          ictx->journal == NULL && !snapshot_mode) {
-        ictx->open_journal();
-      }
-    }
-
-    RWLock::RLocker owner_locker(ictx->owner_lock);
-    if (ictx->image_watcher != NULL) {
-      ictx->image_watcher->refresh();
-    }
-    return r;
-  }
-
-  int open_image(ImageCtx *ictx)
-  {
-    ldout(ictx->cct, 20) << "open_image: ictx = " << ictx
-			 << " name = '" << ictx->name
-			 << "' id = '" << ictx->id
-			 << "' snap_name = '"
-			 << ictx->snap_name << "'" << dendl;
-    int r = ictx->init();
-    if (r < 0)
-      goto err_close;
-
-    if (!ictx->read_only) {
-      r = ictx->register_watch();
-      if (r < 0) {
-	lderr(ictx->cct) << "error registering a watch: " << cpp_strerror(r)
-			 << dendl;
-	goto err_close;
+      if (r != -ENOENT) {
+        lderr(ictx->cct) << "failed to " << (name.empty() ? "un" : "") << "set "
+                         << "snapshot: " << cpp_strerror(r) << dendl;
       }
-    }
-
-    {
-      RWLock::RLocker owner_locker(ictx->owner_lock);
-      r = ictx_refresh(ictx);
-    }
-    if (r < 0)
-      goto err_close;
-
-    if ((r = _snap_set(ictx, ictx->snap_name.c_str())) < 0)
-      goto err_close;
-
-    if (ictx->image_watcher != NULL) {
-      RWLock::RLocker owner_locker(ictx->owner_lock);
-      ictx->image_watcher->refresh();
+      return r;
     }
 
     return 0;
-
-  err_close:
-    close_image(ictx);
-    return r;
-  }
-
-  int close_image(ImageCtx *ictx)
-  {
-    ldout(ictx->cct, 20) << "close_image " << ictx << dendl;
-
-    if (!ictx->read_only) {
-      // finish all incoming IO operations
-      ictx->aio_work_queue->drain();
-    }
-
-    int r = 0;
-    {
-      // release the lock (and flush all in-flight IO)
-      RWLock::WLocker owner_locker(ictx->owner_lock);
-      if (ictx->image_watcher != NULL && ictx->image_watcher->is_lock_owner()) {
-        r = ictx->image_watcher->release_lock();
-        if (r < 0) {
-          lderr(ictx->cct) << "error releasing image lock: " << cpp_strerror(r)
-                           << dendl;
-        }
-      }
-    }
-
-    assert(!ictx->aio_work_queue->writes_blocked() ||
-           ictx->aio_work_queue->writes_empty());
-
-    ictx->cancel_async_requests();
-    ictx->flush_async_operations();
-    ictx->readahead.wait_for_pending();
-
-    if (ictx->object_cacher) {
-      int flush_r = ictx->shutdown_cache(); // implicitly flushes
-      if (flush_r < 0) {
-        lderr(ictx->cct) << "error flushing IO: " << cpp_strerror(flush_r)
-                         << dendl;
-        if (r == 0) {
-          r = flush_r;
-        }
-      }
-    }
-
-    ictx->op_work_queue->drain();
-
-    if (ictx->copyup_finisher != NULL) {
-      ictx->copyup_finisher->wait_for_empty();
-      ictx->copyup_finisher->stop();
-    }
-
-    if (ictx->journal != NULL) {
-      int close_r = ictx->close_journal(true);
-      if (close_r < 0 && r == 0) {
-        r = close_r;
-      }
-    }
-
-    if (ictx->parent) {
-      RWLock::WLocker parent_locker(ictx->parent_lock);
-      int close_r = close_parent(ictx);
-      if (r == 0 && close_r < 0) {
-        r = close_r;
-      }
-    }
-
-    if (ictx->image_watcher) {
-      ictx->unregister_watch();
-    }
-
-    delete ictx;
-    return r;
-  }
-
-  int close_parent(ImageCtx *ictx)
-  {
-    assert(ictx->parent_lock.is_wlocked());
-    ImageCtx *parent_ictx = ictx->parent;
-
-    // AIO to the parent must be complete before closing
-    parent_ictx->flush_async_operations();
-    parent_ictx->readahead.wait_for_pending();
-    {
-      Mutex::Locker async_ops_locker(parent_ictx->async_ops_lock);
-      assert(parent_ictx->async_ops.empty());
-    }
-
-    // attempting to drain the work queues might result in deadlock
-    assert(parent_ictx->aio_work_queue->empty());
-    assert(parent_ictx->op_work_queue->empty());
-
-    int r = close_image(parent_ictx);
-    ictx->parent = NULL;
-    return r;
   }
 
   // 'flatten' child image by copying all parent's blocks
@@ -3256,7 +2518,7 @@ reprotect_and_return_err:
     CephContext *cct = ictx->cct;
     ldout(cct, 20) << "flatten" << dendl;
 
-    int r = ictx_check(ictx);
+    int r = ictx->state->refresh_if_required();
     if (r < 0) {
       return r;
     }
@@ -3290,22 +2552,15 @@ reprotect_and_return_err:
     return 0;
   }
 
-  int async_flatten(ImageCtx *ictx, Context *ctx, ProgressContext &prog_ctx)
+  void async_flatten(ImageCtx *ictx, Context *ctx, ProgressContext &prog_ctx)
   {
     assert(ictx->owner_lock.is_locked());
-    assert(!ictx->image_watcher->is_lock_supported() ||
-	   ictx->image_watcher->is_lock_owner());
+    assert(ictx->exclusive_lock == nullptr ||
+	   ictx->exclusive_lock->is_lock_owner());
 
     CephContext *cct = ictx->cct;
     ldout(cct, 20) << "flatten" << dendl;
 
-    int r;
-    // ictx_check also updates parent data
-    if ((r = ictx_check(ictx, ictx->owner_lock)) < 0) {
-      lderr(cct) << "ictx_check failed" << dendl;
-      return r;
-    }
-
     uint64_t object_size;
     uint64_t overlap_objects;
     ::SnapContext snapc;
@@ -3316,22 +2571,25 @@ reprotect_and_return_err:
       RWLock::RLocker l2(ictx->parent_lock);
 
       if (ictx->read_only) {
-        return -EROFS;
+        ctx->complete(-EROFS);
+        return;
       }
 
       // can't flatten a non-clone
       if (ictx->parent_md.spec.pool_id == -1) {
 	lderr(cct) << "image has no parent" << dendl;
-	return -EINVAL;
+        ctx->complete(-EINVAL);
+	return;
       }
       if (ictx->snap_id != CEPH_NOSNAP) {
 	lderr(cct) << "snapshots cannot be flattened" << dendl;
-	return -EROFS;
+        ctx->complete(-EROFS);
+	return;
       }
 
       snapc = ictx->snapc;
       assert(ictx->parent != NULL);
-      r = ictx->get_parent_overlap(CEPH_NOSNAP, &overlap);
+      int r = ictx->get_parent_overlap(CEPH_NOSNAP, &overlap);
       assert(r == 0);
       assert(overlap <= ictx->size);
 
@@ -3339,18 +2597,16 @@ reprotect_and_return_err:
       overlap_objects = Striper::get_num_objects(ictx->layout, overlap);
     }
 
-    AsyncFlattenRequest *req =
-      new AsyncFlattenRequest(*ictx, ctx, object_size, overlap_objects,
-			      snapc, prog_ctx);
+    operation::FlattenRequest<> *req = new operation::FlattenRequest<>(
+      *ictx, ctx, object_size, overlap_objects, snapc, prog_ctx);
     req->send();
-    return 0;
   }
 
   int rebuild_object_map(ImageCtx *ictx, ProgressContext &prog_ctx) {
     CephContext *cct = ictx->cct;
     ldout(cct, 10) << "rebuild_object_map" << dendl;
 
-    int r = ictx_check(ictx);
+    int r = ictx->state->refresh_if_required();
     if (r < 0) {
       return r;
     }
@@ -3370,31 +2626,27 @@ reprotect_and_return_err:
     return r;
   }
 
-  int async_rebuild_object_map(ImageCtx *ictx, Context *ctx,
-                               ProgressContext &prog_ctx) {
+  void async_rebuild_object_map(ImageCtx *ictx, Context *ctx,
+                                ProgressContext &prog_ctx) {
     assert(ictx->owner_lock.is_locked());
-    assert(!ictx->image_watcher->is_lock_supported() ||
-	   ictx->image_watcher->is_lock_owner());
+    assert(ictx->exclusive_lock == nullptr ||
+	   ictx->exclusive_lock->is_lock_owner());
 
     CephContext *cct = ictx->cct;
     ldout(cct, 20) << "async_rebuild_object_map " << ictx << dendl;
 
     if (ictx->read_only) {
-      return -EROFS;
+      ctx->complete(-EROFS);
+      return;
     }
     if (!ictx->test_features(RBD_FEATURE_OBJECT_MAP)) {
-      return -EINVAL;
-    }
-
-    int r = ictx_check(ictx, ictx->owner_lock);
-    if (r < 0) {
-      return r;
+      ctx->complete(-EINVAL);
+      return;
     }
 
-    RebuildObjectMapRequest *req = new RebuildObjectMapRequest(*ictx, ctx,
-                                                               prog_ctx);
+    operation::RebuildObjectMapRequest<> *req =
+      new operation::RebuildObjectMapRequest<>(*ictx, ctx, prog_ctx);
     req->send();
-    return 0;
   }
 
   int list_lockers(ImageCtx *ictx,
@@ -3404,7 +2656,7 @@ reprotect_and_return_err:
   {
     ldout(ictx->cct, 20) << "list_locks on image " << ictx << dendl;
 
-    int r = ictx_check(ictx);
+    int r = ictx->state->refresh_if_required();
     if (r < 0)
       return r;
 
@@ -3436,7 +2688,7 @@ reprotect_and_return_err:
 			 << " cookie='" << cookie << "' tag='" << tag << "'"
 			 << dendl;
 
-    int r = ictx_check(ictx);
+    int r = ictx->state->refresh_if_required();
     if (r < 0)
       return r;
 
@@ -3445,12 +2697,16 @@ reprotect_and_return_err:
      * checks that we think we will succeed. But for now, let's not
      * duplicate that code.
      */
-    RWLock::RLocker locker(ictx->md_lock);
-    r = rados::cls::lock::lock(&ictx->md_ctx, ictx->header_oid, RBD_LOCK_NAME,
-			       exclusive ? LOCK_EXCLUSIVE : LOCK_SHARED,
-			       cookie, tag, "", utime_t(), 0);
-    if (r < 0)
-      return r;
+    {
+      RWLock::RLocker locker(ictx->md_lock);
+      r = rados::cls::lock::lock(&ictx->md_ctx, ictx->header_oid, RBD_LOCK_NAME,
+			         exclusive ? LOCK_EXCLUSIVE : LOCK_SHARED,
+			         cookie, tag, "", utime_t(), 0);
+      if (r < 0) {
+        return r;
+      }
+    }
+
     notify_change(ictx->md_ctx, ictx->header_oid, ictx);
     return 0;
   }
@@ -3460,16 +2716,19 @@ reprotect_and_return_err:
     ldout(ictx->cct, 20) << "unlock image " << ictx
 			 << " cookie='" << cookie << "'" << dendl;
 
-
-    int r = ictx_check(ictx);
+    int r = ictx->state->refresh_if_required();
     if (r < 0)
       return r;
 
-    RWLock::RLocker locker(ictx->md_lock);
-    r = rados::cls::lock::unlock(&ictx->md_ctx, ictx->header_oid,
-				 RBD_LOCK_NAME, cookie);
-    if (r < 0)
-      return r;
+    {
+      RWLock::RLocker locker(ictx->md_lock);
+      r = rados::cls::lock::unlock(&ictx->md_ctx, ictx->header_oid,
+				   RBD_LOCK_NAME, cookie);
+      if (r < 0) {
+        return r;
+      }
+    }
+
     notify_change(ictx->md_ctx, ictx->header_oid, ictx);
     return 0;
   }
@@ -3480,7 +2739,7 @@ reprotect_and_return_err:
     ldout(ictx->cct, 20) << "break_lock image " << ictx << " client='" << client
 			 << "' cookie='" << cookie << "'" << dendl;
 
-    int r = ictx_check(ictx);
+    int r = ictx->state->refresh_if_required();
     if (r < 0)
       return r;
 
@@ -3517,7 +2776,7 @@ reprotect_and_return_err:
       if (client_address.empty()) {
         return -ENOENT;
       }
-      
+
       RWLock::RLocker locker(ictx->md_lock);
       librados::Rados rados(ictx->md_ctx);
       r = rados.blacklist_add(client_address,
@@ -3554,7 +2813,7 @@ reprotect_and_return_err:
     ldout(ictx->cct, 20) << "read_iterate " << ictx << " off = " << off
 			 << " len = " << len << dendl;
 
-    int r = ictx_check(ictx);
+    int r = ictx->state->refresh_if_required();
     if (r < 0)
       return r;
 
@@ -3577,26 +2836,19 @@ reprotect_and_return_err:
 
       bufferlist bl;
 
-      Mutex mylock("IoCtxImpl::write::mylock");
-      Cond cond;
-      bool done;
-      int ret;
-
-      Context *ctx = new C_SafeCond(&mylock, &cond, &done, &ret);
-      AioCompletion *c = aio_create_completion_internal(ctx, rbd_ctx_cb);
+      C_SaferCond ctx;
+      AioCompletion *c = AioCompletion::create(&ctx);
       AioImageRequest::aio_read(ictx, c, off, read_len, NULL, &bl, 0);
 
-      mylock.Lock();
-      while (!done)
-	cond.Wait(mylock);
-      mylock.Unlock();
-
-      if (ret < 0)
-	return ret;
+      int ret = ctx.wait();
+      if (ret < 0) {
+        return ret;
+      }
 
       r = cb(total_read, ret, bl.c_str(), arg);
-      if (r < 0)
+      if (r < 0) {
 	return r;
+      }
 
       total_read += ret;
       left -= ret;
@@ -3623,7 +2875,7 @@ reprotect_and_return_err:
       ictx->flush();
     }
 
-    int r = ictx_check(ictx);
+    int r = ictx->state->refresh_if_required();
     if (r < 0) {
       return r;
     }
@@ -3641,18 +2893,6 @@ reprotect_and_return_err:
     return r;
   }
 
-  void rados_req_cb(rados_completion_t c, void *arg)
-  {
-    AioObjectRequest *req = reinterpret_cast<AioObjectRequest *>(arg);
-    req->complete(rados_aio_get_return_value(c));
-  }
-
-  void rados_ctx_cb(rados_completion_t c, void *arg)
-  {
-    Context *comp = reinterpret_cast<Context *>(arg);
-    comp->complete(rados_aio_get_return_value(c));
-  }
-
   // validate extent against image size; clip to image size if necessary
   int clip_io(ImageCtx *ictx, uint64_t off, uint64_t *len)
   {
@@ -3683,7 +2923,7 @@ reprotect_and_return_err:
     CephContext *cct = ictx->cct;
     ldout(cct, 20) << "flush " << ictx << dendl;
 
-    int r = ictx_check(ictx);
+    int r = ictx->state->refresh_if_required();
     if (r < 0) {
       return r;
     }
@@ -3702,7 +2942,7 @@ reprotect_and_return_err:
     CephContext *cct = ictx->cct;
     ldout(cct, 20) << "invalidate_cache " << ictx << dendl;
 
-    int r = ictx_check(ictx);
+    int r = ictx->state->refresh_if_required();
     if (r < 0) {
       return r;
     }
@@ -3714,12 +2954,29 @@ reprotect_and_return_err:
     return r;
   }
 
+  int poll_io_events(ImageCtx *ictx, AioCompletion **comps, int numcomp)  // moves up to numcomp entries from ictx->completed_reqs into comps
+  {
+    if (numcomp <= 0)
+      return -EINVAL;  // a non-positive capacity is rejected before anything is touched
+    CephContext *cct = ictx->cct;
+    ldout(cct, 20) << __func__ << " " << ictx << " numcomp = " << numcomp << dendl;
+    int i = 0;  // number of completions copied into comps so far
+    Mutex::Locker l(ictx->completed_reqs_lock);  // scoped locker; presumably released when the function returns
+    while (i < numcomp) {
+      if (ictx->completed_reqs.empty())
+        break;  // no waiting here: stop as soon as the queue has nothing left
+      comps[i++] = ictx->completed_reqs.front();
+      ictx->completed_reqs.pop_front();
+    }
+    return i;  // count of entries written to comps (0 if the queue was empty)
+  }
+
   int metadata_get(ImageCtx *ictx, const string &key, string *value)
   {
     CephContext *cct = ictx->cct;
     ldout(cct, 20) << "metadata_get " << ictx << " key=" << key << dendl;
 
-    int r = ictx_check(ictx);
+    int r = ictx->state->refresh_if_required();
     if (r < 0) {
       return r;
     }
@@ -3732,7 +2989,7 @@ reprotect_and_return_err:
     CephContext *cct = ictx->cct;
     ldout(cct, 20) << "metadata_set " << ictx << " key=" << key << " value=" << value << dendl;
 
-    int r = ictx_check(ictx);
+    int r = ictx->state->refresh_if_required();
     if (r < 0) {
       return r;
     }
@@ -3747,7 +3004,7 @@ reprotect_and_return_err:
     CephContext *cct = ictx->cct;
     ldout(cct, 20) << "metadata_remove " << ictx << " key=" << key << dendl;
 
-    int r = ictx_check(ictx);
+    int r = ictx->state->refresh_if_required();
     if (r < 0) {
       return r;
     }
@@ -3760,7 +3017,7 @@ reprotect_and_return_err:
     CephContext *cct = ictx->cct;
     ldout(cct, 20) << "metadata_list " << ictx << dendl;
 
-    int r = ictx_check(ictx);
+    int r = ictx->state->refresh_if_required();
     if (r < 0) {
       return r;
     }
@@ -3768,6 +3025,130 @@ reprotect_and_return_err:
     return cls_client::metadata_list(&ictx->md_ctx, ictx->header_oid, start, max, pairs);
   }
 
+  int mirror_is_enabled(IoCtx& io_ctx, bool *enabled) {  // forwards to cls_client::mirror_is_enabled, which receives the enabled pointer
+    CephContext *cct = reinterpret_cast<CephContext *>(io_ctx.cct());
+    ldout(cct, 20) << __func__ << dendl;
+
+    int r = cls_client::mirror_is_enabled(&io_ctx, enabled);
+    if (r < 0) {  // any negative result is logged and returned unchanged
+      lderr(cct) << "Failed to retrieve mirror flag: " << cpp_strerror(r)
+                 << dendl;
+      return r;
+    }
+    return 0;  // non-negative results from the cls call are collapsed to 0
+  }
+
+  int mirror_set_enabled(IoCtx& io_ctx, bool enabled) {  // forwards the flag to cls_client::mirror_set_enabled on io_ctx
+    CephContext *cct = reinterpret_cast<CephContext *>(io_ctx.cct());
+    ldout(cct, 20) << __func__ << ": enabled=" << enabled << dendl;
+
+    int r = cls_client::mirror_set_enabled(&io_ctx, enabled);
+    if (r < 0 && r != -ENOENT) {  // -ENOENT is deliberately not reported as a failure
+      lderr(cct) << "Failed to set mirror flag: " << cpp_strerror(r) << dendl;
+      return r;
+    }
+    return 0;  // success, or the cls call returned -ENOENT
+  }
+
+  int mirror_peer_add(IoCtx& io_ctx, const std::string &cluster_uuid,  // registers a remote peer via cls_client::mirror_peer_add
+                      const std::string &cluster_name,
+                      const std::string &client_name) {
+    CephContext *cct = reinterpret_cast<CephContext *>(io_ctx.cct());
+    ldout(cct, 20) << __func__ << ": uuid=" << cluster_uuid << ", "
+                   << "name=" << cluster_name << ", "
+                   << "client=" << client_name << dendl;
+
+    std::string local_cluster_uuid;  // filled in by cluster_fsid() below
+    librados::Rados rados(io_ctx);
+    int r = rados.cluster_fsid(&local_cluster_uuid);
+    if (r < 0) {  // without the local fsid the self-peer check cannot run
+      lderr(cct) << "Failed to retrieve cluster uuid" << dendl;
+      return r;
+    }
+
+    if (local_cluster_uuid == cluster_uuid) {  // refuse to add this cluster as its own peer
+      lderr(cct) << "Cannot add self as remote peer" << dendl;
+      return -EINVAL;
+    }
+
+    r = cls_client::mirror_peer_add(&io_ctx, cluster_uuid, cluster_name,
+                                    client_name);
+    if (r < 0) {  // every negative result is an error here (no -ENOENT exemption)
+      lderr(cct) << "Failed to add mirror peer '" << cluster_uuid << "': "
+                 << cpp_strerror(r) << dendl;
+      return r;
+    }
+    return 0;  // peer recorded
+  }
+
+  int mirror_peer_remove(IoCtx& io_ctx, const std::string &cluster_uuid) {  // forwards to cls_client::mirror_peer_remove for cluster_uuid
+    CephContext *cct = reinterpret_cast<CephContext *>(io_ctx.cct());
+    ldout(cct, 20) << __func__ << ": uuid=" << cluster_uuid << dendl;
+
+    int r = cls_client::mirror_peer_remove(&io_ctx, cluster_uuid);
+    if (r < 0 && r != -ENOENT) {  // -ENOENT is not reported as a failure
+      lderr(cct) << "Failed to remove peer '" << cluster_uuid << "': "
+                 << cpp_strerror(r) << dendl;
+      return r;
+    }
+    return 0;  // success, or the cls call returned -ENOENT
+  }
+
+  int mirror_peer_list(IoCtx& io_ctx, std::vector<mirror_peer_t> *peers) {  // replaces *peers with the peers returned by cls_client::mirror_peer_list
+    CephContext *cct = reinterpret_cast<CephContext *>(io_ctx.cct());
+    ldout(cct, 20) << __func__ << dendl;
+
+    std::vector<cls::rbd::MirrorPeer> mirror_peers;
+    int r = cls_client::mirror_peer_list(&io_ctx, &mirror_peers);
+    if (r < 0 && r != -ENOENT) {  // -ENOENT falls through and returns 0 with whatever mirror_peers holds
+      lderr(cct) << "Failed to list peers: " << cpp_strerror(r) << dendl;
+      return r;
+    }
+
+    peers->clear();  // previous contents of the output vector are discarded
+    peers->reserve(mirror_peers.size());
+    for (auto &mirror_peer : mirror_peers) {  // copy each cls record into the mirror_peer_t output type field by field
+      mirror_peer_t peer;
+      peer.cluster_uuid = mirror_peer.cluster_uuid;
+      peer.cluster_name = mirror_peer.cluster_name;
+      peer.client_name = mirror_peer.client_name;
+      peers->push_back(peer);
+    }
+    return 0;
+  }
+
+  int mirror_peer_set_client(IoCtx& io_ctx, const std::string &cluster_uuid,  // forwards to cls_client::mirror_peer_set_client
+                             const std::string &client_name) {
+    CephContext *cct = reinterpret_cast<CephContext *>(io_ctx.cct());
+    ldout(cct, 20) << __func__ << ": uuid=" << cluster_uuid << ", "
+                   << "client=" << client_name << dendl;
+
+    int r = cls_client::mirror_peer_set_client(&io_ctx, cluster_uuid,
+                                               client_name);
+    if (r < 0) {  // all negative results are errors; the log names the peer uuid, not the client
+      lderr(cct) << "Failed to update client '" << cluster_uuid << "': "
+                 << cpp_strerror(r) << dendl;
+      return r;
+    }
+    return 0;
+  }
+
+  int mirror_peer_set_cluster(IoCtx& io_ctx, const std::string &cluster_uuid,  // forwards to cls_client::mirror_peer_set_cluster
+                              const std::string &cluster_name) {
+    CephContext *cct = reinterpret_cast<CephContext *>(io_ctx.cct());
+    ldout(cct, 20) << __func__ << ": uuid=" << cluster_uuid << ", "
+                   << "cluster=" << cluster_name << dendl;
+
+    int r = cls_client::mirror_peer_set_cluster(&io_ctx, cluster_uuid,
+                                                cluster_name);
+    if (r < 0) {  // all negative results are errors; the log names the peer uuid
+      lderr(cct) << "Failed to update cluster '" << cluster_uuid << "': "
+                 << cpp_strerror(r) << dendl;
+      return r;
+    }
+    return 0;
+  }
+
   void rbd_req_cb(completion_t cb, void *arg)
   {
     AioObjectRequest *req = reinterpret_cast<AioObjectRequest *>(arg);
@@ -3832,22 +3213,4 @@ reprotect_and_return_err:
       ictx->perfcounter->inc(l_librbd_readahead_bytes, readahead_length);
     }
   }
-
-  AioCompletion *aio_create_completion() {
-    AioCompletion *c = new AioCompletion();
-    return c;
-  }
-
-  AioCompletion *aio_create_completion(void *cb_arg, callback_t cb_complete) {
-    AioCompletion *c = new AioCompletion();
-    c->set_complete_cb(cb_arg, cb_complete);
-    return c;
-  }
-
-  AioCompletion *aio_create_completion_internal(void *cb_arg,
-						callback_t cb_complete) {
-    AioCompletion *c = aio_create_completion(cb_arg, cb_complete);
-    c->rbd_comp = c;
-    return c;
-  }
 }
diff --git a/src/librbd/internal.h b/src/librbd/internal.h
index 3de90a6..81368a8 100644
--- a/src/librbd/internal.h
+++ b/src/librbd/internal.h
@@ -10,7 +10,7 @@
 #include <string>
 #include <vector>
 
-#include "include/buffer.h"
+#include "include/buffer_fwd.h"
 #include "include/rbd/librbd.hpp"
 #include "include/rbd_types.h"
 
@@ -48,6 +48,7 @@ enum {
 };
 
 class Context;
+class RWLock;
 class SimpleThrottle;
 
 namespace librbd {
@@ -67,16 +68,12 @@ namespace librbd {
     }
   };
 
-  const std::string id_obj_name(const std::string &name);
-  const std::string header_name(const std::string &image_id);
-  const std::string old_header_name(const std::string &image_name);
-  std::string unique_lock_name(const std::string &name, void *address);
-
   int detect_format(librados::IoCtx &io_ctx, const std::string &name,
 		    bool *old_format, uint64_t *size);
 
   bool has_parent(int64_t parent_pool_id, uint64_t off, uint64_t overlap);
 
+  std::string image_option_name(int optname);
   void image_options_create(rbd_image_options_t* opts);
   void image_options_create_ref(rbd_image_options_t* opts,
 				rbd_image_options_t orig);
@@ -110,6 +107,7 @@ namespace librbd {
   int clone(IoCtx& p_ioctx, const char *p_name, const char *p_snap_name,
 	    IoCtx& c_ioctx, const char *c_name, ImageOptions& c_opts);
   int rename(librados::IoCtx& io_ctx, const char *srcname, const char *dstname);
+  void rename_helper(ImageCtx *ictx, Context *ctx, const char *dstname);
   int info(ImageCtx *ictx, image_info_t& info, size_t image_size);
   int get_old_format(ImageCtx *ictx, uint8_t *old);
   int get_size(ImageCtx *ictx, uint64_t *size);
@@ -119,42 +117,34 @@ namespace librbd {
   int get_parent_info(ImageCtx *ictx, std::string *parent_pool_name,
 		      std::string *parent_name, std::string *parent_snap_name);
   int get_flags(ImageCtx *ictx, uint64_t *flags);
+  int set_image_notification(ImageCtx *ictx, int fd, int type);
   int is_exclusive_lock_owner(ImageCtx *ictx, bool *is_owner);
 
   int remove(librados::IoCtx& io_ctx, const char *imgname,
 	     ProgressContext& prog_ctx);
   int resize(ImageCtx *ictx, uint64_t size, ProgressContext& prog_ctx);
   int snap_create(ImageCtx *ictx, const char *snap_name);
-  int snap_create_helper(ImageCtx *ictx, Context* ctx, const char *snap_name);
+  void snap_create_helper(ImageCtx *ictx, Context* ctx, const char *snap_name);
   int snap_list(ImageCtx *ictx, std::vector<snap_info_t>& snaps);
   bool snap_exists(ImageCtx *ictx, const char *snap_name);
   int snap_rollback(ImageCtx *ictx, const char *snap_name,
 		    ProgressContext& prog_ctx);
   int snap_remove(ImageCtx *ictx, const char *snap_name);
-  int snap_remove_helper(ImageCtx *ictx, Context* ctx, const char *snap_name);
-  int snap_rename_helper(ImageCtx *ictx, Context* ctx, const uint64_t src_snap_id,
-			 const char *dst_name);
+  void snap_remove_helper(ImageCtx *ictx, Context* ctx, const char *snap_name);
   int snap_rename(ImageCtx *ictx, const char *srcname, const char *dstname);
+  void snap_rename_helper(ImageCtx *ictx, Context* ctx,
+                          const uint64_t src_snap_id, const char *dst_name);
   int snap_protect(ImageCtx *ictx, const char *snap_name);
+  void snap_protect_helper(ImageCtx *ictx, Context* ctx, const char *snap_name);
   int snap_unprotect(ImageCtx *ictx, const char *snap_name);
+  void snap_unprotect_helper(ImageCtx *ictx, Context* ctx,
+                             const char *snap_name);
   int snap_is_protected(ImageCtx *ictx, const char *snap_name,
 			bool *is_protected);
-  int add_snap(ImageCtx *ictx, const char *snap_name);
-  int rm_snap(ImageCtx *ictx, const char *snap_name, uint64_t snap_id);
-  int rename_snap(ImageCtx *ictx, uint64_t src_snap_id, const char *dst_name);
-  int refresh_parent(ImageCtx *ictx);
-  int ictx_check(ImageCtx *ictx);
-  int ictx_check(ImageCtx *ictx, const RWLock &owner_lock);
-  int ictx_refresh(ImageCtx *ictx);
   int copy(ImageCtx *ictx, IoCtx& dest_md_ctx, const char *destname,
 	   ImageOptions& opts, ProgressContext &prog_ctx);
   int copy(ImageCtx *src, ImageCtx *dest, ProgressContext &prog_ctx);
 
-  int open_parent(ImageCtx *ictx);
-  int open_image(ImageCtx *ictx);
-  int close_image(ImageCtx *ictx);
-  int close_parent(ImageCtx *ictx);
-
   int flatten(ImageCtx *ictx, ProgressContext &prog_ctx);
 
   int rebuild_object_map(ImageCtx *ictx, ProgressContext &prog_ctx);
@@ -182,10 +172,6 @@ namespace librbd {
 		  struct rbd_obj_header_ondisk *header, uint64_t *ver);
   int tmap_set(librados::IoCtx& io_ctx, const std::string& imgname);
   int tmap_rm(librados::IoCtx& io_ctx, const std::string& imgname);
-  void rollback_object(ImageCtx *ictx, uint64_t snap_id, const string& oid,
-                       SimpleThrottle& throttle);
-  int rollback_image(ImageCtx *ictx, uint64_t snap_id,
-		     ProgressContext& prog_ctx);
   void image_info(const ImageCtx *ictx, image_info_t& info, size_t info_size);
   uint64_t oid_to_object_no(const std::string& oid,
 			    const std::string& object_prefix);
@@ -203,31 +189,32 @@ namespace librbd {
   void readahead(ImageCtx *ictx,
                  const vector<pair<uint64_t,uint64_t> >& image_extents);
 
-  int async_flatten(ImageCtx *ictx, Context *ctx, ProgressContext &prog_ctx);
-  int async_resize(ImageCtx *ictx, Context *ctx, uint64_t size,
-		   ProgressContext &prog_ctx);
-  void async_resize_helper(ImageCtx *ictx, Context *ctx, uint64_t new_size,
-                           ProgressContext& prog_ctx);
-  int async_rebuild_object_map(ImageCtx *ictx, Context *ctx,
-                               ProgressContext &prog_ctx);
+  void async_flatten(ImageCtx *ictx, Context *ctx, ProgressContext &prog_ctx);
+  void async_resize(ImageCtx *ictx, Context *ctx, uint64_t size,
+                    ProgressContext &prog_ctx);
+  void async_rebuild_object_map(ImageCtx *ictx, Context *ctx,
+                                ProgressContext &prog_ctx);
 
   int flush(ImageCtx *ictx);
   int invalidate_cache(ImageCtx *ictx);
+  int poll_io_events(ImageCtx *ictx, AioCompletion **comps, int numcomp);
   int metadata_list(ImageCtx *ictx, const string &last, uint64_t max, map<string, bufferlist> *pairs);
   int metadata_get(ImageCtx *ictx, const std::string &key, std::string *value);
   int metadata_set(ImageCtx *ictx, const std::string &key, const std::string &value);
   int metadata_remove(ImageCtx *ictx, const std::string &key);
 
-  AioCompletion *aio_create_completion();
-  AioCompletion *aio_create_completion(void *cb_arg, callback_t cb_complete);
-  AioCompletion *aio_create_completion_internal(void *cb_arg,
-						callback_t cb_complete);
+  int mirror_is_enabled(IoCtx& io_ctx, bool *enabled);
+  int mirror_set_enabled(IoCtx& io_ctx, bool enabled);
+  int mirror_peer_add(IoCtx& io_ctx, const std::string &cluster_uuid,
+                      const std::string &cluster_name,
+                      const std::string &client_name);
+  int mirror_peer_remove(IoCtx& io_ctx, const std::string &cluster_uuid);
+  int mirror_peer_list(IoCtx& io_ctx, std::vector<mirror_peer_t> *peers);
+  int mirror_peer_set_client(IoCtx& io_ctx, const std::string &cluster_uuid,
+                             const std::string &client_name);
+  int mirror_peer_set_cluster(IoCtx& io_ctx, const std::string &cluster_uuid,
+                              const std::string &cluster_name);
 
-  // raw callbacks
-  void rados_req_cb(rados_completion_t cb, void *arg);
-  void rados_ctx_cb(rados_completion_t cb, void *arg);
-  void rbd_req_cb(completion_t cb, void *arg);
-  void rbd_ctx_cb(completion_t cb, void *arg);
 }
 
 #endif
diff --git a/src/librbd/librbd.cc b/src/librbd/librbd.cc
index e322ad5..d4edc22 100644
--- a/src/librbd/librbd.cc
+++ b/src/librbd/librbd.cc
@@ -22,13 +22,13 @@
 #include "common/perf_counters.h"
 #include "common/TracepointProvider.h"
 #include "include/Context.h"
-#include "include/rbd/librbd.hpp"
 #include "osdc/ObjectCacher.h"
 
 #include "librbd/AioCompletion.h"
 #include "librbd/AioImageRequestWQ.h"
 #include "cls/rbd/cls_rbd_client.h"
 #include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
 #include "librbd/internal.h"
 #include "librbd/LibrbdWriteback.h"
 
@@ -121,12 +121,13 @@ namespace librbd {
     tracepoint(librbd, open_image_enter, ictx, ictx->name.c_str(), ictx->id.c_str(), ictx->snap_name.c_str(), ictx->read_only);
 
     if (image.ctx != NULL) {
-      close_image(reinterpret_cast<ImageCtx*>(image.ctx));
+      reinterpret_cast<ImageCtx*>(image.ctx)->state->close();
       image.ctx = NULL;
     }
 
-    int r = librbd::open_image(ictx);
+    int r = ictx->state->open();
     if (r < 0) {
+      delete ictx;
       tracepoint(librbd, open_image_exit, r);
       return r;
     }
@@ -144,12 +145,13 @@ namespace librbd {
     tracepoint(librbd, open_image_enter, ictx, ictx->name.c_str(), ictx->id.c_str(), ictx->snap_name.c_str(), ictx->read_only);
 
     if (image.ctx != NULL) {
-      close_image(reinterpret_cast<ImageCtx*>(image.ctx));
+      reinterpret_cast<ImageCtx*>(image.ctx)->state->close();
       image.ctx = NULL;
     }
 
-    int r = librbd::open_image(ictx);
+    int r = ictx->state->open();
     if (r < 0) {
+      delete ictx;
       tracepoint(librbd, open_image_exit, r);
       return r;
     }
@@ -278,12 +280,45 @@ namespace librbd {
     return r;
   }
 
+  int RBD::mirror_is_enabled(IoCtx& io_ctx, bool *enabled) {
+    return librbd::mirror_is_enabled(io_ctx, enabled);
+  }
+
+  int RBD::mirror_set_enabled(IoCtx& io_ctx, bool enabled) {
+    return librbd::mirror_set_enabled(io_ctx, enabled);
+  }
+
+  int RBD::mirror_peer_add(IoCtx& io_ctx, const std::string &cluster_uuid,
+                           const std::string &cluster_name,
+                           const std::string &client_name) {
+    return librbd::mirror_peer_add(io_ctx, cluster_uuid, cluster_name,
+                                   client_name);
+  }
+
+  int RBD::mirror_peer_remove(IoCtx& io_ctx, const std::string &cluster_uuid) {
+    return librbd::mirror_peer_remove(io_ctx, cluster_uuid);
+  }
+
+  int RBD::mirror_peer_list(IoCtx& io_ctx, std::vector<mirror_peer_t> *peers) {
+    return librbd::mirror_peer_list(io_ctx, peers);
+  }
+
+  int RBD::mirror_peer_set_client(IoCtx& io_ctx,
+                                  const std::string &cluster_uuid,
+                                  const std::string &client_name) {
+    return librbd::mirror_peer_set_client(io_ctx, cluster_uuid, client_name);
+  }
+
+  int RBD::mirror_peer_set_cluster(IoCtx& io_ctx,
+                                   const std::string &cluster_uuid,
+                                   const std::string &cluster_name) {
+    return librbd::mirror_peer_set_cluster(io_ctx, cluster_uuid, cluster_name);
+  }
+
   RBD::AioCompletion::AioCompletion(void *cb_arg, callback_t complete_cb)
   {
-    librbd::AioCompletion *c = librbd::aio_create_completion(cb_arg,
-							     complete_cb);
-    pc = (void *)c;
-    c->rbd_comp = this;
+    pc = reinterpret_cast<void*>(librbd::AioCompletion::create(
+      cb_arg, complete_cb, this));
   }
 
   bool RBD::AioCompletion::is_complete()
@@ -304,6 +339,12 @@ namespace librbd {
     return c->get_return_value();
   }
 
+  void *RBD::AioCompletion::get_arg()
+  {
+    librbd::AioCompletion *c = (librbd::AioCompletion *)pc;
+    return c->get_arg();
+  }
+
   void RBD::AioCompletion::release()
   {
     librbd::AioCompletion *c = (librbd::AioCompletion *)pc;
@@ -384,8 +425,10 @@ namespace librbd {
     if (ctx) {
       ImageCtx *ictx = (ImageCtx *)ctx;
       tracepoint(librbd, close_image_enter, ictx, ictx->name.c_str(), ictx->id.c_str());
-      r = close_image(ictx);
+
+      r = ictx->state->close();
       ctx = NULL;
+
       tracepoint(librbd, close_image_exit, r);
     }
     return r;
@@ -502,6 +545,15 @@ namespace librbd {
     return r;
   }
 
+  int Image::set_image_notification(int fd, int type)
+  {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    tracepoint(librbd, set_image_notification_enter, ictx, fd, type);
+    int r = librbd::set_image_notification(ictx, fd, type);
+    tracepoint(librbd, set_image_notification_exit, ictx, r);
+    return r;
+  }
+
   int Image::is_exclusive_lock_owner(bool *is_owner)
   {
     ImageCtx *ictx = (ImageCtx *)ctx;
@@ -831,7 +883,7 @@ namespace librbd {
     ImageCtx *ictx = (ImageCtx *)ctx;
     tracepoint(librbd, diff_iterate_enter, ictx, ictx->name.c_str(),
                ictx->snap_name.c_str(), ictx->read_only, fromsnapname, ofs, len,
-               true, true);
+               true, false);
     int r = librbd::diff_iterate(ictx, fromsnapname, ofs, len, true, false, cb,
                                  arg);
     tracepoint(librbd, diff_iterate_exit, r);
@@ -982,6 +1034,20 @@ namespace librbd {
     return r;
   }
 
+  int Image::poll_io_events(RBD::AioCompletion **comps, int numcomp)
+  {
+    AioCompletion *cs[numcomp];
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    tracepoint(librbd, poll_io_events_enter, ictx, numcomp);
+    int r = librbd::poll_io_events(ictx, cs, numcomp);
+    tracepoint(librbd, poll_io_events_exit, r);
+    // Copy out r entries only, exactly as rbd_poll_io_events() does: cs[] is
+    // an uninitialized local, so walking all numcomp slots is unsafe.
+    for (int i = 0; i < r; ++i)
+      comps[i] = (RBD::AioCompletion *)cs[i]->rbd_comp;
+    return r;
+  }
+
   int Image::metadata_get(const std::string &key, std::string *value)
   {
     ImageCtx *ictx = (ImageCtx *)ctx;
@@ -1103,6 +1169,91 @@ extern "C" int rbd_image_options_is_empty(rbd_image_options_t opts)
   return librbd::image_options_is_empty(opts);
 }
 
+/* pool mirroring */
+extern "C" int rbd_mirror_is_enabled(rados_ioctx_t p, bool *enabled) {
+  librados::IoCtx io_ctx;
+  librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+  int r = librbd::mirror_is_enabled(io_ctx, enabled);
+  if (r < 0) {
+    return r;
+  }
+  return 0;
+}
+
+extern "C" int rbd_mirror_set_enabled(rados_ioctx_t p, bool enabled) {
+  librados::IoCtx io_ctx;
+  librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+  return librbd::mirror_set_enabled(io_ctx, enabled);
+}
+
+extern "C" int rbd_mirror_peer_add(rados_ioctx_t p,
+                                   const char *cluster_uuid,
+                                   const char *cluster_name,
+                                   const char *client_name) {
+  librados::IoCtx io_ctx;
+  librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+  return librbd::mirror_peer_add(io_ctx, cluster_uuid, cluster_name,
+                                 client_name);
+}
+
+extern "C" int rbd_mirror_peer_remove(rados_ioctx_t p,
+                                      const char *cluster_uuid) {
+  // the value is forwarded as librbd::mirror_peer_remove()'s cluster_uuid
+  librados::IoCtx io_ctx;
+  librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+  return librbd::mirror_peer_remove(io_ctx, cluster_uuid);
+}
+
+extern "C" int rbd_mirror_peer_list(rados_ioctx_t p,
+                                    rbd_mirror_peer_t *peers, int *max_peers) {
+  librados::IoCtx io_ctx;
+  librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+
+  std::vector<librbd::mirror_peer_t> peer_vector;
+  int r = librbd::mirror_peer_list(io_ctx, &peer_vector);
+  if (r < 0) {
+    return r;
+  }
+
+  if (*max_peers < static_cast<int>(peer_vector.size())) {
+    *max_peers = static_cast<int>(peer_vector.size());
+    return -ERANGE;
+  }
+
+  for (int i = 0; i < static_cast<int>(peer_vector.size()); ++i) {
+    peers[i].cluster_uuid = strdup(peer_vector[i].cluster_uuid.c_str());
+    peers[i].cluster_name = strdup(peer_vector[i].cluster_name.c_str());
+    peers[i].client_name = strdup(peer_vector[i].client_name.c_str());
+  }
+  *max_peers = static_cast<int>(peer_vector.size());
+  return 0;
+}
+
+extern "C" void rbd_mirror_peer_list_cleanup(rbd_mirror_peer_t *peers,
+                                             int max_peers) {
+  for (int i = 0; i < max_peers; ++i) {
+    free(peers[i].cluster_uuid);
+    free(peers[i].cluster_name);
+    free(peers[i].client_name);
+  }
+}
+
+extern "C" int rbd_mirror_peer_set_client(rados_ioctx_t p,
+                                          const char *cluster_uuid,
+                                          const char *client_name) {
+  librados::IoCtx io_ctx;
+  librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+  return librbd::mirror_peer_set_client(io_ctx, cluster_uuid, client_name);
+}
+
+extern "C" int rbd_mirror_peer_set_cluster(rados_ioctx_t p,
+                                           const char *cluster_uuid,
+                                           const char *cluster_name) {
+  librados::IoCtx io_ctx;
+  librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+  return librbd::mirror_peer_set_cluster(io_ctx, cluster_uuid, cluster_name);
+}
+
 /* images */
 extern "C" int rbd_list(rados_ioctx_t p, char *names, size_t *size)
 {
@@ -1393,9 +1544,13 @@ extern "C" int rbd_open(rados_ioctx_t p, const char *name, rbd_image_t *image,
   librbd::ImageCtx *ictx = new librbd::ImageCtx(name, "", snap_name, io_ctx,
 						false);
   tracepoint(librbd, open_image_enter, ictx, ictx->name.c_str(), ictx->id.c_str(), ictx->snap_name.c_str(), ictx->read_only);
-  int r = librbd::open_image(ictx);
-  if (r >= 0)
+
+  int r = ictx->state->open();
+  if (r < 0) {
+    delete ictx;
+  } else {
     *image = (rbd_image_t)ictx;
+  }
   tracepoint(librbd, open_image_exit, r);
   return r;
 }
@@ -1409,18 +1564,24 @@ extern "C" int rbd_open_read_only(rados_ioctx_t p, const char *name,
   librbd::ImageCtx *ictx = new librbd::ImageCtx(name, "", snap_name, io_ctx,
 						true);
   tracepoint(librbd, open_image_enter, ictx, ictx->name.c_str(), ictx->id.c_str(), ictx->snap_name.c_str(), ictx->read_only);
-  int r = librbd::open_image(ictx);
-  if (r >= 0)
+
+  int r = ictx->state->open();
+  if (r < 0) {
+    delete ictx;
+  } else {
     *image = (rbd_image_t)ictx;
+  }
   tracepoint(librbd, open_image_exit, r);
   return r;
 }
 
 extern "C" int rbd_close(rbd_image_t image)
 {
-  librbd::ImageCtx *ctx = (librbd::ImageCtx *)image;
-  tracepoint(librbd, close_image_enter, ctx, ctx->name.c_str(), ctx->id.c_str());
-  int r = librbd::close_image(ctx);
+  librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+  tracepoint(librbd, close_image_enter, ictx, ictx->name.c_str(), ictx->id.c_str());
+
+  int r = ictx->state->close();
+
   tracepoint(librbd, close_image_exit, r);
   return r;
 }
@@ -1573,6 +1734,15 @@ extern "C" int rbd_get_flags(rbd_image_t image, uint64_t *flags)
   return r;
 }
 
+extern "C" int rbd_set_image_notification(rbd_image_t image, int fd, int type)
+{
+  librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+  tracepoint(librbd, set_image_notification_enter, ictx, fd, type);
+  int r = librbd::set_image_notification(ictx, fd, type);
+  tracepoint(librbd, set_image_notification_exit, ictx, r);
+  return r;
+}
+
 extern "C" int rbd_is_exclusive_lock_owner(rbd_image_t image, int *is_owner)
 {
   librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
@@ -1959,7 +2129,7 @@ extern "C" int rbd_diff_iterate(rbd_image_t image,
   librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
   tracepoint(librbd, diff_iterate_enter, ictx, ictx->name.c_str(),
              ictx->snap_name.c_str(), ictx->read_only, fromsnapname, ofs, len,
-             true, true);
+             true, false);
   int r = librbd::diff_iterate(ictx, fromsnapname, ofs, len, true, false, cb,
                                arg);
   tracepoint(librbd, diff_iterate_exit, r);
@@ -2112,6 +2282,20 @@ extern "C" int rbd_invalidate_cache(rbd_image_t image)
   return r;
 }
 
+extern "C" int rbd_poll_io_events(rbd_image_t image, rbd_completion_t *comps, int numcomp)
+{
+  librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+  librbd::AioCompletion *cs[numcomp];
+  tracepoint(librbd, poll_io_events_enter, ictx, numcomp);
+  int r = librbd::poll_io_events(ictx, cs, numcomp);
+  tracepoint(librbd, poll_io_events_exit, r);
+  if (r > 0) {
+    for (int i = 0; i < r; ++i)
+      comps[i] = cs[i]->rbd_comp;
+  }
+  return r;
+}
+
 extern "C" int rbd_metadata_get(rbd_image_t image, const char *key, char *value, size_t *vallen)
 {
   librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
@@ -2206,6 +2390,12 @@ extern "C" ssize_t rbd_aio_get_return_value(rbd_completion_t c)
   return comp->get_return_value();
 }
 
+extern "C" void *rbd_aio_get_arg(rbd_completion_t c)
+{
+  librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c;
+  return comp->get_arg();
+}
+
 extern "C" void rbd_aio_release(rbd_completion_t c)
 {
   librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c;
diff --git a/src/librbd/object_map/InvalidateRequest.cc b/src/librbd/object_map/InvalidateRequest.cc
new file mode 100644
index 0000000..3566fdd
--- /dev/null
+++ b/src/librbd/object_map/InvalidateRequest.cc
@@ -0,0 +1,88 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/object_map/InvalidateRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageWatcher.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::object_map::InvalidateRequest: "
+
+namespace librbd {
+namespace object_map {
+
+template <typename I>
+InvalidateRequest<I>* InvalidateRequest<I>::create(I &image_ctx,
+                                                   uint64_t snap_id, bool force,
+                                                   Context *on_finish) {
+  return new InvalidateRequest<I>(image_ctx, snap_id, force, on_finish);
+}
+
+template <typename I>
+void InvalidateRequest<I>::send() {
+  I &image_ctx = this->m_image_ctx;
+  assert(image_ctx.owner_lock.is_locked());
+  assert(image_ctx.snap_lock.is_wlocked());
+
+  uint64_t snap_flags;
+  int r = image_ctx.get_flags(m_snap_id, &snap_flags);
+  if (r < 0 || ((snap_flags & RBD_FLAG_OBJECT_MAP_INVALID) != 0)) {
+    this->async_complete(r);
+    return;
+  }
+
+  CephContext *cct = image_ctx.cct;
+  lderr(cct) << this << " invalidating object map in-memory" << dendl;
+
+  // update in-memory flags
+  uint64_t flags = RBD_FLAG_OBJECT_MAP_INVALID;
+  if ((image_ctx.features & RBD_FEATURE_FAST_DIFF) != 0) {
+    flags |= RBD_FLAG_FAST_DIFF_INVALID;
+  }
+  r = image_ctx.update_flags(m_snap_id, flags, true);
+  if (r < 0) {
+    this->async_complete(r);
+    return;  // stop here: falling through would signal completion again
+  }
+
+  // do not update on-disk flags if not image owner
+  if (image_ctx.image_watcher == nullptr ||
+      (!m_force && m_snap_id == CEPH_NOSNAP &&
+       image_ctx.exclusive_lock != nullptr &&
+       !image_ctx.exclusive_lock->is_lock_owner())) {
+    this->async_complete(0);
+    return;
+  }
+
+  lderr(cct) << this << " invalidating object map on-disk" << dendl;
+  librados::ObjectWriteOperation op;
+  if (image_ctx.exclusive_lock != nullptr &&
+      m_snap_id == CEPH_NOSNAP && !m_force) {
+    image_ctx.exclusive_lock->assert_header_locked(&op);
+  }
+  cls_client::set_flags(&op, m_snap_id, flags, flags);
+
+  librados::AioCompletion *rados_completion =
+    this->create_callback_completion();
+  r = image_ctx.md_ctx.aio_operate(image_ctx.header_oid, rados_completion,
+                                     &op);
+  assert(r == 0);
+  rados_completion->release();
+}
+
+template <typename I>
+bool InvalidateRequest<I>::should_complete(int r) {
+  I &image_ctx = this->m_image_ctx;
+  CephContext *cct = image_ctx.cct;
+  lderr(cct) << this << " " << __func__ << ": r=" << r << dendl;
+  return true;
+}
+
+} // namespace object_map
+} // namespace librbd
+
+template class librbd::object_map::InvalidateRequest<librbd::ImageCtx>;
diff --git a/src/librbd/object_map/InvalidateRequest.h b/src/librbd/object_map/InvalidateRequest.h
new file mode 100644
index 0000000..b051379
--- /dev/null
+++ b/src/librbd/object_map/InvalidateRequest.h
@@ -0,0 +1,49 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_OBJECT_MAP_INVALIDATE_REQUEST_H
+#define CEPH_LIBRBD_OBJECT_MAP_INVALIDATE_REQUEST_H
+
+#include "include/int_types.h"
+#include "librbd/AsyncRequest.h"
+
+class Context;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace object_map {
+
+template <typename ImageCtxT = ImageCtx>
+class InvalidateRequest : public AsyncRequest<ImageCtxT> {
+public:
+  static InvalidateRequest* create(ImageCtxT &image_ctx, uint64_t snap_id,
+                                   bool force, Context *on_finish);
+
+  InvalidateRequest(ImageCtxT &image_ctx, uint64_t snap_id, bool force,
+                    Context *on_finish)
+    : AsyncRequest<ImageCtxT>(image_ctx, on_finish),
+      m_snap_id(snap_id), m_force(force) {
+  }
+
+  virtual void send();
+
+protected:
+  virtual bool should_complete(int r) override;
+  virtual int filter_return_code(int r) const override{
+    // never propagate an error back to the caller
+    return 0;
+  }
+
+private:
+  uint64_t m_snap_id;
+  bool m_force;
+};
+
+} // namespace object_map
+} // namespace librbd
+
+extern template class librbd::object_map::InvalidateRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_OBJECT_MAP_INVALIDATE_REQUEST_H
diff --git a/src/librbd/object_map/LockRequest.cc b/src/librbd/object_map/LockRequest.cc
new file mode 100644
index 0000000..3af5073
--- /dev/null
+++ b/src/librbd/object_map/LockRequest.cc
@@ -0,0 +1,154 @@
+// -*- mode:C; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/object_map/LockRequest.h"
+#include "cls/lock/cls_lock_client.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ObjectMap.h"
+#include "librbd/Utils.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::object_map::LockRequest: "
+
+namespace librbd {
+namespace object_map {
+
+using util::create_rados_ack_callback;
+using util::create_rados_safe_callback;
+
+template <typename I>
+LockRequest<I>::LockRequest(I &image_ctx, Context *on_finish)
+  : m_image_ctx(image_ctx), m_on_finish(on_finish), m_broke_lock(false) {
+}
+
+template <typename I>
+void LockRequest<I>::send() {
+  send_lock();
+}
+
+template <typename I>
+void LockRequest<I>::send_lock() {
+  CephContext *cct = m_image_ctx.cct;
+  std::string oid(ObjectMap::object_map_name(m_image_ctx.id, CEPH_NOSNAP));
+  ldout(cct, 10) << this << " " << __func__ << ": oid=" << oid << dendl;
+
+  librados::ObjectWriteOperation op;
+  rados::cls::lock::lock(&op, RBD_LOCK_NAME, LOCK_EXCLUSIVE, "", "", "",
+                           utime_t(), 0);
+
+  using klass = LockRequest<I>;
+  librados::AioCompletion *rados_completion =
+    create_rados_safe_callback<klass, &klass::handle_lock>(this);
+  int r = m_image_ctx.md_ctx.aio_operate(oid, rados_completion, &op);
+  assert(r == 0);
+  rados_completion->release();
+}
+
+template <typename I>
+Context *LockRequest<I>::handle_lock(int *ret_val) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 10) << this << " " << __func__ << ": r=" << *ret_val << dendl;
+
+  if (*ret_val == 0) {
+    return m_on_finish;
+  } else if (m_broke_lock || *ret_val != -EBUSY) {
+    lderr(cct) << "failed to lock object map: " << cpp_strerror(*ret_val)
+               << dendl;
+    *ret_val = 0;
+    return m_on_finish;
+  }
+
+  send_get_lock_info();
+  return nullptr;
+}
+
+template <typename I>
+void LockRequest<I>::send_get_lock_info() {
+  CephContext *cct = m_image_ctx.cct;
+  std::string oid(ObjectMap::object_map_name(m_image_ctx.id, CEPH_NOSNAP));
+  ldout(cct, 10) << this << " " << __func__ << ": oid=" << oid << dendl;
+
+  librados::ObjectReadOperation op;
+  rados::cls::lock::get_lock_info_start(&op, RBD_LOCK_NAME);
+
+  using klass = LockRequest<I>;
+  librados::AioCompletion *rados_completion =
+    create_rados_ack_callback<klass, &klass::handle_get_lock_info>(this);
+  int r = m_image_ctx.md_ctx.aio_operate(oid, rados_completion, &op, &m_out_bl);
+  assert(r == 0);
+  rados_completion->release();
+}
+
+template <typename I>
+Context *LockRequest<I>::handle_get_lock_info(int *ret_val) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 10) << this << " " << __func__ << ": r=" << *ret_val << dendl;
+
+  if (*ret_val == -ENOENT) {
+    send_lock();
+    return nullptr;
+  }
+
+  ClsLockType lock_type;
+  std::string lock_tag;
+  if (*ret_val == 0) {
+    bufferlist::iterator it = m_out_bl.begin();
+    *ret_val = rados::cls::lock::get_lock_info_finish(&it, &m_lockers,
+                                                      &lock_type, &lock_tag);
+  }
+  if (*ret_val < 0) {
+    lderr(cct) << "failed to list object map locks: " << cpp_strerror(*ret_val)
+               << dendl;
+    *ret_val = 0;
+    return m_on_finish;
+  }
+
+  send_break_locks();
+  return nullptr;
+}
+
+template <typename I>
+void LockRequest<I>::send_break_locks() {
+  CephContext *cct = m_image_ctx.cct;
+  std::string oid(ObjectMap::object_map_name(m_image_ctx.id, CEPH_NOSNAP));
+  ldout(cct, 10) << this << " " << __func__ << ": oid=" << oid << ", "
+                 << "num_lockers=" << m_lockers.size() << dendl;
+
+  librados::ObjectWriteOperation op;
+  for (auto &locker : m_lockers) {
+    rados::cls::lock::break_lock(&op, RBD_LOCK_NAME, locker.first.cookie,
+                                 locker.first.locker);
+  }
+
+  using klass = LockRequest<I>;
+  librados::AioCompletion *rados_completion =
+    create_rados_safe_callback<klass, &klass::handle_break_locks>(this);
+  int r = m_image_ctx.md_ctx.aio_operate(oid, rados_completion, &op);
+  assert(r == 0);
+  rados_completion->release();
+}
+
+template <typename I>
+Context *LockRequest<I>::handle_break_locks(int *ret_val) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 10) << this << " " << __func__ << ": r=" << *ret_val << dendl;
+
+  m_broke_lock = true;
+  if (*ret_val == 0 || *ret_val == -ENOENT) {
+    send_lock();
+    return nullptr;
+  }
+
+  lderr(cct) << "failed to break object map lock: " << cpp_strerror(*ret_val)
+             << dendl;
+  *ret_val = 0;
+  return m_on_finish;
+}
+
+} // namespace object_map
+} // namespace librbd
+
+template class librbd::object_map::LockRequest<librbd::ImageCtx>;
diff --git a/src/librbd/object_map/LockRequest.h b/src/librbd/object_map/LockRequest.h
new file mode 100644
index 0000000..8f1ee6c
--- /dev/null
+++ b/src/librbd/object_map/LockRequest.h
@@ -0,0 +1,72 @@
+// -*- mode:C; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_OBJECT_MAP_LOCK_REQUEST_H
+#define CEPH_LIBRBD_OBJECT_MAP_LOCK_REQUEST_H
+
+#include "include/int_types.h"
+#include "include/buffer.h"
+#include "include/Context.h"
+#include "cls/lock/cls_lock_types.h"
+#include <map>
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace object_map {
+
+template <typename ImageCtxT = ImageCtx>
+class LockRequest {
+public:
+  LockRequest(ImageCtxT &image_ctx, Context *on_finish);
+
+  void send();
+
+private:
+  /**
+   * @verbatim
+   *
+   * <start>         /------------------------------------- BREAK_LOCKS * * *
+   *    |            |                                        ^             *
+   *    |            |                                        |             *
+   *    |            |                                        |             *
+   *    |            v   (EBUSY && !broke_lock)               |             *
+   *    \---------> LOCK_OBJECT_MAP * * * * * * * * * * * > GET_LOCK_INFO * *
+   *                 |  *       ^                             *             *
+   *                 |  *       *                             *             *
+   *                 |  *       *  (ENOENT)                   *             *
+   *                 |  *       * * * * * * * * * * * * * * * *             *
+   *                 |  *                                                   *
+   *                 |  * (other errors)                                    *
+   *                 |  *                                                   *
+   *                 v  v                         (other errors)            *
+   *               <finish> < * * * * * * * * * * * * * * * * * * * * * * * *
+   *
+   * @endverbatim
+   */
+
+  ImageCtxT &m_image_ctx;
+  Context *m_on_finish;
+
+  bool m_broke_lock;
+  std::map<rados::cls::lock::locker_id_t,
+           rados::cls::lock::locker_info_t> m_lockers;
+  bufferlist m_out_bl;
+
+  void send_lock();
+  Context *handle_lock(int *ret_val);
+
+  void send_get_lock_info();
+  Context *handle_get_lock_info(int *ret_val);
+
+  void send_break_locks();
+  Context *handle_break_locks(int *ret_val);
+};
+
+} // namespace object_map
+} // namespace librbd
+
+extern template class librbd::object_map::LockRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_OBJECT_MAP_LOCK_REQUEST_H
diff --git a/src/librbd/object_map/RefreshRequest.cc b/src/librbd/object_map/RefreshRequest.cc
new file mode 100644
index 0000000..84acba3
--- /dev/null
+++ b/src/librbd/object_map/RefreshRequest.cc
@@ -0,0 +1,224 @@
+// -*- mode:C; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/object_map/RefreshRequest.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "cls/lock/cls_lock_client.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ObjectMap.h"
+#include "librbd/object_map/InvalidateRequest.h"
+#include "librbd/object_map/ResizeRequest.h"
+#include "librbd/Utils.h"
+#include "osdc/Striper.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::object_map::RefreshRequest: "
+
+namespace librbd {
+
+using util::create_context_callback;
+using util::create_rados_ack_callback;
+using util::create_rados_safe_callback;
+
+namespace object_map {
+
+template <typename I>
+RefreshRequest<I>::RefreshRequest(I &image_ctx, ceph::BitVector<2> *object_map,
+                                  uint64_t snap_id, Context *on_finish)
+  : m_image_ctx(image_ctx), m_object_map(object_map), m_snap_id(snap_id),
+    m_on_finish(on_finish), m_object_count(0),  // count is computed in send()
+    m_truncate_on_disk_object_map(false) {  // set only if load yields -EINVAL
+}
+
+template <typename I>
+void RefreshRequest<I>::send() {
+  {  // scope the shared snap_lock to the object count calculation
+    RWLock::RLocker snap_locker(m_image_ctx.snap_lock);
+    m_object_count = Striper::get_num_objects(
+      m_image_ctx.layout, m_image_ctx.get_image_size(m_snap_id));
+  }
+  // lock released above: start the state machine with the LOAD state
+  send_load();
+}
+
+template <typename I>
+void RefreshRequest<I>::apply() {
+  // publish the loaded/rebuilt map; it must cover every object of the image
+  uint64_t expected_objs = 0;
+  {
+    RWLock::RLocker snap_locker(m_image_ctx.snap_lock);
+    uint64_t image_size = m_image_ctx.get_image_size(m_snap_id);
+    expected_objs = Striper::get_num_objects(m_image_ctx.layout, image_size);
+  }
+  assert(expected_objs <= m_on_disk_object_map.size());
+  *m_object_map = m_on_disk_object_map;
+}
+
+template <typename I>
+void RefreshRequest<I>::send_load() {
+  CephContext *cct = m_image_ctx.cct;
+  std::string oid(ObjectMap::object_map_name(m_image_ctx.id, m_snap_id));
+  ldout(cct, 10) << this << " " << __func__ << ": oid=" << oid << dendl;
+  // LOAD state: issue an async read of the object map object for m_snap_id
+  librados::ObjectReadOperation op;
+  cls_client::object_map_load_start(&op);
+  // the reply is collected in m_out_bl and decoded by handle_load()
+  using klass = RefreshRequest<I>;
+  m_out_bl.clear();
+  librados::AioCompletion *rados_completion =
+    create_rados_ack_callback<klass, &klass::handle_load>(this);
+  int r = m_image_ctx.md_ctx.aio_operate(oid, rados_completion, &op, &m_out_bl);
+  assert(r == 0);
+  rados_completion->release();
+}
+
+template <typename I>
+Context *RefreshRequest<I>::handle_load(int *ret_val) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 10) << this << " " << __func__ << ": r=" << *ret_val << dendl;
+  // a successful read still has to decode into m_on_disk_object_map
+  if (*ret_val == 0) {
+    bufferlist::iterator bl_it = m_out_bl.begin();
+    *ret_val = cls_client::object_map_load_finish(&bl_it,
+                                                  &m_on_disk_object_map);
+  }
+  // returns nullptr after starting a follow-up state, m_on_finish when done
+  std::string oid(ObjectMap::object_map_name(m_image_ctx.id, m_snap_id));
+  if (*ret_val == -EINVAL) {
+    // object map is corrupt on-disk -- clear it and properly size it
+    // so future IO can keep the object map in sync
+    lderr(cct) << "object map corrupt on-disk: " << oid << dendl;
+    m_truncate_on_disk_object_map = true;
+    send_resize_invalidate();
+    return nullptr;
+  } else if (*ret_val < 0) {
+    lderr(cct) << "failed to load object map: " << oid << dendl;
+    send_invalidate();
+    return nullptr;
+  }
+  // a map shorter than the object count takes the invalidate + resize path
+  if (m_on_disk_object_map.size() < m_object_count) {
+    lderr(cct) << "object map smaller than current object count: "
+               << m_on_disk_object_map.size() << " != "
+               << m_object_count << dendl;
+    send_resize_invalidate();
+    return nullptr;
+  }
+
+  ldout(cct, 20) << "refreshed object map: num_objs="
+                 << m_on_disk_object_map.size() << dendl;
+  if (m_on_disk_object_map.size() > m_object_count) {
+    // resize op might have been interrupted
+    ldout(cct, 1) << "object map larger than current object count: "
+                  << m_on_disk_object_map.size() << " != "
+                  << m_object_count << dendl;
+  }
+  // a larger map is only logged: publish the loaded map and complete
+  apply();
+  return m_on_finish;
+}
+
+template <typename I>
+void RefreshRequest<I>::send_invalidate() {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 10) << this << " " << __func__ << dendl;
+  // INVALIDATE state: drop what was loaded and mark every object as existing
+  m_on_disk_object_map.clear();
+  object_map::ResizeRequest::resize(&m_on_disk_object_map, m_object_count,
+                                    OBJECT_EXISTS);
+  // the completion of the invalidate request is routed to handle_invalidate()
+  using klass = RefreshRequest<I>;
+  Context *ctx = create_context_callback<
+    klass, &klass::handle_invalidate>(this);
+  InvalidateRequest<I> *req = InvalidateRequest<I>::create(
+    m_image_ctx, m_snap_id, false, ctx);
+  // NOTE(review): both locks are presumably required by req->send() -- confirm
+  RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
+  RWLock::WLocker snap_locker(m_image_ctx.snap_lock);
+  req->send();
+}
+
+template <typename I>
+Context *RefreshRequest<I>::handle_invalidate(int *ret_val) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 10) << this << " " << __func__ << ": r=" << *ret_val << dendl;
+  // invalidation is asserted to succeed; publish the rebuilt map and finish
+  assert(*ret_val == 0);
+  apply();
+  return m_on_finish;
+}
+
+template <typename I>
+void RefreshRequest<I>::send_resize_invalidate() {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 10) << this << " " << __func__ << dendl;
+  // same rebuild as send_invalidate(), but followed by an on-disk resize
+  m_on_disk_object_map.clear();
+  object_map::ResizeRequest::resize(&m_on_disk_object_map, m_object_count,
+                                    OBJECT_EXISTS);
+  // the completion is routed to handle_resize_invalidate() -> send_resize()
+  using klass = RefreshRequest<I>;
+  Context *ctx = create_context_callback<
+    klass, &klass::handle_resize_invalidate>(this);
+  InvalidateRequest<I> *req = InvalidateRequest<I>::create(
+    m_image_ctx, m_snap_id, false, ctx);
+  // NOTE(review): both locks are presumably required by req->send() -- confirm
+  RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
+  RWLock::WLocker snap_locker(m_image_ctx.snap_lock);
+  req->send();
+}
+
+template <typename I>
+Context *RefreshRequest<I>::handle_resize_invalidate(int *ret_val) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 10) << this << " " << __func__ << ": r=" << *ret_val << dendl;
+  // invalidation is asserted to succeed; continue with the on-disk resize
+  assert(*ret_val == 0);
+  send_resize();
+  return nullptr;
+}
+
+template <typename I>
+void RefreshRequest<I>::send_resize() {
+  CephContext *cct = m_image_ctx.cct;
+  std::string oid(ObjectMap::object_map_name(m_image_ctx.id, m_snap_id));
+  ldout(cct, 10) << this << " " << __func__ << ": oid=" << oid << dendl;
+  // RESIZE state: bring the on-disk object map to m_object_count entries
+  librados::ObjectWriteOperation op;
+  if (m_snap_id == CEPH_NOSNAP) {  // lock assertion only for CEPH_NOSNAP
+    rados::cls::lock::assert_locked(&op, RBD_LOCK_NAME, LOCK_EXCLUSIVE, "", "");
+  }
+  if (m_truncate_on_disk_object_map) {  // set when the load returned -EINVAL
+    op.truncate(0);
+  }
+  cls_client::object_map_resize(&op, m_object_count, OBJECT_NONEXISTENT);
+  // NOTE(review): disk default is NONEXISTENT, the memory copy used EXISTS
+  using klass = RefreshRequest<I>;
+  librados::AioCompletion *rados_completion =
+    create_rados_safe_callback<klass, &klass::handle_resize>(this);
+  int r = m_image_ctx.md_ctx.aio_operate(oid, rados_completion, &op);
+  assert(r == 0);
+  rados_completion->release();
+}
+
+template <typename I>
+Context *RefreshRequest<I>::handle_resize(int *ret_val) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 10) << this << " " << __func__ << ": r=" << *ret_val << dendl;
+  // a failed on-disk resize is logged but not reported to the caller
+  if (*ret_val < 0) {
+    lderr(cct) << "failed to adjust object map size: " << cpp_strerror(*ret_val)
+               << dendl;
+    *ret_val = 0;  // swallow the error before completing
+  }
+  apply();
+  return m_on_finish;
+}
+
+} // namespace object_map
+} // namespace librbd
+
+template class librbd::object_map::RefreshRequest<librbd::ImageCtx>;
diff --git a/src/librbd/object_map/RefreshRequest.h b/src/librbd/object_map/RefreshRequest.h
new file mode 100644
index 0000000..17a69a0
--- /dev/null
+++ b/src/librbd/object_map/RefreshRequest.h
@@ -0,0 +1,76 @@
+// -*- mode:C; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_OBJECT_MAP_REFRESH_REQUEST_H
+#define CEPH_LIBRBD_OBJECT_MAP_REFRESH_REQUEST_H
+
+#include "include/int_types.h"
+#include "include/buffer.h"
+#include "include/Context.h"
+#include "common/bit_vector.hpp"
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace object_map {
+
+template <typename ImageCtxT = ImageCtx>
+class RefreshRequest {
+public:
+  RefreshRequest(ImageCtxT &image_ctx, ceph::BitVector<2> *object_map,
+                 uint64_t snap_id, Context *on_finish);
+  // starts the state machine documented below
+  void send();
+
+private:
+  /**
+   * @verbatim
+   *                         (other errors)
+   * <start> -----> LOAD * * * * * * * > INVALIDATE ------------\
+   *                  |    *                                    |
+   *                  |    * (-EINVAL or too small)             |
+   *                  |    * * * * * * > INVALIDATE_AND_RESIZE  |
+   *                  |                      |              *   |
+   *                  |                      |              *   |
+   *                  |                      v              *   |
+   *                  |                    RESIZE           *   |
+   *                  |                      |              *   |
+   *                  |                      |  * * * * * * *   |
+   *                  |                      |  *               |
+   *                  |                      v  v               |
+   *                  \-----------------> <finish> <------------/
+   * @endverbatim
+   */
+
+  ImageCtxT &m_image_ctx;
+  ceph::BitVector<2> *m_object_map;  // destination, overwritten by apply()
+  uint64_t m_snap_id;  // snapshot (or CEPH_NOSNAP) whose map is refreshed
+  Context *m_on_finish;
+
+  uint64_t m_object_count;  // objects expected for the image at m_snap_id
+  ceph::BitVector<2> m_on_disk_object_map;  // working copy, loaded or rebuilt
+  bool m_truncate_on_disk_object_map;  // truncate the object before resizing
+  bufferlist m_out_bl;  // raw reply of the load operation
+
+  void send_load();
+  Context *handle_load(int *ret_val);
+
+  void send_invalidate();
+  Context *handle_invalidate(int *ret_val);
+
+  void send_resize_invalidate();
+  Context *handle_resize_invalidate(int *ret_val);
+
+  void send_resize();
+  Context *handle_resize(int *ret_val);
+
+  void apply();  // copies m_on_disk_object_map into *m_object_map
+};
+
+} // namespace object_map
+} // namespace librbd
+
+extern template class librbd::object_map::RefreshRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_OBJECT_MAP_REFRESH_REQUEST_H
diff --git a/src/librbd/object_map/Request.cc b/src/librbd/object_map/Request.cc
new file mode 100644
index 0000000..8a731e1
--- /dev/null
+++ b/src/librbd/object_map/Request.cc
@@ -0,0 +1,73 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/object_map/Request.h"
+#include "include/rados/librados.hpp"
+#include "include/rbd/librbd.hpp"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "common/RWLock.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageWatcher.h"
+#include "librbd/object_map/InvalidateRequest.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::object_map::Request: "
+
+namespace librbd {
+namespace object_map {
+
+bool Request::should_complete(int r) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << &m_image_ctx << " should_complete: r=" << r << dendl;
+  // returns false only while an invalidation started below is in flight
+  switch (m_state)
+  {
+  case STATE_REQUEST:
+    if (r < 0) {
+      lderr(cct) << "failed to update object map: " << cpp_strerror(r)
+		 << dendl;
+      return invalidate();
+    }
+    // success: let the subclass apply its change under object_map_lock
+    {
+      RWLock::WLocker l2(m_image_ctx.object_map_lock);
+      finish();
+    }
+    return true;
+
+  case STATE_INVALIDATE:
+    ldout(cct, 20) << "INVALIDATE" << dendl;
+    if (r < 0) {
+      lderr(cct) << "failed to invalidate object map: " << cpp_strerror(r)
+		 << dendl;
+    }
+    return true;  // complete even if the invalidation itself failed
+
+  default:
+    lderr(cct) << "invalid state: " << m_state << dendl;
+    assert(false);
+    break;
+  }
+  return false;
+}
+
+bool Request::invalidate() {
+  if (m_image_ctx.test_flags(RBD_FLAG_OBJECT_MAP_INVALID)) {
+    return true;  // already flagged invalid: nothing to send
+  }
+  // otherwise the completion is handled by the STATE_INVALIDATE case
+  m_state = STATE_INVALIDATE;
+  // NOTE(review): both locks are presumably required by req->send() -- confirm
+  RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
+  RWLock::WLocker snap_locker(m_image_ctx.snap_lock);
+  InvalidateRequest<> *req = new InvalidateRequest<>(m_image_ctx, m_snap_id,
+                                                     true,
+                                                     create_callback_context());
+  req->send();
+  return false;
+}
+
+} // namespace object_map
+} // namespace librbd
diff --git a/src/librbd/object_map/Request.h b/src/librbd/object_map/Request.h
new file mode 100644
index 0000000..fdc22de
--- /dev/null
+++ b/src/librbd/object_map/Request.h
@@ -0,0 +1,58 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_OBJECT_MAP_REQUEST_H
+#define CEPH_LIBRBD_OBJECT_MAP_REQUEST_H
+
+#include "include/int_types.h"
+#include "librbd/AsyncRequest.h"
+
+class Context;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace object_map {
+
+class Request : public AsyncRequest<> {
+public:
+  Request(ImageCtx &image_ctx, uint64_t snap_id, Context *on_finish)
+    : AsyncRequest(image_ctx, on_finish), m_snap_id(snap_id),
+      m_state(STATE_REQUEST)
+  {
+  }
+
+  virtual void send() = 0;
+
+protected:
+  const uint64_t m_snap_id;  // snapshot id passed on to InvalidateRequest
+
+  virtual bool should_complete(int r);
+  virtual int filter_return_code(int r) const {
+    // never propagate an error back to the caller
+    return 0;
+  }
+  virtual void finish() = 0;  // invoked under object_map_lock on success
+
+private:
+  /**
+   * <start> ---> STATE_REQUEST ---> <finish>
+   *                   |                ^
+   *                   v                |
+   *            STATE_INVALIDATE -------/
+   */
+  enum State {
+    STATE_REQUEST,
+    STATE_INVALIDATE
+  };
+
+  State m_state;
+
+  bool invalidate();  // returns true if the map was already flagged invalid
+};
+
+} // namespace object_map
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_OBJECT_MAP_REQUEST_H
diff --git a/src/librbd/object_map/ResizeRequest.cc b/src/librbd/object_map/ResizeRequest.cc
new file mode 100644
index 0000000..afbde42
--- /dev/null
+++ b/src/librbd/object_map/ResizeRequest.cc
@@ -0,0 +1,58 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/object_map/ResizeRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ObjectMap.h"
+#include "cls/lock/cls_lock_client.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::object_map::ResizeRequest: "
+
+namespace librbd {
+namespace object_map {
+
+void ResizeRequest::resize(ceph::BitVector<2> *object_map, uint64_t num_objs,
+                           uint8_t default_state) {  // fills only new entries
+  uint64_t orig_object_map_size = object_map->size();  // 64-bit, not size_t
+  object_map->resize(num_objs);
+  for (uint64_t i = orig_object_map_size; i < object_map->size(); ++i) {
+    (*object_map)[i] = default_state;
+  }
+}
+
+void ResizeRequest::send() {
+  CephContext *cct = m_image_ctx.cct;
+  // resizes the on-disk map; finish() later resizes the in-memory copy
+  RWLock::WLocker l(m_image_ctx.object_map_lock);
+  m_num_objs = Striper::get_num_objects(m_image_ctx.layout, m_new_size);
+
+  std::string oid(ObjectMap::object_map_name(m_image_ctx.id, m_snap_id));
+  ldout(cct, 5) << &m_image_ctx << " resizing on-disk object map: "
+                << "oid=" << oid << ", num_objs=" << m_num_objs << dendl;
+  // the exclusive-lock assertion is only added for the CEPH_NOSNAP map
+  librados::ObjectWriteOperation op;
+  if (m_snap_id == CEPH_NOSNAP) {
+    rados::cls::lock::assert_locked(&op, RBD_LOCK_NAME, LOCK_EXCLUSIVE, "", "");
+  }
+  cls_client::object_map_resize(&op, m_num_objs, m_default_object_state);
+
+  librados::AioCompletion *rados_completion = create_callback_completion();
+  int r = m_image_ctx.md_ctx.aio_operate(oid, rados_completion, &op);
+  assert(r == 0);
+  rados_completion->release();
+}
+
+void ResizeRequest::finish() {
+  CephContext *cct = m_image_ctx.cct;
+  // mirror the on-disk resize in memory, using the count computed by send()
+  ldout(cct, 5) << &m_image_ctx << " resizing in-memory object map: "
+		<< m_num_objs << dendl;
+  resize(m_object_map, m_num_objs, m_default_object_state);
+}
+
+} // namespace object_map
+} // namespace librbd
diff --git a/src/librbd/object_map/ResizeRequest.h b/src/librbd/object_map/ResizeRequest.h
new file mode 100644
index 0000000..ca95393
--- /dev/null
+++ b/src/librbd/object_map/ResizeRequest.h
@@ -0,0 +1,48 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_OBJECT_MAP_RESIZE_REQUEST_H
+#define CEPH_LIBRBD_OBJECT_MAP_RESIZE_REQUEST_H
+
+#include "include/int_types.h"
+#include "librbd/object_map/Request.h"
+#include "common/bit_vector.hpp"
+
+class Context;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace object_map {
+
+class ResizeRequest : public Request {
+public:
+  ResizeRequest(ImageCtx &image_ctx, ceph::BitVector<2> *object_map,
+                uint64_t snap_id, uint64_t new_size,
+      	  uint8_t default_object_state, Context *on_finish)
+    : Request(image_ctx, snap_id, on_finish), m_object_map(object_map),
+      m_num_objs(0), m_new_size(new_size),
+      m_default_object_state(default_object_state)
+  {
+  }
+  // resizes a map in place; entries added by the resize get default_state
+  static void resize(ceph::BitVector<2> *object_map, uint64_t num_objs,
+                     uint8_t default_state);
+
+  virtual void send();
+
+protected:
+  virtual void finish();
+
+private:
+  ceph::BitVector<2> *m_object_map;  // in-memory map resized by finish()
+  uint64_t m_num_objs;  // object count derived from m_new_size in send()
+  uint64_t m_new_size;  // NOTE(review): presumably image size in bytes
+  uint8_t m_default_object_state;  // state given to newly added entries
+};
+
+} // namespace object_map
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_OBJECT_MAP_RESIZE_REQUEST_H
diff --git a/src/librbd/object_map/SnapshotCreateRequest.cc b/src/librbd/object_map/SnapshotCreateRequest.cc
new file mode 100644
index 0000000..abca0e2
--- /dev/null
+++ b/src/librbd/object_map/SnapshotCreateRequest.cc
@@ -0,0 +1,148 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/object_map/SnapshotCreateRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ObjectMap.h"
+#include "cls/lock/cls_lock_client.h"
+#include <iostream>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::object_map::SnapshotCreateRequest: "
+
+namespace librbd {
+namespace object_map {
+
+namespace {
+
+std::ostream& operator<<(std::ostream& os,
+                         const SnapshotCreateRequest::State& state) {
+  // Streams the symbolic name of a state machine state.
+  // Values outside the enum are printed as "UNKNOWN (<raw value>)".
+  switch (state) {
+  case SnapshotCreateRequest::STATE_READ_MAP:
+    return os << "READ_MAP";
+  case SnapshotCreateRequest::STATE_WRITE_MAP:
+    return os << "WRITE_MAP";
+  case SnapshotCreateRequest::STATE_ADD_SNAPSHOT:
+    return os << "ADD_SNAPSHOT";
+  default:
+    break;
+  }
+  // only reached for unrecognized values
+  const uint32_t raw_state = static_cast<uint32_t>(state);
+  return os << "UNKNOWN (" << raw_state << ")";
+}
+
+} // anonymous namespace
+
+void SnapshotCreateRequest::send() {
+  send_read_map();  // first state: read the CEPH_NOSNAP object map from disk
+}
+
+bool SnapshotCreateRequest::should_complete(int r) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 5) << this << " " << __func__ << ": state=" << m_state << ", "
+                << "r=" << r << dendl;
+  if (r < 0 && m_ret_val == 0) {
+    m_ret_val = r;  // remember the first failure
+  }
+  if (m_ret_val < 0) {
+    // pass errors down to base class to invalidate the object map
+    return Request::should_complete(r);
+  }
+  // no error so far: advance READ_MAP -> WRITE_MAP -> ADD_SNAPSHOT
+  RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
+  bool finished = false;
+  switch (m_state) {
+  case STATE_READ_MAP:
+    send_write_map();
+    break;
+  case STATE_WRITE_MAP:
+    finished = send_add_snapshot();  // true when the step is skipped
+    break;
+  case STATE_ADD_SNAPSHOT:
+    update_object_map();
+    finished = true;
+    break;
+  default:
+    assert(false);
+    break;
+  }
+  return finished;
+}
+
+void SnapshotCreateRequest::send_read_map() {
+  assert(m_image_ctx.snap_lock.is_locked());
+  assert(m_image_ctx.get_snap_info(m_snap_id) != NULL);
+  // the source is the CEPH_NOSNAP object map; its bytes land in m_read_bl
+  CephContext *cct = m_image_ctx.cct;
+  std::string oid(ObjectMap::object_map_name(m_image_ctx.id, CEPH_NOSNAP));
+  ldout(cct, 5) << this << " " << __func__ << ": oid=" << oid << dendl;
+  m_state = STATE_READ_MAP;
+
+  // IO is blocked due to the snapshot creation -- consistent to read from disk
+  librados::ObjectReadOperation op;
+  op.read(0, 0, NULL, NULL);  // NOTE(review): len 0 presumably = whole object
+
+  librados::AioCompletion *rados_completion = create_callback_completion();
+  int r = m_image_ctx.md_ctx.aio_operate(oid, rados_completion, &op,
+                                         &m_read_bl);
+  assert(r == 0);
+  rados_completion->release();
+}
+
+void SnapshotCreateRequest::send_write_map() {
+  CephContext *cct = m_image_ctx.cct;
+  std::string snap_oid(ObjectMap::object_map_name(m_image_ctx.id, m_snap_id));
+  ldout(cct, 5) << this << " " << __func__ << ": snap_oid=" << snap_oid
+                << dendl;
+  m_state = STATE_WRITE_MAP;
+  // store the bytes read by send_read_map() as the snapshot's object map
+  librados::ObjectWriteOperation op;
+  op.write_full(m_read_bl);
+
+  librados::AioCompletion *rados_completion = create_callback_completion();
+  int r = m_image_ctx.md_ctx.aio_operate(snap_oid, rados_completion, &op);
+  assert(r == 0);
+  rados_completion->release();
+}
+
+bool SnapshotCreateRequest::send_add_snapshot() {
+  RWLock::RLocker snap_locker(m_image_ctx.snap_lock);
+  if ((m_image_ctx.features & RBD_FEATURE_FAST_DIFF) == 0) {
+    return true;  // state skipped: the caller treats the request as finished
+  }
+  // fast diff enabled: run object_map_snap_add on the CEPH_NOSNAP map
+  CephContext *cct = m_image_ctx.cct;
+  std::string oid(ObjectMap::object_map_name(m_image_ctx.id, CEPH_NOSNAP));
+  ldout(cct, 5) << this << " " << __func__ << ": oid=" << oid << dendl;
+  m_state = STATE_ADD_SNAPSHOT;
+
+  librados::ObjectWriteOperation op;
+  rados::cls::lock::assert_locked(&op, RBD_LOCK_NAME, LOCK_EXCLUSIVE, "", "");
+  cls_client::object_map_snap_add(&op);
+
+  librados::AioCompletion *rados_completion = create_callback_completion();
+  int r = m_image_ctx.md_ctx.aio_operate(oid, rados_completion, &op);
+  assert(r == 0);
+  rados_completion->release();
+  return false;  // async operation in flight
+}
+
+void SnapshotCreateRequest::update_object_map() {
+  RWLock::WLocker snap_locker(m_image_ctx.snap_lock);
+  RWLock::WLocker object_map_locker(m_image_ctx.object_map_lock);
+  for (uint64_t object_no = 0; object_no < m_object_map.size(); ++object_no) {
+    const uint8_t object_state = m_object_map[object_no];
+    if (object_state == OBJECT_EXISTS) {  // EXISTS becomes EXISTS_CLEAN
+      m_object_map[object_no] = OBJECT_EXISTS_CLEAN;
+    }
+  }
+}
+
+} // namespace object_map
+} // namespace librbd
diff --git a/src/librbd/object_map/SnapshotCreateRequest.h b/src/librbd/object_map/SnapshotCreateRequest.h
new file mode 100644
index 0000000..f814332
--- /dev/null
+++ b/src/librbd/object_map/SnapshotCreateRequest.h
@@ -0,0 +1,79 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_OBJECT_MAP_SNAPSHOT_CREATE_REQUEST_H
+#define CEPH_LIBRBD_OBJECT_MAP_SNAPSHOT_CREATE_REQUEST_H
+
+#include "include/int_types.h"
+#include "common/bit_vector.hpp"
+#include "librbd/object_map/Request.h"
+
+class Context;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace object_map {
+
+class SnapshotCreateRequest : public Request {
+public:
+  /**
+   * Snapshot create goes through the following state machine:
+   *
+   * @verbatim
+   *
+   * <start>
+   *    |
+   *    v
+   * STATE_READ_MAP
+   *    |
+   *    v            (skip)
+   * STATE_WRITE_MAP . . . . . . .
+   *    |                        .
+   *    v                        v
+   * STATE_ADD_SNAPSHOT ---> <finish>
+   *
+   * @endverbatim
+   *
+   * The _ADD_SNAPSHOT state is skipped if the FAST_DIFF feature isn't enabled.
+   */
+  enum State {
+    STATE_READ_MAP,
+    STATE_WRITE_MAP,
+    STATE_ADD_SNAPSHOT
+  };
+
+  SnapshotCreateRequest(ImageCtx &image_ctx, ceph::BitVector<2> *object_map,
+                        uint64_t snap_id, Context *on_finish)
+    : Request(image_ctx, snap_id, on_finish), m_state(STATE_READ_MAP),
+      m_object_map(*object_map), m_ret_val(0) {
+  }
+
+  virtual void send();
+
+protected:
+  virtual bool should_complete(int r);
+
+  virtual void finish() {  // intentionally empty
+  }
+
+private:
+  State m_state;  // current state; initialized so it never holds garbage
+  ceph::BitVector<2> &m_object_map;  // in-memory map updated on completion
+
+  bufferlist m_read_bl;  // CEPH_NOSNAP map bytes copied to the snapshot map
+  int m_ret_val;  // first error encountered, 0 if none
+
+  void send_read_map();
+  void send_write_map();
+  bool send_add_snapshot();  // returns true if the state was skipped
+
+  void update_object_map();
+
+};
+
+} // namespace object_map
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_OBJECT_MAP_SNAPSHOT_CREATE_REQUEST_H
diff --git a/src/librbd/object_map/SnapshotRemoveRequest.cc b/src/librbd/object_map/SnapshotRemoveRequest.cc
new file mode 100644
index 0000000..c718af1
--- /dev/null
+++ b/src/librbd/object_map/SnapshotRemoveRequest.cc
@@ -0,0 +1,204 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/object_map/SnapshotRemoveRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ObjectMap.h"
+#include "librbd/object_map/InvalidateRequest.h"
+#include "cls/lock/cls_lock_client.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::object_map::SnapshotRemoveRequest: "
+
+namespace librbd {
+namespace object_map {
+
+namespace {
+
+std::ostream& operator<<(std::ostream& os,
+                         const SnapshotRemoveRequest::State& state) {
+  // Streams the symbolic name of a state machine state.
+  // Values outside the enum are printed as "UNKNOWN (<raw value>)".
+  switch (state) {
+  case SnapshotRemoveRequest::STATE_LOAD_MAP:
+    return os << "LOAD_MAP";
+  case SnapshotRemoveRequest::STATE_REMOVE_SNAPSHOT:
+    return os << "REMOVE_SNAPSHOT";
+  case SnapshotRemoveRequest::STATE_INVALIDATE_NEXT_MAP:
+    return os << "INVALIDATE_NEXT_MAP";
+  case SnapshotRemoveRequest::STATE_REMOVE_MAP:
+    return os << "REMOVE_MAP";
+  default:
+    break;
+  }
+  // only reached for unrecognized values
+  const uint32_t raw_state = static_cast<uint32_t>(state);
+  os << "UNKNOWN (" << raw_state << ")";
+  return os;
+}
+
+} // anonymous namespace
+
+void SnapshotRemoveRequest::send() {
+  assert(m_image_ctx.owner_lock.is_locked());
+  assert(m_image_ctx.snap_lock.is_wlocked());
+  // with fast diff the next map (m_next_snap_id) is updated or invalidated
+  if ((m_image_ctx.features & RBD_FEATURE_FAST_DIFF) != 0) {
+    compute_next_snap_id();
+
+    uint64_t flags;
+    int r = m_image_ctx.get_flags(m_snap_id, &flags);
+    assert(r == 0);
+    // a snapshot map flagged invalid is not loaded: invalidate the next map
+    if ((flags & RBD_FLAG_OBJECT_MAP_INVALID) != 0) {
+      send_invalidate_next_map();
+    } else {
+      send_load_map();
+    }
+  } else {  // no fast diff: only the snapshot's map object is removed
+    send_remove_map();
+  }
+}
+
+bool SnapshotRemoveRequest::should_complete(int r) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 5) << this << " " << __func__ << ": state=" << m_state << ", "
+                << "r=" << r << dendl;
+  // each completion advances the state machine; only REMOVE_MAP finishes it
+  RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
+  bool finished = false;
+  switch (m_state) {
+  case STATE_LOAD_MAP:
+    if (r == 0) {
+      bufferlist::iterator it = m_out_bl.begin();
+      r = cls_client::object_map_load_finish(&it, &m_snap_object_map);
+    }
+    if (r < 0) {  // the read or the decode failed
+      RWLock::WLocker snap_locker(m_image_ctx.snap_lock);
+      send_invalidate_next_map();
+    } else {
+      send_remove_snapshot();
+    }
+    break;
+  case STATE_REMOVE_SNAPSHOT:
+    if (r < 0 && r != -ENOENT) {  // -ENOENT is tolerated
+      RWLock::WLocker snap_locker(m_image_ctx.snap_lock);
+      send_invalidate_next_map();
+    } else {
+      update_object_map();
+      send_remove_map();
+    }
+    break;
+  case STATE_INVALIDATE_NEXT_MAP:
+    send_remove_map();  // the invalidation result is not inspected here
+    break;
+  case STATE_REMOVE_MAP:
+    finished = true;
+    break;
+  default:
+    assert(false);
+    break;
+  }
+  return finished;
+}
+
+void SnapshotRemoveRequest::send_load_map() {
+  CephContext *cct = m_image_ctx.cct;
+  std::string snap_oid(ObjectMap::object_map_name(m_image_ctx.id, m_snap_id));
+  ldout(cct, 5) << this << " " << __func__ << ": snap_oid=" << snap_oid
+                << dendl;
+  m_state = STATE_LOAD_MAP;
+  // read the snapshot's own map; should_complete() decodes m_out_bl
+  librados::ObjectReadOperation op;
+  cls_client::object_map_load_start(&op);
+
+  librados::AioCompletion *rados_completion = create_callback_completion();
+  int r = m_image_ctx.md_ctx.aio_operate(snap_oid, rados_completion, &op,
+                                         &m_out_bl);
+  assert(r == 0);
+  rados_completion->release();
+}
+
+void SnapshotRemoveRequest::send_remove_snapshot() {
+  CephContext *cct = m_image_ctx.cct;
+  std::string oid(ObjectMap::object_map_name(m_image_ctx.id, m_next_snap_id));
+  ldout(cct, 5) << this << " " << __func__ << ": oid=" << oid << dendl;
+  m_state = STATE_REMOVE_SNAPSHOT;
+  // the op targets the next map and is handed the loaded snapshot map
+  librados::ObjectWriteOperation op;
+  if (m_next_snap_id == CEPH_NOSNAP) {
+    rados::cls::lock::assert_locked(&op, RBD_LOCK_NAME, LOCK_EXCLUSIVE, "", "");
+  }
+  cls_client::object_map_snap_remove(&op, m_snap_object_map);
+
+  librados::AioCompletion *rados_completion = create_callback_completion();
+  int r = m_image_ctx.md_ctx.aio_operate(oid, rados_completion, &op);
+  assert(r == 0);
+  rados_completion->release();
+}
+
+void SnapshotRemoveRequest::send_invalidate_next_map() {
+  assert(m_image_ctx.owner_lock.is_locked());
+  assert(m_image_ctx.snap_lock.is_wlocked());
+  // callers must hold owner_lock and the exclusive snap_lock (see asserts)
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 5) << this << " " << __func__ << dendl;
+  m_state = STATE_INVALIDATE_NEXT_MAP;
+  // NOTE(review): 'true' is presumably InvalidateRequest's force flag
+  InvalidateRequest<> *req = new InvalidateRequest<>(m_image_ctx,
+                                                     m_next_snap_id, true,
+                                                     create_callback_context());
+  req->send();
+}
+
+void SnapshotRemoveRequest::send_remove_map() {
+  CephContext *cct = m_image_ctx.cct;
+  std::string oid(ObjectMap::object_map_name(m_image_ctx.id, m_snap_id));
+  ldout(cct, 5) << this << " " << __func__ << ": oid=" << oid << dendl;
+  m_state = STATE_REMOVE_MAP;
+  // final state: delete the object map object of the removed snapshot
+  librados::ObjectWriteOperation op;
+  op.remove();
+
+  librados::AioCompletion *rados_completion = create_callback_completion();
+  int r = m_image_ctx.md_ctx.aio_operate(oid, rados_completion, &op);
+  assert(r == 0);
+  rados_completion->release();
+}
+
+void SnapshotRemoveRequest::compute_next_snap_id() {
+  assert(m_image_ctx.snap_lock.is_locked());
+
+  // m_next_snap_id becomes the key following m_snap_id in snap_info, or
+  // CEPH_NOSNAP when m_snap_id is the last entry of the map
+  typedef std::map<librados::snap_t, SnapInfo>::const_iterator SnapIter;
+  SnapIter snap_it = m_image_ctx.snap_info.find(m_snap_id);
+  assert(snap_it != m_image_ctx.snap_info.end());
+
+  ++snap_it;
+  const bool is_last = (snap_it == m_image_ctx.snap_info.end());
+  m_next_snap_id = (is_last ? CEPH_NOSNAP : snap_it->first);
+}
+
+void SnapshotRemoveRequest::update_object_map() {
+  RWLock::RLocker snap_locker(m_image_ctx.snap_lock);
+  RWLock::WLocker object_map_locker(m_image_ctx.object_map_lock);
+  if (m_next_snap_id == m_image_ctx.snap_id && m_next_snap_id == CEPH_NOSNAP) {
+    CephContext *cct = m_image_ctx.cct;  // only when both ids are CEPH_NOSNAP
+    ldout(cct, 5) << this << " " << __func__ << dendl;
+    // CLEAN -> EXISTS if the removed map says EXISTS or lacks the entry
+    for (uint64_t i = 0; i < m_object_map.size(); ++i) {
+      if (m_object_map[i] == OBJECT_EXISTS_CLEAN &&
+          (i >= m_snap_object_map.size() ||
+           m_snap_object_map[i] == OBJECT_EXISTS)) {
+        m_object_map[i] = OBJECT_EXISTS;
+      }
+    }
+  }
+}
+
+} // namespace object_map
+} // namespace librbd
diff --git a/src/librbd/object_map/SnapshotRemoveRequest.h b/src/librbd/object_map/SnapshotRemoveRequest.h
new file mode 100644
index 0000000..75fbdc8
--- /dev/null
+++ b/src/librbd/object_map/SnapshotRemoveRequest.h
@@ -0,0 +1,91 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_OBJECT_MAP_SNAPSHOT_REMOVE_REQUEST_H
+#define CEPH_LIBRBD_OBJECT_MAP_SNAPSHOT_REMOVE_REQUEST_H
+
+#include "include/int_types.h"
+#include "include/buffer.h"
+#include "common/bit_vector.hpp"
+#include "librbd/AsyncRequest.h"
+
+namespace librbd {
+namespace object_map {
+
+class SnapshotRemoveRequest : public AsyncRequest<> {
+public:
+  /**
+   * Snapshot removal goes through the following state machine:
+   *
+   * @verbatim
+   *
+   * <start> -----------> STATE_LOAD_MAP ----\
+   *    .                      *             |
+   *    .                      * (error)     |
+   *    . (invalid object map) v             |
+   *    . . . > STATE_INVALIDATE_NEXT_MAP    |
+   *    .                      |             |
+   *    .                      |             |
+   *    . (fast diff disabled) v             v
+   *    . . . . . . . . . . > STATE_REMOVE_MAP
+   *                                 |
+   *                                 v
+   *                             <finish>
+   *
+   * @endverbatim
+   *
+   * The _LOAD_MAP state is skipped if the fast diff feature is disabled.
+   * If the fast diff feature is enabled and the snapshot is flagged as
+   * invalid, the next snapshot / HEAD object map is flagged as invalid;
+   * otherwise, the state machine proceeds to remove the object map.
+   */
+  enum State {
+    STATE_LOAD_MAP,
+    STATE_REMOVE_SNAPSHOT,
+    STATE_INVALIDATE_NEXT_MAP,
+    STATE_REMOVE_MAP
+  };
+
+  SnapshotRemoveRequest(ImageCtx &image_ctx, ceph::BitVector<2> *object_map,
+                        uint64_t snap_id, Context *on_finish)
+    : AsyncRequest(image_ctx, on_finish), m_object_map(*object_map),
+      m_snap_id(snap_id), m_next_snap_id(CEPH_NOSNAP) {
+  }
+
+  virtual void send();
+
+protected:
+  virtual bool should_complete(int r);
+
+  virtual int filter_return_code(int r) const {
+    if (m_state == STATE_REMOVE_MAP && r == -ENOENT) {
+      return 0;
+    }
+    return r;
+  }
+
+  virtual void finish() {
+  }
+
+private:
+  State m_state;
+  ceph::BitVector<2> &m_object_map;
+  uint64_t m_snap_id;
+  uint64_t m_next_snap_id;
+
+  ceph::BitVector<2> m_snap_object_map;
+  bufferlist m_out_bl;
+
+  void send_load_map();
+  void send_remove_snapshot();
+  void send_invalidate_next_map();
+  void send_remove_map();
+
+  void compute_next_snap_id();
+  void update_object_map();
+};
+
+} // namespace object_map
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_OBJECT_MAP_SNAPSHOT_REMOVE_REQUEST_H
diff --git a/src/librbd/object_map/SnapshotRollbackRequest.cc b/src/librbd/object_map/SnapshotRollbackRequest.cc
new file mode 100644
index 0000000..9d4fc4a
--- /dev/null
+++ b/src/librbd/object_map/SnapshotRollbackRequest.cc
@@ -0,0 +1,131 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/object_map/SnapshotRollbackRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ObjectMap.h"
+#include "librbd/object_map/InvalidateRequest.h"
+#include "cls/lock/cls_lock_client.h"
+#include <iostream>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::object_map::SnapshotRollbackRequest: "
+
+namespace librbd {
+namespace object_map {
+
+namespace {
+
+std::ostream& operator<<(std::ostream& os,
+                         const SnapshotRollbackRequest::State& state) {
+  switch(state) {
+  case SnapshotRollbackRequest::STATE_READ_MAP:
+    os << "READ_MAP";
+    break;
+  case SnapshotRollbackRequest::STATE_INVALIDATE_MAP:
+    os << "INVALIDATE_MAP";
+    break;
+  case SnapshotRollbackRequest::STATE_WRITE_MAP:
+    os << "WRITE_MAP";
+    break;
+  default:
+    os << "UNKNOWN (" << static_cast<uint32_t>(state) << ")";
+    break;
+  }
+  return os;
+}
+
+} // anonymous namespace
+
+void SnapshotRollbackRequest::send() {
+  send_read_map();
+}
+
+bool SnapshotRollbackRequest::should_complete(int r) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 5) << this << " " << __func__ << ": state=" << m_state << ", "
+                << "r=" << r << dendl;
+  if (r < 0 && m_ret_val == 0) {
+    m_ret_val = r;
+  }
+
+  bool finished = false;
+  switch (m_state) {
+  case STATE_READ_MAP:
+    if (r < 0) {
+      // invalidate the snapshot object map
+      send_invalidate_map();
+    } else {
+      send_write_map();
+    }
+    break;
+  case STATE_INVALIDATE_MAP:
+    // invalidate the HEAD object map as well
+    finished = Request::should_complete(m_ret_val);
+    break;
+  case STATE_WRITE_MAP:
+    finished = Request::should_complete(r);
+    break;
+  default:
+    assert(false);
+    break;
+  }
+  return finished;
+}
+
+void SnapshotRollbackRequest::send_read_map() {
+  std::string snap_oid(ObjectMap::object_map_name(m_image_ctx.id, m_snap_id));
+
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 5) << this << " " << __func__ << ": snap_oid=" << snap_oid
+                << dendl;
+  m_state = STATE_READ_MAP;
+
+  librados::ObjectReadOperation op;
+  op.read(0, 0, NULL, NULL);
+
+  librados::AioCompletion *rados_completion = create_callback_completion();
+  int r = m_image_ctx.md_ctx.aio_operate(snap_oid, rados_completion, &op,
+                                         &m_read_bl);
+  assert(r == 0);
+  rados_completion->release();
+}
+
+void SnapshotRollbackRequest::send_write_map() {
+  RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
+  // overwrite the HEAD object map with the bytes send_read_map() read
+  CephContext *cct = m_image_ctx.cct;
+  std::string head_oid(ObjectMap::object_map_name(m_image_ctx.id, CEPH_NOSNAP));
+  ldout(cct, 5) << this << " " << __func__ << ": oid=" << head_oid
+                << dendl;
+  m_state = STATE_WRITE_MAP;
+
+  librados::ObjectWriteOperation op;
+  rados::cls::lock::assert_locked(&op, RBD_LOCK_NAME, LOCK_EXCLUSIVE, "", "");
+  op.write_full(m_read_bl);
+
+  librados::AioCompletion *rados_completion = create_callback_completion();
+  int r = m_image_ctx.md_ctx.aio_operate(head_oid, rados_completion, &op);
+  assert(r == 0);
+  rados_completion->release();
+}
+
+void SnapshotRollbackRequest::send_invalidate_map() {
+  RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
+  RWLock::WLocker snap_locker(m_image_ctx.snap_lock);
+
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 5) << this << " " << __func__ << dendl;
+  m_state = STATE_INVALIDATE_MAP;
+
+  InvalidateRequest<> *req = new InvalidateRequest<>(m_image_ctx, m_snap_id,
+                                                     false,
+                                                     create_callback_context());
+  req->send();
+}
+
+} // namespace object_map
+} // namespace librbd
diff --git a/src/librbd/object_map/SnapshotRollbackRequest.h b/src/librbd/object_map/SnapshotRollbackRequest.h
new file mode 100644
index 0000000..4717166
--- /dev/null
+++ b/src/librbd/object_map/SnapshotRollbackRequest.h
@@ -0,0 +1,77 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_OBJECT_MAP_SNAPSHOT_ROLLBACK_REQUEST_H
+#define CEPH_LIBRBD_OBJECT_MAP_SNAPSHOT_ROLLBACK_REQUEST_H
+
+#include "include/int_types.h"
+#include "librbd/object_map/Request.h"
+
+class Context;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace object_map {
+
+class SnapshotRollbackRequest : public Request {
+public:
+  /**
+   * Snapshot rollback goes through the following state machine:
+   *
+   * @verbatim
+   *
+   * <start>
+   *    |
+   *    v           (error)
+   * STATE_READ_MAP * * * * > STATE_INVALIDATE_MAP
+   *    |                         |
+   *    v                         v
+   * STATE_WRITE_MAP -------> <finish>
+   *
+   * @endverbatim
+   *
+   * If an error occurs within the READ_MAP state, the associated snapshot's
+   * object map will be flagged as invalid.  Otherwise, an error from any state
+   * will result in the HEAD object map being flagged as invalid via the base
+   * class.
+   */
+  enum State {
+    STATE_READ_MAP,
+    STATE_INVALIDATE_MAP,
+    STATE_WRITE_MAP
+  };
+
+  SnapshotRollbackRequest(ImageCtx &image_ctx, uint64_t snap_id,
+                          Context *on_finish)
+    : Request(image_ctx, CEPH_NOSNAP, on_finish),
+      m_snap_id(snap_id), m_ret_val(0) {
+    assert(snap_id != CEPH_NOSNAP);
+  }
+
+  virtual void send();
+
+protected:
+  virtual bool should_complete(int r);
+
+  virtual void finish() {
+  }
+
+private:
+  State m_state;
+  uint64_t m_snap_id;
+  int m_ret_val;
+
+  bufferlist m_read_bl;
+
+  void send_read_map();
+  void send_invalidate_map();
+  void send_write_map();
+
+};
+
+} // namespace object_map
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_OBJECT_MAP_SNAPSHOT_ROLLBACK_REQUEST_H
diff --git a/src/librbd/object_map/UnlockRequest.cc b/src/librbd/object_map/UnlockRequest.cc
new file mode 100644
index 0000000..c7ae980
--- /dev/null
+++ b/src/librbd/object_map/UnlockRequest.cc
@@ -0,0 +1,66 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/object_map/UnlockRequest.h"
+#include "cls/lock/cls_lock_client.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ObjectMap.h"
+#include "librbd/Utils.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::object_map::UnlockRequest: "
+
+namespace librbd {
+namespace object_map {
+
+using util::create_rados_safe_callback;
+
+template <typename I>
+UnlockRequest<I>::UnlockRequest(I &image_ctx, Context *on_finish)
+  : m_image_ctx(image_ctx), m_on_finish(on_finish) {
+}
+
+template <typename I>
+void UnlockRequest<I>::send() {
+  send_unlock();
+}
+
+template <typename I>
+void UnlockRequest<I>::send_unlock() {
+  CephContext *cct = m_image_ctx.cct;
+  std::string oid(ObjectMap::object_map_name(m_image_ctx.id, CEPH_NOSNAP));
+  ldout(cct, 10) << this << " " << __func__ << ": oid=" << oid << dendl;
+
+  librados::ObjectWriteOperation op;
+  rados::cls::lock::unlock(&op, RBD_LOCK_NAME, "");
+
+  using klass = UnlockRequest<I>;
+  librados::AioCompletion *rados_completion =
+    create_rados_safe_callback<klass, &klass::handle_unlock>(this);
+  int r = m_image_ctx.md_ctx.aio_operate(oid, rados_completion, &op);
+  assert(r == 0);
+  rados_completion->release();
+}
+
+template <typename I>
+Context *UnlockRequest<I>::handle_unlock(int *ret_val) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 10) << this << " " << __func__ << ": r=" << *ret_val << dendl;
+
+  if (*ret_val < 0 && *ret_val != -ENOENT) {
+    lderr(m_image_ctx.cct) << "failed to release object map lock: "
+                           << cpp_strerror(*ret_val) << dendl;
+    // -ENOENT is deliberately not logged; any other error is logged only
+  }
+
+  *ret_val = 0;  // the result is always reset to success, even after an error
+  return m_on_finish;
+}
+
+} // namespace object_map
+} // namespace librbd
+
+template class librbd::object_map::UnlockRequest<librbd::ImageCtx>;
diff --git a/src/librbd/object_map/UnlockRequest.h b/src/librbd/object_map/UnlockRequest.h
new file mode 100644
index 0000000..1453540
--- /dev/null
+++ b/src/librbd/object_map/UnlockRequest.h
@@ -0,0 +1,46 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_OBJECT_MAP_UNLOCK_REQUEST_H
+#define CEPH_LIBRBD_OBJECT_MAP_UNLOCK_REQUEST_H
+
+#include "include/int_types.h"
+#include "include/buffer.h"
+#include "include/Context.h"
+#include <map>
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace object_map {
+
+template <typename ImageCtxT = ImageCtx>
+class UnlockRequest {
+public:
+  UnlockRequest(ImageCtxT &image_ctx, Context *on_finish);
+
+  void send();
+
+private:
+  /**
+   * @verbatim
+   *
+   * <start> ----> UNLOCK ----> <finish>
+   *
+   * @endverbatim
+   */
+
+  ImageCtxT &m_image_ctx;
+  Context *m_on_finish;
+
+  void send_unlock();
+  Context* handle_unlock(int *ret_val);
+};
+
+} // namespace object_map
+} // namespace librbd
+
+extern template class librbd::object_map::UnlockRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_OBJECT_MAP_UNLOCK_REQUEST_H
diff --git a/src/librbd/object_map/UpdateRequest.cc b/src/librbd/object_map/UpdateRequest.cc
new file mode 100644
index 0000000..fc651ab
--- /dev/null
+++ b/src/librbd/object_map/UpdateRequest.cc
@@ -0,0 +1,70 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/object_map/UpdateRequest.h"
+#include "include/rbd/object_map_types.h"
+#include "include/stringify.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ObjectMap.h"
+#include "cls/lock/cls_lock_client.h"
+#include <string>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::object_map::UpdateRequest: "
+
+namespace librbd {
+namespace object_map {
+
+void UpdateRequest::send() {
+  assert(m_image_ctx.snap_lock.is_locked());
+  assert(m_image_ctx.object_map_lock.is_locked());
+  CephContext *cct = m_image_ctx.cct;
+
+  // safe to update in-memory state first without handling rollback since any
+  // failures will invalidate the object map
+  std::string oid(ObjectMap::object_map_name(m_image_ctx.id, m_snap_id));
+  ldout(cct, 20) << &m_image_ctx << " updating object map"
+                 << ": oid=" << oid << ", ["
+		 << m_start_object_no << "," << m_end_object_no << ") = "
+		 << (m_current_state ?
+		       stringify(static_cast<uint32_t>(*m_current_state)) : "")
+		 << "->" << static_cast<uint32_t>(m_new_state)
+		 << dendl;
+
+  // rebuilding the object map might update on-disk only
+  if (m_snap_id == m_image_ctx.snap_id) {
+    assert(m_image_ctx.object_map_lock.is_wlocked());
+    for (uint64_t object_no = m_start_object_no;
+         object_no < MIN(m_end_object_no, m_object_map.size());
+         ++object_no) {
+      uint8_t state = m_object_map[object_no];
+      if (!m_current_state || state == *m_current_state ||
+          (*m_current_state == OBJECT_EXISTS && state == OBJECT_EXISTS_CLEAN)) {
+        m_object_map[object_no] = m_new_state;
+      }
+    }
+  }
+
+  librados::ObjectWriteOperation op;
+  if (m_snap_id == CEPH_NOSNAP) {
+    rados::cls::lock::assert_locked(&op, RBD_LOCK_NAME, LOCK_EXCLUSIVE, "", "");
+  }
+  cls_client::object_map_update(&op, m_start_object_no, m_end_object_no,
+				m_new_state, m_current_state);
+
+  librados::AioCompletion *rados_completion = create_callback_completion();
+  int r = m_image_ctx.md_ctx.aio_operate(oid, rados_completion, &op);
+  assert(r == 0);
+  rados_completion->release();
+}
+
+void UpdateRequest::finish() {
+  ldout(m_image_ctx.cct, 20) << &m_image_ctx << " on-disk object map updated"
+                             << dendl;
+}
+
+} // namespace object_map
+} // namespace librbd
diff --git a/src/librbd/object_map/UpdateRequest.h b/src/librbd/object_map/UpdateRequest.h
new file mode 100644
index 0000000..6c277b0
--- /dev/null
+++ b/src/librbd/object_map/UpdateRequest.h
@@ -0,0 +1,49 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_OBJECT_MAP_UPDATE_REQUEST_H
+#define CEPH_LIBRBD_OBJECT_MAP_UPDATE_REQUEST_H
+
+#include "include/int_types.h"
+#include "librbd/object_map/Request.h"
+#include "common/bit_vector.hpp"
+#include <boost/optional.hpp>
+
+class Context;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace object_map {
+
+class UpdateRequest : public Request {
+public:
+  UpdateRequest(ImageCtx &image_ctx, ceph::BitVector<2> *object_map,
+                uint64_t snap_id, uint64_t start_object_no,
+                uint64_t end_object_no, uint8_t new_state,
+                const boost::optional<uint8_t> &current_state,
+      	  Context *on_finish)
+    : Request(image_ctx, snap_id, on_finish), m_object_map(*object_map),
+      m_start_object_no(start_object_no), m_end_object_no(end_object_no),
+      m_new_state(new_state), m_current_state(current_state)
+  {
+  }
+
+  virtual void send();
+
+protected:
+  virtual void finish();
+
+private:
+  ceph::BitVector<2> &m_object_map;
+  uint64_t m_start_object_no;
+  uint64_t m_end_object_no;
+  uint8_t m_new_state;
+  boost::optional<uint8_t> m_current_state;
+};
+
+} // namespace object_map
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_OBJECT_MAP_UPDATE_REQUEST_H
diff --git a/src/librbd/AsyncFlattenRequest.cc b/src/librbd/operation/FlattenRequest.cc
similarity index 50%
rename from src/librbd/AsyncFlattenRequest.cc
rename to src/librbd/operation/FlattenRequest.cc
index dbcf334..cc4842e 100644
--- a/src/librbd/AsyncFlattenRequest.cc
+++ b/src/librbd/operation/FlattenRequest.cc
@@ -1,9 +1,10 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab
 
-#include "librbd/AsyncFlattenRequest.h"
+#include "librbd/operation/FlattenRequest.h"
 #include "librbd/AioObjectRequest.h"
 #include "librbd/AsyncObjectThrottle.h"
+#include "librbd/ExclusiveLock.h"
 #include "librbd/ImageCtx.h"
 #include "librbd/ImageWatcher.h"
 #include "librbd/ObjectMap.h"
@@ -14,33 +15,35 @@
 
 #define dout_subsys ceph_subsys_rbd
 #undef dout_prefix
-#define dout_prefix *_dout << "librbd::AsyncFlattenRequest: "
+#define dout_prefix *_dout << "librbd::FlattenRequest: "
 
 namespace librbd {
+namespace operation {
 
-class AsyncFlattenObjectContext : public C_AsyncObjectThrottle<> {
+template <typename I>
+class C_FlattenObject : public C_AsyncObjectThrottle<I> {
 public:
-  AsyncFlattenObjectContext(AsyncObjectThrottle<> &throttle,
-                            ImageCtx *image_ctx, uint64_t object_size,
-                            ::SnapContext snapc, uint64_t object_no)
-    : C_AsyncObjectThrottle(throttle, *image_ctx), m_object_size(object_size),
+  C_FlattenObject(AsyncObjectThrottle<I> &throttle, I *image_ctx,
+                  uint64_t object_size, ::SnapContext snapc, uint64_t object_no)
+    : C_AsyncObjectThrottle<I>(throttle, *image_ctx), m_object_size(object_size),
       m_snapc(snapc), m_object_no(object_no)
   {
   }
 
   virtual int send() {
-    assert(m_image_ctx.owner_lock.is_locked());
-    CephContext *cct = m_image_ctx.cct;
+    I &image_ctx = this->m_image_ctx;
+    assert(image_ctx.owner_lock.is_locked());
+    CephContext *cct = image_ctx.cct;
 
-    if (m_image_ctx.image_watcher->is_lock_supported() &&
-        !m_image_ctx.image_watcher->is_lock_owner()) {
+    if (image_ctx.exclusive_lock != nullptr &&
+        !image_ctx.exclusive_lock->is_lock_owner()) {
       ldout(cct, 1) << "lost exclusive lock during flatten" << dendl;
       return -ERESTART;
     }
 
     bufferlist bl;
-    string oid = m_image_ctx.get_object_name(m_object_no);
-    AioObjectWrite *req = new AioObjectWrite(&m_image_ctx, oid, m_object_no, 0,
+    string oid = image_ctx.get_object_name(m_object_no);
+    AioObjectWrite *req = new AioObjectWrite(&image_ctx, oid, m_object_no, 0,
                                              bl, m_snapc, this);
     if (!req->has_parent()) {
       // stop early if the parent went away - it just means
@@ -59,14 +62,17 @@ private:
   uint64_t m_object_no;
 };
 
-bool AsyncFlattenRequest::should_complete(int r) {
-  CephContext *cct = m_image_ctx.cct;
+template <typename I>
+bool FlattenRequest<I>::should_complete(int r) {
+  I &image_ctx = this->m_image_ctx;
+  CephContext *cct = image_ctx.cct;
   ldout(cct, 5) << this << " should_complete: " << " r=" << r << dendl;
   if (r < 0 && !(r == -ENOENT && m_ignore_enoent) ) {
     lderr(cct) << "flatten encountered an error: " << cpp_strerror(r) << dendl;
     return true;
   }
 
+  RWLock::RLocker owner_locker(image_ctx.owner_lock);
   switch (m_state) {
   case STATE_FLATTEN_OBJECTS:
     ldout(cct, 5) << "FLATTEN_OBJECTS" << dendl;
@@ -88,75 +94,80 @@ bool AsyncFlattenRequest::should_complete(int r) {
   return false;
 }
 
-void AsyncFlattenRequest::send() {
-  assert(m_image_ctx.owner_lock.is_locked());
-  CephContext *cct = m_image_ctx.cct;
+template <typename I>
+void FlattenRequest<I>::send_op() {
+  I &image_ctx = this->m_image_ctx;
+  assert(image_ctx.owner_lock.is_locked());
+  CephContext *cct = image_ctx.cct;
   ldout(cct, 5) << this << " send" << dendl;
 
   m_state = STATE_FLATTEN_OBJECTS;
-  AsyncObjectThrottle<>::ContextFactory context_factory(
-    boost::lambda::bind(boost::lambda::new_ptr<AsyncFlattenObjectContext>(),
-      boost::lambda::_1, &m_image_ctx, m_object_size, m_snapc,
+  typename AsyncObjectThrottle<I>::ContextFactory context_factory(
+    boost::lambda::bind(boost::lambda::new_ptr<C_FlattenObject<I> >(),
+      boost::lambda::_1, &image_ctx, m_object_size, m_snapc,
       boost::lambda::_2));
-  AsyncObjectThrottle<> *throttle = new AsyncObjectThrottle<>(
-    this, m_image_ctx, context_factory, create_callback_context(), &m_prog_ctx,
+  AsyncObjectThrottle<I> *throttle = new AsyncObjectThrottle<I>(
+    this, image_ctx, context_factory, this->create_callback_context(), &m_prog_ctx,
     0, m_overlap_objects);
-  throttle->start_ops(m_image_ctx.concurrent_management_ops);
+  throttle->start_ops(image_ctx.concurrent_management_ops);
 }
 
-bool AsyncFlattenRequest::send_update_header() {
-  assert(m_image_ctx.owner_lock.is_locked());
-  CephContext *cct = m_image_ctx.cct;
+template <typename I>
+bool FlattenRequest<I>::send_update_header() {
+  I &image_ctx = this->m_image_ctx;
+  assert(image_ctx.owner_lock.is_locked());
+  CephContext *cct = image_ctx.cct;
 
   ldout(cct, 5) << this << " send_update_header" << dendl;
   m_state = STATE_UPDATE_HEADER;
 
   // should have been canceled prior to releasing lock
-  assert(!m_image_ctx.image_watcher->is_lock_supported() ||
-         m_image_ctx.image_watcher->is_lock_owner());
+  assert(image_ctx.exclusive_lock == nullptr ||
+         image_ctx.exclusive_lock->is_lock_owner());
 
   {
-    RWLock::RLocker parent_locker(m_image_ctx.parent_lock);
+    RWLock::RLocker parent_locker(image_ctx.parent_lock);
     // stop early if the parent went away - it just means
     // another flatten finished first, so this one is useless.
-    if (!m_image_ctx.parent) {
+    if (!image_ctx.parent) {
       ldout(cct, 5) << "image already flattened" << dendl;
       return true;
     }
-    m_parent_spec = m_image_ctx.parent_md.spec;
+    m_parent_spec = image_ctx.parent_md.spec;
   }
   m_ignore_enoent = true;
 
   // remove parent from this (base) image
   librados::ObjectWriteOperation op;
-  if (m_image_ctx.image_watcher->is_lock_supported()) {
-    m_image_ctx.image_watcher->assert_header_locked(&op);
+  if (image_ctx.exclusive_lock != nullptr) {
+    image_ctx.exclusive_lock->assert_header_locked(&op);
   }
   cls_client::remove_parent(&op);
 
-  librados::AioCompletion *rados_completion = create_callback_completion();
-  int r = m_image_ctx.md_ctx.aio_operate(m_image_ctx.header_oid,
+  librados::AioCompletion *rados_completion = this->create_callback_completion();
+  int r = image_ctx.md_ctx.aio_operate(image_ctx.header_oid,
         				 rados_completion, &op);
   assert(r == 0);
   rados_completion->release();
   return false;
 }
 
-bool AsyncFlattenRequest::send_update_children() {
-  CephContext *cct = m_image_ctx.cct;
-
-  RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
+template <typename I>
+bool FlattenRequest<I>::send_update_children() {
+  I &image_ctx = this->m_image_ctx;
+  assert(image_ctx.owner_lock.is_locked());
+  CephContext *cct = image_ctx.cct;
 
   // should have been canceled prior to releasing lock
-  assert(!m_image_ctx.image_watcher->is_lock_supported() ||
-         m_image_ctx.image_watcher->is_lock_owner());
+  assert(image_ctx.exclusive_lock == nullptr ||
+         image_ctx.exclusive_lock->is_lock_owner());
 
   // if there are no snaps, remove from the children object as well
   // (if snapshots remain, they have their own parent info, and the child
   // will be removed when the last snap goes away)
-  RWLock::RLocker snap_locker(m_image_ctx.snap_lock);
-  if ((m_image_ctx.features & RBD_FEATURE_DEEP_FLATTEN) == 0 &&
-      !m_image_ctx.snaps.empty()) {
+  RWLock::RLocker snap_locker(image_ctx.snap_lock);
+  if ((image_ctx.features & RBD_FEATURE_DEEP_FLATTEN) == 0 &&
+      !image_ctx.snaps.empty()) {
     return true;
   }
 
@@ -164,14 +175,17 @@ bool AsyncFlattenRequest::send_update_children() {
   m_state = STATE_UPDATE_CHILDREN;
 
   librados::ObjectWriteOperation op;
-  cls_client::remove_child(&op, m_parent_spec, m_image_ctx.id);
+  cls_client::remove_child(&op, m_parent_spec, image_ctx.id);
 
-  librados::AioCompletion *rados_completion = create_callback_completion();
-  int r = m_image_ctx.md_ctx.aio_operate(RBD_CHILDREN, rados_completion,
+  librados::AioCompletion *rados_completion = this->create_callback_completion();
+  int r = image_ctx.md_ctx.aio_operate(RBD_CHILDREN, rados_completion,
     				     &op);
   assert(r == 0);
   rados_completion->release();
   return false;
 }
 
+} // namespace operation
 } // namespace librbd
+
+template class librbd::operation::FlattenRequest<librbd::ImageCtx>;
diff --git a/src/librbd/AsyncFlattenRequest.h b/src/librbd/operation/FlattenRequest.h
similarity index 70%
rename from src/librbd/AsyncFlattenRequest.h
rename to src/librbd/operation/FlattenRequest.h
index 01f1667..693b051 100644
--- a/src/librbd/AsyncFlattenRequest.h
+++ b/src/librbd/operation/FlattenRequest.h
@@ -1,9 +1,9 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab
-#ifndef CEPH_LIBRBD_ASYNC_FLATTEN_REQUEST_H
-#define CEPH_LIBRBD_ASYNC_FLATTEN_REQUEST_H
+#ifndef CEPH_LIBRBD_OPERATION_FLATTEN_REQUEST_H
+#define CEPH_LIBRBD_OPERATION_FLATTEN_REQUEST_H
 
-#include "librbd/AsyncRequest.h"
+#include "librbd/operation/Request.h"
 #include "librbd/parent_types.h"
 #include "common/snap_types.h"
 
@@ -12,23 +12,29 @@ namespace librbd {
 class ImageCtx;
 class ProgressContext;
 
-class AsyncFlattenRequest : public AsyncRequest<>
+namespace operation {
+
+template <typename ImageCtxT = ImageCtx>
+class FlattenRequest : public Request<ImageCtxT>
 {
 public:
-  AsyncFlattenRequest(ImageCtx &image_ctx, Context *on_finish,
-		      uint64_t object_size, uint64_t overlap_objects,
-		      const ::SnapContext &snapc, ProgressContext &prog_ctx)
-    : AsyncRequest(image_ctx, on_finish), m_object_size(object_size),
+  FlattenRequest(ImageCtxT &image_ctx, Context *on_finish,
+		 uint64_t object_size, uint64_t overlap_objects,
+		 const ::SnapContext &snapc, ProgressContext &prog_ctx)
+    : Request<ImageCtxT>(image_ctx, on_finish), m_object_size(object_size),
       m_overlap_objects(overlap_objects), m_snapc(snapc), m_prog_ctx(prog_ctx),
       m_ignore_enoent(false)
   {
   }
 
-  virtual void send();
-
 protected:
+  virtual void send_op();
   virtual bool should_complete(int r);
 
+  virtual journal::Event create_event() const {
+    return journal::FlattenEvent(0);
+  }
+
 private:
   /**
    * Flatten goes through the following state machine to copyup objects
@@ -76,6 +82,9 @@ private:
   bool send_update_children();
 };
 
+} // namespace operation
 } // namespace librbd
 
-#endif // CEPH_LIBRBD_ASYNC_FLATTEN_REQUEST_H
+extern template class librbd::operation::FlattenRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_OPERATION_FLATTEN_REQUEST_H
diff --git a/src/librbd/RebuildObjectMapRequest.cc b/src/librbd/operation/RebuildObjectMapRequest.cc
similarity index 60%
rename from src/librbd/RebuildObjectMapRequest.cc
rename to src/librbd/operation/RebuildObjectMapRequest.cc
index 8a3b29f..ce7f911 100644
--- a/src/librbd/RebuildObjectMapRequest.cc
+++ b/src/librbd/operation/RebuildObjectMapRequest.cc
@@ -1,16 +1,18 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab
 
-#include "librbd/RebuildObjectMapRequest.h"
+#include "librbd/operation/RebuildObjectMapRequest.h"
 #include "common/dout.h"
 #include "common/errno.h"
 #include "librbd/AsyncObjectThrottle.h"
-#include "librbd/AsyncResizeRequest.h"
-#include "librbd/AsyncTrimRequest.h"
+#include "librbd/ExclusiveLock.h"
 #include "librbd/ImageCtx.h"
 #include "librbd/ImageWatcher.h"
 #include "librbd/internal.h"
 #include "librbd/ObjectMap.h"
+#include "librbd/operation/ResizeRequest.h"
+#include "librbd/operation/TrimRequest.h"
+#include "librbd/Utils.h"
 #include <boost/lambda/bind.hpp>
 #include <boost/lambda/construct.hpp>
 
@@ -19,25 +21,29 @@
 #define dout_prefix *_dout << "librbd::RebuildObjectMapRequest: "
 
 namespace librbd {
+namespace operation {
 
 namespace {
 
-class C_VerifyObject : public C_AsyncObjectThrottle<> {
+template <typename I>
+class C_VerifyObject : public C_AsyncObjectThrottle<I> {
 public:
-  C_VerifyObject(AsyncObjectThrottle<> &throttle, ImageCtx *image_ctx,
+  C_VerifyObject(AsyncObjectThrottle<I> &throttle, I *image_ctx,
                  uint64_t snap_id, uint64_t object_no)
-    : C_AsyncObjectThrottle(throttle, *image_ctx), m_snap_id(snap_id),
-      m_object_no(object_no), m_oid(m_image_ctx.get_object_name(m_object_no))
+    : C_AsyncObjectThrottle<I>(throttle, *image_ctx), m_snap_id(snap_id),
+      m_object_no(object_no),
+      m_oid(image_ctx->get_object_name(m_object_no))
   {
-    m_io_ctx.dup(m_image_ctx.md_ctx);
+    m_io_ctx.dup(image_ctx->md_ctx);
     m_io_ctx.snap_set_read(CEPH_SNAPDIR);
   }
 
   virtual void complete(int r) {
+    I &image_ctx = this->m_image_ctx;
     if (should_complete(r)) {
-      ldout(m_image_ctx.cct, 20) << m_oid << " C_VerifyObject completed "
+      ldout(image_ctx.cct, 20) << m_oid << " C_VerifyObject completed "
                                  << dendl;
-      finish(r);
+      this->finish(r);
       delete this;
     }
   }
@@ -57,7 +63,8 @@ private:
   int m_snap_list_ret;
 
   bool should_complete(int r) {
-    CephContext *cct = m_image_ctx.cct;
+    I &image_ctx = this->m_image_ctx;
+    CephContext *cct = image_ctx.cct;
     if (r == 0) {
       r = m_snap_list_ret;
     }
@@ -73,23 +80,23 @@ private:
   }
 
   void send_list_snaps() {
-    assert(m_image_ctx.owner_lock.is_locked());
-    ldout(m_image_ctx.cct, 5) << m_oid << " C_VerifyObject::send_list_snaps"
-                              << dendl;
-
-    librados::AioCompletion *comp = librados::Rados::aio_create_completion(
-      this, NULL, rados_ctx_cb);
+    I &image_ctx = this->m_image_ctx;
+    assert(image_ctx.owner_lock.is_locked());
+    ldout(image_ctx.cct, 5) << m_oid << " C_VerifyObject::send_list_snaps"
+                            << dendl;
 
     librados::ObjectReadOperation op;
     op.list_snaps(&m_snap_set, &m_snap_list_ret);
 
+    librados::AioCompletion *comp = util::create_rados_safe_callback(this);
     int r = m_io_ctx.aio_operate(m_oid, comp, &op, NULL);
     assert(r == 0);
     comp->release();
   }
 
   uint8_t get_object_state() {
-    RWLock::RLocker snap_locker(m_image_ctx.snap_lock);
+    I &image_ctx = this->m_image_ctx;
+    RWLock::RLocker snap_locker(image_ctx.snap_lock);
     for (std::vector<librados::clone_info_t>::const_iterator r =
            m_snap_set.clones.begin(); r != m_snap_set.clones.end(); ++r) {
       librados::snap_t from_snap_id;
@@ -108,7 +115,7 @@ private:
         break;
       }
 
-      if ((m_image_ctx.features & RBD_FEATURE_FAST_DIFF) != 0 &&
+      if ((image_ctx.features & RBD_FEATURE_FAST_DIFF) != 0 &&
           from_snap_id != m_snap_id) {
         return OBJECT_EXISTS_CLEAN;
       }
@@ -118,26 +125,31 @@ private:
   }
 
   uint64_t next_valid_snap_id(uint64_t snap_id) {
-    assert(m_image_ctx.snap_lock.is_locked());
+    I &image_ctx = this->m_image_ctx;  // returns first known snap id >= snap_id
+    assert(image_ctx.snap_lock.is_locked());  // caller must hold snap_lock
 
     std::map<librados::snap_t, SnapInfo>::iterator it =
-      m_image_ctx.snap_info.lower_bound(snap_id);
-    if (it == m_image_ctx.snap_info.end()) {
+      image_ctx.snap_info.lower_bound(snap_id);
+    if (it == image_ctx.snap_info.end()) {  // nothing at or after snap_id
       return CEPH_NOSNAP;
     }
     return it->first;
   }
 
   bool update_object_map(uint8_t new_state) {
-    RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
-    CephContext *cct = m_image_ctx.cct;
+    I &image_ctx = this->m_image_ctx;
+    RWLock::RLocker owner_locker(image_ctx.owner_lock);
+    CephContext *cct = image_ctx.cct;
 
     // should have been canceled prior to releasing lock
-    assert(!m_image_ctx.image_watcher->is_lock_supported() ||
-           m_image_ctx.image_watcher->is_lock_owner());
+    assert(image_ctx.exclusive_lock == nullptr ||
+           image_ctx.exclusive_lock->is_lock_owner());
+
+    RWLock::RLocker snap_locker(image_ctx.snap_lock);
+    assert(image_ctx.object_map != nullptr);
 
-    RWLock::WLocker l(m_image_ctx.object_map_lock);
-    uint8_t state = m_image_ctx.object_map[m_object_no];
+    RWLock::WLocker l(image_ctx.object_map_lock);
+    uint8_t state = (*image_ctx.object_map)[m_object_no];
     if (state == OBJECT_EXISTS && new_state == OBJECT_NONEXISTENT &&
         m_snap_id == CEPH_NOSNAP) {
       // might be writing object to OSD concurrently
@@ -148,7 +160,7 @@ private:
       ldout(cct, 15) << m_oid << " C_VerifyObject::update_object_map "
                      << static_cast<uint32_t>(state) << "->"
                      << static_cast<uint32_t>(new_state) << dendl;
-      m_image_ctx.object_map[m_object_no] = new_state;
+      (*image_ctx.object_map)[m_object_no] = new_state;
     }
     return true;
   }
@@ -156,26 +168,26 @@ private:
 
 } // anonymous namespace
 
-
-void RebuildObjectMapRequest::send() {
+template <typename I>
+void RebuildObjectMapRequest<I>::send() {
   send_resize_object_map();
 }
 
-bool RebuildObjectMapRequest::should_complete(int r) {
+template <typename I>
+bool RebuildObjectMapRequest<I>::should_complete(int r) {
   CephContext *cct = m_image_ctx.cct;
   ldout(cct, 5) << this << " should_complete: " << " r=" << r << dendl;
 
+  RWLock::RLocker owner_lock(m_image_ctx.owner_lock);
   switch (m_state) {
   case STATE_RESIZE_OBJECT_MAP:
     ldout(cct, 5) << "RESIZE_OBJECT_MAP" << dendl;
     if (r == -ESTALE && !m_attempted_trim) {
       // objects are still flagged as in-use -- delete them
       m_attempted_trim = true;
-      RWLock::RLocker owner_lock(m_image_ctx.owner_lock);
       send_trim_image();
       return false;
     } else if (r == 0) {
-      RWLock::RLocker owner_lock(m_image_ctx.owner_lock);
       send_verify_objects();
     }
     break;
@@ -183,7 +195,6 @@ bool RebuildObjectMapRequest::should_complete(int r) {
   case STATE_TRIM_IMAGE:
     ldout(cct, 5) << "TRIM_IMAGE" << dendl;
     if (r == 0) {
-      RWLock::RLocker owner_lock(m_image_ctx.owner_lock);
       send_resize_object_map();
     }
     break;
@@ -191,7 +202,6 @@ bool RebuildObjectMapRequest::should_complete(int r) {
   case STATE_VERIFY_OBJECTS:
     ldout(cct, 5) << "VERIFY_OBJECTS" << dendl;
     if (r == 0) {
-      assert(m_image_ctx.owner_lock.is_locked());
       send_save_object_map();
     }
     break;
@@ -199,7 +209,6 @@ bool RebuildObjectMapRequest::should_complete(int r) {
   case STATE_SAVE_OBJECT_MAP:
     ldout(cct, 5) << "SAVE_OBJECT_MAP" << dendl;
     if (r == 0) {
-      RWLock::RLocker owner_lock(m_image_ctx.owner_lock);
       send_update_header();
     }
     break;
@@ -223,19 +232,19 @@ bool RebuildObjectMapRequest::should_complete(int r) {
   return false;
 }
 
-void RebuildObjectMapRequest::send_resize_object_map() {
+template <typename I>
+void RebuildObjectMapRequest<I>::send_resize_object_map() {
   assert(m_image_ctx.owner_lock.is_locked());
   CephContext *cct = m_image_ctx.cct;
 
-  uint64_t num_objects;
-  uint64_t size;
-  {
-    RWLock::RLocker l(m_image_ctx.snap_lock);
-    size = get_image_size();
-    num_objects = Striper::get_num_objects(m_image_ctx.layout, size);
-  }
+  m_image_ctx.snap_lock.get_read();
+  assert(m_image_ctx.object_map != nullptr);
+
+  uint64_t size = get_image_size();
+  uint64_t num_objects = Striper::get_num_objects(m_image_ctx.layout, size);
 
-  if (m_image_ctx.object_map.size() == num_objects) {
+  if (m_image_ctx.object_map->size() == num_objects) {
+    m_image_ctx.snap_lock.put_read();
     send_verify_objects();
     return;
   }
@@ -244,20 +253,23 @@ void RebuildObjectMapRequest::send_resize_object_map() {
   m_state = STATE_RESIZE_OBJECT_MAP;
 
   // should have been canceled prior to releasing lock
-  assert(!m_image_ctx.image_watcher->is_lock_supported() ||
-         m_image_ctx.image_watcher->is_lock_owner());
-  m_image_ctx.object_map.aio_resize(size, OBJECT_NONEXISTENT,
-                                    create_callback_context());
+  assert(m_image_ctx.exclusive_lock == nullptr ||
+         m_image_ctx.exclusive_lock->is_lock_owner());
+
+  m_image_ctx.object_map->aio_resize(size, OBJECT_NONEXISTENT,
+                                     this->create_callback_context());
+  m_image_ctx.snap_lock.put_read();
 }
 
-void RebuildObjectMapRequest::send_trim_image() {
+template <typename I>
+void RebuildObjectMapRequest<I>::send_trim_image() {
   CephContext *cct = m_image_ctx.cct;
 
   RWLock::RLocker l(m_image_ctx.owner_lock);
 
   // should have been canceled prior to releasing lock
-  assert(!m_image_ctx.image_watcher->is_lock_supported() ||
-         m_image_ctx.image_watcher->is_lock_owner());
+  assert(m_image_ctx.exclusive_lock == nullptr ||
+         m_image_ctx.exclusive_lock->is_lock_owner());
   ldout(cct, 5) << this << " send_trim_image" << dendl;
   m_state = STATE_TRIM_IMAGE;
 
@@ -265,18 +277,20 @@ void RebuildObjectMapRequest::send_trim_image() {
   uint64_t orig_size;
   {
     RWLock::RLocker l(m_image_ctx.snap_lock);
+    assert(m_image_ctx.object_map != nullptr);
+
     new_size = get_image_size();
     orig_size = m_image_ctx.get_object_size() *
-                m_image_ctx.object_map.size();
+                m_image_ctx.object_map->size();
   }
-  AsyncTrimRequest *req = new AsyncTrimRequest(m_image_ctx,
-                                               create_callback_context(),
-                                               orig_size, new_size,
-                                               m_prog_ctx);
+  TrimRequest<I> *req = new TrimRequest<I>(m_image_ctx,
+                                           this->create_callback_context(),
+                                           orig_size, new_size, m_prog_ctx);
   req->send();
 }
 
-void RebuildObjectMapRequest::send_verify_objects() {
+template <typename I>
+void RebuildObjectMapRequest<I>::send_verify_objects() {
   assert(m_image_ctx.owner_lock.is_locked());
   CephContext *cct = m_image_ctx.cct;
 
@@ -297,16 +311,17 @@ void RebuildObjectMapRequest::send_verify_objects() {
   m_state = STATE_VERIFY_OBJECTS;
   ldout(cct, 5) << this << " send_verify_objects" << dendl;
 
-  AsyncObjectThrottle<>::ContextFactory context_factory(
-    boost::lambda::bind(boost::lambda::new_ptr<C_VerifyObject>(),
+  typename AsyncObjectThrottle<I>::ContextFactory context_factory(
+    boost::lambda::bind(boost::lambda::new_ptr<C_VerifyObject<I> >(),
       boost::lambda::_1, &m_image_ctx, snap_id, boost::lambda::_2));
-  AsyncObjectThrottle<> *throttle = new AsyncObjectThrottle<>(
-    this, m_image_ctx, context_factory, create_callback_context(), &m_prog_ctx,
-    0, num_objects);
+  AsyncObjectThrottle<I> *throttle = new AsyncObjectThrottle<I>(
+    this, m_image_ctx, context_factory, this->create_callback_context(),
+    &m_prog_ctx, 0, num_objects);
   throttle->start_ops(cct->_conf->rbd_concurrent_management_ops);
 }
 
-void RebuildObjectMapRequest::send_save_object_map() {
+template <typename I>
+void RebuildObjectMapRequest<I>::send_save_object_map() {
   assert(m_image_ctx.owner_lock.is_locked());
   CephContext *cct = m_image_ctx.cct;
 
@@ -314,30 +329,34 @@ void RebuildObjectMapRequest::send_save_object_map() {
   m_state = STATE_SAVE_OBJECT_MAP;
 
   // should have been canceled prior to releasing lock
-  assert(!m_image_ctx.image_watcher->is_lock_supported() ||
-         m_image_ctx.image_watcher->is_lock_owner());
-  m_image_ctx.object_map.aio_save(create_callback_context());
+  assert(m_image_ctx.exclusive_lock == nullptr ||
+         m_image_ctx.exclusive_lock->is_lock_owner());
+
+  RWLock::RLocker snap_locker(m_image_ctx.snap_lock);
+  assert(m_image_ctx.object_map != nullptr);
+  m_image_ctx.object_map->aio_save(this->create_callback_context());
 }
 
-void RebuildObjectMapRequest::send_update_header() {
+template <typename I>
+void RebuildObjectMapRequest<I>::send_update_header() {
   assert(m_image_ctx.owner_lock.is_locked());
 
   // should have been canceled prior to releasing lock
-  assert(!m_image_ctx.image_watcher->is_lock_supported() ||
-         m_image_ctx.image_watcher->is_lock_owner());
+  assert(m_image_ctx.exclusive_lock == nullptr ||
+         m_image_ctx.exclusive_lock->is_lock_owner());
 
   ldout(m_image_ctx.cct, 5) << this << " send_update_header" << dendl;
   m_state = STATE_UPDATE_HEADER;
 
   librados::ObjectWriteOperation op;
-  if (m_image_ctx.image_watcher->is_lock_supported()) {
-    m_image_ctx.image_watcher->assert_header_locked(&op);
+  if (m_image_ctx.exclusive_lock != nullptr) {
+    m_image_ctx.exclusive_lock->assert_header_locked(&op);
   }
 
   uint64_t flags = RBD_FLAG_OBJECT_MAP_INVALID | RBD_FLAG_FAST_DIFF_INVALID;
   cls_client::set_flags(&op, m_image_ctx.snap_id, 0, flags);
 
-  librados::AioCompletion *comp = create_callback_completion();
+  librados::AioCompletion *comp = this->create_callback_completion();
   int r = m_image_ctx.md_ctx.aio_operate(m_image_ctx.header_oid, comp, &op);
   assert(r == 0);
   comp->release();
@@ -346,11 +365,12 @@ void RebuildObjectMapRequest::send_update_header() {
   m_image_ctx.update_flags(m_image_ctx.snap_id, flags, false);
 }
 
-uint64_t RebuildObjectMapRequest::get_image_size() const {
+template <typename I>
+uint64_t RebuildObjectMapRequest<I>::get_image_size() const {
   assert(m_image_ctx.snap_lock.is_locked());
   if (m_image_ctx.snap_id == CEPH_NOSNAP) {
-    if (!m_image_ctx.async_resize_reqs.empty()) {
-      return m_image_ctx.async_resize_reqs.front()->get_image_size();
+    if (!m_image_ctx.resize_reqs.empty()) {
+      return m_image_ctx.resize_reqs.front()->get_image_size();
     } else {
       return m_image_ctx.size;
     }
@@ -358,4 +378,7 @@ uint64_t RebuildObjectMapRequest::get_image_size() const {
   return  m_image_ctx.get_image_size(m_image_ctx.snap_id);
 }
 
+} // namespace operation
 } // namespace librbd
+
+template class librbd::operation::RebuildObjectMapRequest<librbd::ImageCtx>;
diff --git a/src/librbd/RebuildObjectMapRequest.h b/src/librbd/operation/RebuildObjectMapRequest.h
similarity index 73%
rename from src/librbd/RebuildObjectMapRequest.h
rename to src/librbd/operation/RebuildObjectMapRequest.h
index 02a41ef..2836187 100644
--- a/src/librbd/RebuildObjectMapRequest.h
+++ b/src/librbd/operation/RebuildObjectMapRequest.h
@@ -1,7 +1,7 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab
-#ifndef CEPH_LIBRBD_REBUILD_OBJECT_MAP_REQUEST_H
-#define CEPH_LIBRBD_REBUILD_OBJECT_MAP_REQUEST_H
+#ifndef CEPH_LIBRBD_OPERATION_REBUILD_OBJECT_MAP_REQUEST_H
+#define CEPH_LIBRBD_OPERATION_REBUILD_OBJECT_MAP_REQUEST_H
 
 #include "include/int_types.h"
 #include "librbd/AsyncRequest.h"
@@ -11,12 +11,15 @@ namespace librbd {
 class ImageCtx;
 class ProgressContext;
 
-class RebuildObjectMapRequest : public AsyncRequest<> {
+namespace operation {
+
+template <typename ImageCtxT = ImageCtx>
+class RebuildObjectMapRequest : public AsyncRequest<ImageCtxT> {
 public:
 
-  RebuildObjectMapRequest(ImageCtx &image_ctx, Context *on_finish,
+  RebuildObjectMapRequest(ImageCtxT &image_ctx, Context *on_finish,
                           ProgressContext &prog_ctx)
-    : AsyncRequest(image_ctx, on_finish), m_image_ctx(image_ctx),
+    : AsyncRequest<ImageCtxT>(image_ctx, on_finish), m_image_ctx(image_ctx),
       m_prog_ctx(prog_ctx), m_attempted_trim(false)
   {
   }
@@ -58,7 +61,7 @@ private:
     STATE_UPDATE_HEADER
   };
 
-  ImageCtx &m_image_ctx;
+  ImageCtxT &m_image_ctx;
   ProgressContext &m_prog_ctx;
   State m_state;
   bool m_attempted_trim;
@@ -73,6 +76,9 @@ private:
 
 };
 
+} // namespace operation
 } // namespace librbd
 
-#endif // CEPH_LIBRBD_REBUILD_OBJECT_MAP_REQUEST_H
+extern template class librbd::operation::RebuildObjectMapRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_OPERATION_REBUILD_OBJECT_MAP_REQUEST_H
diff --git a/src/librbd/operation/RenameRequest.cc b/src/librbd/operation/RenameRequest.cc
new file mode 100644
index 0000000..aa5e09f
--- /dev/null
+++ b/src/librbd/operation/RenameRequest.cc
@@ -0,0 +1,193 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/operation/RenameRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "include/rados/librados.hpp"
+#include "librbd/ImageCtx.h"
+#include "librbd/internal.h"
+#include "librbd/Utils.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::operation::RenameRequest: "
+
+namespace librbd {
+namespace operation {
+
+namespace {
+
+// Pretty-printer for the rename state machine's current step (used by ldout).
+// Deliberately NOT a template: 'typename RenameRequest<I>::State' is a
+// non-deduced context, so a templated overload can never be selected by
+// 'os << m_state' and the enum would be logged as a bare integer instead.
+std::ostream& operator<<(std::ostream& os,
+                         const RenameRequest<ImageCtx>::State& state) {
+  typedef RenameRequest<ImageCtx> Req;
+  switch (state) {
+  case Req::STATE_READ_SOURCE_HEADER:
+    return os << "READ_SOURCE_HEADER";
+  case Req::STATE_WRITE_DEST_HEADER:
+    return os << "WRITE_DEST_HEADER";
+  case Req::STATE_UPDATE_DIRECTORY:
+    return os << "UPDATE_DIRECTORY";
+  case Req::STATE_REMOVE_SOURCE_HEADER:
+    return os << "REMOVE_SOURCE_HEADER";
+  default:
+    break;
+  }
+  // value outside the known states: print it numerically
+  return os << "UNKNOWN (" << static_cast<uint32_t>(state) << ")";
+}
+
+} // anonymous namespace
+
+template <typename I>  // ctor: precomputes source/dest object names from the image format
+RenameRequest<I>::RenameRequest(I &image_ctx, Context *on_finish,
+				const std::string &dest_name)
+  : Request<I>(image_ctx, on_finish), m_dest_name(dest_name),
+    m_source_oid(image_ctx.old_format ? util::old_header_name(image_ctx.name) :
+                                        util::id_obj_name(image_ctx.name)),
+    m_dest_oid(image_ctx.old_format ? util::old_header_name(dest_name) :
+                                      util::id_obj_name(dest_name)) {
+}  // note: m_state is not initialized here; each send_*() step sets it
+
+template <typename I>
+void RenameRequest<I>::send_op() {  // entry point: kicks off the first state
+  send_read_source_header();
+}
+
+template <typename I>
+bool RenameRequest<I>::should_complete(int r) {  // advances the state machine; true => done
+  I &image_ctx = this->m_image_ctx;
+  CephContext *cct = image_ctx.cct;
+  ldout(cct, 5) << this << " " << __func__ << ": state=" << m_state << ", "
+                << "r=" << r << dendl;
+  r = filter_state_return_code(r);  // may turn a tolerated failure into 0
+  if (r < 0) {
+    lderr(cct) << "encountered error: " << cpp_strerror(r) << dendl;
+    return true;  // stop at the first error that was not filtered out
+  }
+
+  RWLock::RLocker owner_lock(image_ctx.owner_lock);  // held across the send_*() calls below
+  bool finished = false;
+  switch (m_state) {
+  case STATE_READ_SOURCE_HEADER:
+    send_write_destination_header();
+    break;
+  case STATE_WRITE_DEST_HEADER:
+    send_update_directory();
+    break;
+  case STATE_UPDATE_DIRECTORY:
+    send_remove_source_header();
+    break;
+  case STATE_REMOVE_SOURCE_HEADER:
+    finished = true;  // last step of the state machine
+    break;
+  default:
+    assert(false);  // m_state holds a value outside the enum
+    break;
+  }
+  return finished;
+}
+
+template <typename I>
+int RenameRequest<I>::filter_state_return_code(int r) {  // returns r, or 0 for tolerated failures
+  I &image_ctx = this->m_image_ctx;
+  CephContext *cct = image_ctx.cct;
+
+  if (m_state == STATE_REMOVE_SOURCE_HEADER && r < 0) {  // cleanup step: never fatal
+    if (r != -ENOENT) {  // -ENOENT is ignored silently
+      lderr(cct) << "warning: couldn't remove old source object ("
+                 << m_source_oid << ")" << dendl;
+    }
+    return 0;
+  }
+  return r;  // every other state reports its result unchanged
+}
+
+template <typename I>
+void RenameRequest<I>::send_read_source_header() {  // step 1: async read of m_source_oid
+  I &image_ctx = this->m_image_ctx;
+  CephContext *cct = image_ctx.cct;
+  ldout(cct, 5) << this << " " << __func__ << dendl;
+  m_state = STATE_READ_SOURCE_HEADER;
+
+  librados::ObjectReadOperation op;
+  op.read(0, 0, NULL, NULL);  // NOTE(review): len 0 presumably means "whole object" -- confirm
+
+  // TODO: old code read omap values but there are no omap values on the
+  //       old format header nor the new format id object
+  librados::AioCompletion *rados_completion = this->create_callback_completion();
+  int r = image_ctx.md_ctx.aio_operate(m_source_oid, rados_completion, &op,
+                                       &m_header_bl);  // written to m_dest_oid by the next step
+  assert(r == 0);
+  rados_completion->release();
+}
+
+template <typename I>
+void RenameRequest<I>::send_write_destination_header() {  // step 2: create m_dest_oid
+  I &image_ctx = this->m_image_ctx;
+  CephContext *cct = image_ctx.cct;
+  ldout(cct, 5) << this << " " << __func__ << dendl;
+  m_state = STATE_WRITE_DEST_HEADER;
+
+  librados::ObjectWriteOperation op;
+  op.create(true);  // NOTE(review): 'true' presumably requests an exclusive create -- confirm
+  op.write_full(m_header_bl);  // contents read from m_source_oid in the previous step
+
+  librados::AioCompletion *rados_completion = this->create_callback_completion();
+  int r = image_ctx.md_ctx.aio_operate(m_dest_oid, rados_completion, &op);
+  assert(r == 0);
+  rados_completion->release();
+}
+
+template <typename I>
+void RenameRequest<I>::send_update_directory() {  // step 3: rename the RBD_DIRECTORY entry
+  I &image_ctx = this->m_image_ctx;
+  CephContext *cct = image_ctx.cct;
+  ldout(cct, 5) << this << " " << __func__ << dendl;
+  m_state = STATE_UPDATE_DIRECTORY;
+
+  librados::ObjectWriteOperation op;
+  if (image_ctx.old_format) {  // old format: one tmap update with a SET and an RM command
+    bufferlist cmd_bl;
+    bufferlist empty_bl;
+    ::encode(static_cast<__u8>(CEPH_OSD_TMAP_SET), cmd_bl);
+    ::encode(m_dest_name, cmd_bl);  // key: new image name
+    ::encode(empty_bl, cmd_bl);  // value: empty
+    ::encode(static_cast<__u8>(CEPH_OSD_TMAP_RM), cmd_bl);
+    ::encode(image_ctx.name, cmd_bl);  // key to remove: current image name
+    op.tmap_update(cmd_bl);
+  } else {  // new format: delegate to the cls_client directory helper
+    cls_client::dir_rename_image(&op, image_ctx.name, m_dest_name,
+                                 image_ctx.id);
+  }
+
+  librados::AioCompletion *rados_completion = this->create_callback_completion();
+  int r = image_ctx.md_ctx.aio_operate(RBD_DIRECTORY, rados_completion, &op);
+  assert(r == 0);
+  rados_completion->release();
+}
+
+template <typename I>
+void RenameRequest<I>::send_remove_source_header() {  // step 4: delete m_source_oid
+  I &image_ctx = this->m_image_ctx;
+  CephContext *cct = image_ctx.cct;
+  ldout(cct, 5) << this << " " << __func__ << dendl;
+  m_state = STATE_REMOVE_SOURCE_HEADER;
+
+  librados::ObjectWriteOperation op;
+  op.remove();  // a failure here is tolerated by filter_state_return_code()
+
+  librados::AioCompletion *rados_completion = this->create_callback_completion();
+  int r = image_ctx.md_ctx.aio_operate(m_source_oid, rados_completion, &op);
+  assert(r == 0);
+  rados_completion->release();
+}
+
+} // namespace operation
+} // namespace librbd
+
+template class librbd::operation::RenameRequest<librbd::ImageCtx>;
diff --git a/src/librbd/operation/RenameRequest.h b/src/librbd/operation/RenameRequest.h
new file mode 100644
index 0000000..474ce50
--- /dev/null
+++ b/src/librbd/operation/RenameRequest.h
@@ -0,0 +1,90 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_RENAME_REQUEST_H
+#define CEPH_LIBRBD_RENAME_REQUEST_H
+
+#include "librbd/operation/Request.h"
+#include <iosfwd>
+#include <string>
+
+class Context;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace operation {
+
+template <typename ImageCtxT = ImageCtx>
+class RenameRequest : public Request<ImageCtxT>
+{
+public:
+  /**
+   * Rename goes through the following state machine:
+   *
+   * @verbatim
+   *
+   * <start>
+   *    |
+   *    v
+   * STATE_READ_SOURCE_HEADER
+   *    |
+   *    v
+   * STATE_WRITE_DEST_HEADER
+   *    |
+   *    v
+   * STATE_UPDATE_DIRECTORY
+   *    |
+   *    v
+   * STATE_REMOVE_SOURCE_HEADER
+   *    |
+   *    v
+   * <finish>
+   *
+   * @endverbatim
+   *
+   */
+  enum State {
+    STATE_READ_SOURCE_HEADER,
+    STATE_WRITE_DEST_HEADER,
+    STATE_UPDATE_DIRECTORY,
+    STATE_REMOVE_SOURCE_HEADER
+  };
+
+  RenameRequest(ImageCtxT &image_ctx, Context *on_finish,
+                const std::string &dest_name);
+
+protected:
+  virtual void send_op();  // starts the state machine
+  virtual bool should_complete(int r);  // steps the state machine; true when finished
+
+  virtual journal::Event create_event() const {
+    return journal::RenameEvent(0, m_dest_name);  // NOTE(review): meaning of the leading 0 not visible here -- confirm
+  }
+
+private:
+  std::string m_dest_name;  // new image name
+
+  std::string m_source_oid;  // object for the current name (old-format header or id object)
+  std::string m_dest_oid;  // same kind of object for the new name
+
+  State m_state;  // current step; set by each send_*() method
+
+  bufferlist m_header_bl;  // source object contents, carried from the read to the write step
+
+  int filter_state_return_code(int r);  // maps tolerated per-state failures to 0
+
+  void send_read_source_header();
+  void send_write_destination_header();
+  void send_update_directory();
+  void send_remove_source_header();
+
+};
+
+} // namespace operation
+} // namespace librbd
+
+extern template class librbd::operation::RenameRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_RENAME_REQUEST_H
diff --git a/src/librbd/operation/Request.cc b/src/librbd/operation/Request.cc
new file mode 100644
index 0000000..6a96d85
--- /dev/null
+++ b/src/librbd/operation/Request.cc
@@ -0,0 +1,67 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/operation/Request.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Journal.h"
+
+namespace librbd {
+namespace operation {
+
+template <typename I>
+Request<I>::Request(I &image_ctx, Context *on_finish)
+  : AsyncRequest<I>(image_ctx, on_finish), m_tid(0) {  // m_tid 0: no journal event appended yet
+}
+
+template <typename I>
+void Request<I>::send() {  // appends a journal op event when applicable, then runs send_op()
+  I &image_ctx = this->m_image_ctx;
+  assert(image_ctx.owner_lock.is_locked());  // caller must hold owner_lock
+
+  {
+    RWLock::RLocker snap_locker(image_ctx.snap_lock);
+    if (image_ctx.journal != NULL &&
+        !image_ctx.journal->is_journal_replaying()) {  // nothing is appended during replay
+      // journal might be replaying -- wait for it to complete
+      if (!image_ctx.journal->is_journal_ready()) {
+        image_ctx.journal->wait_for_journal_ready(
+          new C_WaitForJournalReady(this));
+        return;  // send() is invoked again from handle_journal_ready()
+      }
+
+      journal::EventEntry event_entry(create_event());
+      m_tid = image_ctx.journal->append_op_event(event_entry);  // committed in finish()
+    }
+  }
+
+  send_op();  // snap_lock has been released by now
+}
+
+template <typename I>
+void Request<I>::finish(int r) {  // commits the journaled op event (if any), then completes
+  {
+    I &image_ctx = this->m_image_ctx;
+    RWLock::RLocker snap_locker(image_ctx.snap_lock);
+    if (m_tid != 0 && image_ctx.journal != NULL &&  // m_tid == 0: send() never appended an event
+        !image_ctx.journal->is_journal_replaying()) {
+      // ops will be canceled / completed before closing journal
+      assert(image_ctx.journal->is_journal_ready());
+
+      image_ctx.journal->commit_op_event(m_tid, r);
+    }
+  }
+
+  AsyncRequest<I>::finish(r);  // base-class completion runs without snap_lock held
+}
+
+template <typename I>
+void Request<I>::handle_journal_ready() {  // callback target of C_WaitForJournalReady
+  I &image_ctx = this->m_image_ctx;
+  RWLock::RLocker owner_locker(image_ctx.owner_lock);  // send() asserts owner_lock is held
+  send();  // virtual call: re-enters the most-derived send()
+}
+
+} // namespace operation
+} // namespace librbd
+
+template class librbd::operation::Request<librbd::ImageCtx>;
diff --git a/src/librbd/operation/Request.h b/src/librbd/operation/Request.h
new file mode 100644
index 0000000..c0dd3cb
--- /dev/null
+++ b/src/librbd/operation/Request.h
@@ -0,0 +1,52 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_OPERATION_REQUEST_H
+#define CEPH_LIBRBD_OPERATION_REQUEST_H
+
+#include "librbd/AsyncRequest.h"
+#include "include/Context.h"
+#include "librbd/JournalTypes.h"
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace operation {
+
+template <typename ImageCtxT = ImageCtx>
+class Request : public AsyncRequest<ImageCtxT> {  // AsyncRequest that journals its op before running
+public:
+  Request(ImageCtxT &image_ctx, Context *on_finish);
+
+  virtual void send();  // journals the op when applicable, then calls send_op()
+
+protected:
+  virtual void finish(int r);  // commits the journal op event, then completes the request
+  virtual void send_op() = 0;  // subclass hook: start the actual operation
+
+  virtual journal::Event create_event() const = 0;  // subclass hook: journal event describing the op
+
+private:
+  struct C_WaitForJournalReady : public Context {  // forwards the journal-ready callback to the request
+    Request *request;
+
+    C_WaitForJournalReady(Request *_request) : request(_request) {
+    }
+
+    virtual void finish(int r) {  // r is ignored
+      request->handle_journal_ready();
+    }
+  };
+
+  uint64_t m_tid;  // value returned by append_op_event(); 0 when nothing was journaled
+
+  void handle_journal_ready();
+};
+
+} // namespace operation
+} // namespace librbd
+
+extern template class librbd::operation::Request<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_OPERATION_REQUEST_H
diff --git a/src/librbd/operation/ResizeRequest.cc b/src/librbd/operation/ResizeRequest.cc
new file mode 100644
index 0000000..dc92dbe
--- /dev/null
+++ b/src/librbd/operation/ResizeRequest.cc
@@ -0,0 +1,310 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/operation/ResizeRequest.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageWatcher.h"
+#include "librbd/internal.h"
+#include "librbd/ObjectMap.h"
+#include "librbd/operation/TrimRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::ResizeRequest: "
+
+namespace librbd {
+namespace operation {
+
+template <typename I>
+ResizeRequest<I>::ResizeRequest(I &image_ctx, Context *on_finish,
+                                uint64_t new_size, ProgressContext &prog_ctx)
+  : Request<I>(image_ctx, on_finish),
+    m_original_size(0), m_new_size(new_size), m_prog_ctx(prog_ctx),  // m_original_size is set in send()
+    m_new_parent_overlap(0), m_xlist_item(this)  // linked into image_ctx.resize_reqs by send()
+{
+}
+
+template <typename I>
+ResizeRequest<I>::~ResizeRequest() {  // unlinks from resize_reqs, then starts the next queued resize
+  I &image_ctx = this->m_image_ctx;
+  ResizeRequest *next_req = NULL;
+  {
+    RWLock::WLocker snap_locker(image_ctx.snap_lock);
+    bool removed = m_xlist_item.remove_myself();  // must run even when assert() compiles out
+    assert(removed); (void)removed;
+    if (!image_ctx.resize_reqs.empty()) {
+      next_req = image_ctx.resize_reqs.front();
+    }
+  }
+  if (next_req != NULL) {  // snap_lock is released before taking owner_lock
+    RWLock::RLocker owner_locker(image_ctx.owner_lock);
+    next_req->send();
+  }
+}
+
+template <typename I>
+bool ResizeRequest<I>::should_complete(int r) {  // advances the state machine; true => done
+  I &image_ctx = this->m_image_ctx;
+  CephContext *cct = image_ctx.cct;
+  ldout(cct, 5) << this << " should_complete: " << " r=" << r << dendl;
+
+  if (r < 0) {
+    lderr(cct) << "resize encountered an error: " << cpp_strerror(r) << dendl;
+    return true;  // any failing step ends the request
+  }
+  if (m_state == STATE_FINISHED) {  // set by send_op() when the size is unchanged
+    ldout(cct, 5) << "FINISHED" << dendl;
+    return true;
+  }
+
+  RWLock::RLocker owner_lock(image_ctx.owner_lock);  // the send_*() steps assert owner_lock is held
+  switch (m_state) {
+  case STATE_FLUSH:
+    ldout(cct, 5) << "FLUSH" << dendl;
+    send_invalidate_cache();
+    break;
+
+  case STATE_INVALIDATE_CACHE:
+    ldout(cct, 5) << "INVALIDATE_CACHE" << dendl;
+    send_trim_image();
+    break;
+
+  case STATE_TRIM_IMAGE:
+    ldout(cct, 5) << "TRIM_IMAGE" << dendl;
+    send_update_header();
+    break;
+
+  case STATE_GROW_OBJECT_MAP:
+    ldout(cct, 5) << "GROW_OBJECT_MAP" << dendl;
+    send_update_header();
+    break;
+
+  case STATE_UPDATE_HEADER:
+    ldout(cct, 5) << "UPDATE_HEADER" << dendl;
+    if (send_shrink_object_map()) {  // true => no object map resize was issued
+      update_size_and_overlap();
+      return true;
+    }
+    break;
+
+  case STATE_SHRINK_OBJECT_MAP:
+    ldout(cct, 5) << "SHRINK_OBJECT_MAP" << dendl;
+    update_size_and_overlap();  // publish the new size in image_ctx
+    return true;
+
+  default:
+    lderr(cct) << "invalid state: " << m_state << dendl;
+    assert(false);
+    break;
+  }
+  return false;  // another async step is in flight
+}
+
+template <typename I>
+void ResizeRequest<I>::send() {  // queues this resize; only the head of resize_reqs proceeds
+  I &image_ctx = this->m_image_ctx;
+  assert(image_ctx.owner_lock.is_locked());
+
+  {
+    RWLock::WLocker snap_locker(image_ctx.snap_lock);
+    if (!m_xlist_item.is_on_list()) {  // first call: enqueue
+      image_ctx.resize_reqs.push_back(&m_xlist_item);
+      if (image_ctx.resize_reqs.front() != this) {
+        return;  // queued behind another resize; its destructor calls send() on the new front
+      }
+    }
+
+    assert(image_ctx.resize_reqs.front() == this);
+    m_original_size = image_ctx.size;  // sampled only once this request is at the front
+    compute_parent_overlap();
+  }
+
+  Request<I>::send();  // base-class send(); snap_lock already released
+}
+
+template <typename I>
+void ResizeRequest<I>::send_op() {  // picks the no-op, grow or shrink path
+  I &image_ctx = this->m_image_ctx;
+  assert(image_ctx.owner_lock.is_locked());
+
+  CephContext *cct = image_ctx.cct;
+  if (this->is_canceled()) {
+    this->async_complete(-ERESTART);  // m_state is not set on this path
+  } else if (m_original_size == m_new_size) {
+    ldout(cct, 2) << this << " no change in size (" << m_original_size
+		  << " -> " << m_new_size << ")" << dendl;
+    m_state = STATE_FINISHED;  // should_complete() returns true for this state
+    this->async_complete(0);
+  } else if (m_new_size > m_original_size) {
+    ldout(cct, 2) << this << " expanding image (" << m_original_size
+		  << " -> " << m_new_size << ")" << dendl;
+    send_grow_object_map();  // grow: object map first, then header
+  } else {
+    ldout(cct, 2) << this << " shrinking image (" << m_original_size
+		  << " -> " << m_new_size << ")" << dendl;
+    send_flush();  // shrink: flush, invalidate cache, trim, then header
+  }
+}
+
+template <typename I>
+void ResizeRequest<I>::send_flush() {  // first step of the shrink path
+  I &image_ctx = this->m_image_ctx;
+  ldout(image_ctx.cct, 5) << this << " send_flush: "
+                          << " original_size=" << m_original_size
+                          << " new_size=" << m_new_size << dendl;
+  m_state = STATE_FLUSH;
+
+  // with clipping adjusted, ensure that write / copy-on-read operations won't
+  // (re-)create objects that we just removed. need async callback to ensure
+  // we don't have cache_lock already held
+  image_ctx.flush_async_operations(this->create_async_callback_context());
+}
+
+template <typename I>
+void ResizeRequest<I>::send_invalidate_cache() {  // shrink path: runs after STATE_FLUSH
+  I &image_ctx = this->m_image_ctx;
+  assert(image_ctx.owner_lock.is_locked());
+  ldout(image_ctx.cct, 5) << this << " send_invalidate_cache: "
+                          << " original_size=" << m_original_size
+                          << " new_size=" << m_new_size << dendl;
+  m_state = STATE_INVALIDATE_CACHE;
+
+  // need to invalidate since we're deleting objects, and
+  // ObjectCacher doesn't track non-existent objects
+  image_ctx.invalidate_cache(this->create_callback_context());
+}
+
+template <typename I>
+void ResizeRequest<I>::send_trim_image() {  // shrink path: runs after STATE_INVALIDATE_CACHE
+  I &image_ctx = this->m_image_ctx;
+  assert(image_ctx.owner_lock.is_locked());
+  ldout(image_ctx.cct, 5) << this << " send_trim_image: "
+                          << " original_size=" << m_original_size
+                          << " new_size=" << m_new_size << dendl;
+  m_state = STATE_TRIM_IMAGE;
+
+  TrimRequest<I> *req = new TrimRequest<I>(image_ctx,
+                                           this->create_callback_context(),
+				           m_original_size, m_new_size,
+                                           m_prog_ctx);
+  req->send();  // NOTE(review): req is not deleted here; presumably self-deleting on completion -- confirm
+}
+
+template <typename I>
+void ResizeRequest<I>::send_grow_object_map() {  // grow path: resize the object map before the header
+  I &image_ctx = this->m_image_ctx;
+  assert(image_ctx.owner_lock.is_locked());
+
+  image_ctx.snap_lock.get_read();  // manual lock: released on both exits below
+  if (image_ctx.object_map == nullptr) {  // no object map: go straight to the header update
+    image_ctx.snap_lock.put_read();
+    send_update_header();
+    return;
+  }
+
+  ldout(image_ctx.cct, 5) << this << " send_grow_object_map: "
+                          << " original_size=" << m_original_size
+                          << " new_size=" << m_new_size << dendl;
+  m_state = STATE_GROW_OBJECT_MAP;
+
+  // should have been canceled prior to releasing lock
+  assert(image_ctx.exclusive_lock == nullptr ||
+         image_ctx.exclusive_lock->is_lock_owner());
+
+  image_ctx.object_map->aio_resize(m_new_size, OBJECT_NONEXISTENT,
+				   this->create_callback_context());
+  image_ctx.snap_lock.put_read();
+}
+
+template <typename I>
+bool ResizeRequest<I>::send_shrink_object_map() {  // returns true when no resize was issued
+  I &image_ctx = this->m_image_ctx;
+  assert(image_ctx.owner_lock.is_locked());
+
+  image_ctx.snap_lock.get_read();  // manual lock: released on both exits below
+  if (image_ctx.object_map == nullptr || m_new_size > m_original_size) {  // no map, or a grow
+    image_ctx.snap_lock.put_read();
+    return true;
+  }
+
+  ldout(image_ctx.cct, 5) << this << " send_shrink_object_map: "
+		            << " original_size=" << m_original_size
+			    << " new_size=" << m_new_size << dendl;
+  m_state = STATE_SHRINK_OBJECT_MAP;
+
+  // should have been canceled prior to releasing lock
+  assert(image_ctx.exclusive_lock == nullptr ||
+         image_ctx.exclusive_lock->is_lock_owner());
+
+  image_ctx.object_map->aio_resize(m_new_size, OBJECT_NONEXISTENT,
+				   this->create_callback_context());
+  image_ctx.snap_lock.put_read();
+  return false;  // async resize in flight; should_complete() handles STATE_SHRINK_OBJECT_MAP
+}
+
+template <typename I>
+void ResizeRequest<I>::send_update_header() {  // writes m_new_size into the image header object
+  I &image_ctx = this->m_image_ctx;
+  assert(image_ctx.owner_lock.is_locked());
+
+  ldout(image_ctx.cct, 5) << this << " send_update_header: "
+                            << " original_size=" << m_original_size
+                            << " new_size=" << m_new_size << dendl;
+  m_state = STATE_UPDATE_HEADER;
+
+  // should have been canceled prior to releasing lock
+  assert(image_ctx.exclusive_lock == nullptr ||
+         image_ctx.exclusive_lock->is_lock_owner());
+
+  librados::ObjectWriteOperation op;
+  if (image_ctx.old_format) {
+    // rewrite only the size field of the header
+    // NOTE: format 1 image headers are not stored in fixed endian format
+    bufferlist bl;
+    bl.append(reinterpret_cast<const char*>(&m_new_size), sizeof(m_new_size));  // raw host-order bytes
+    op.write(offsetof(rbd_obj_header_ondisk, image_size), bl);
+  } else {
+    if (image_ctx.exclusive_lock != nullptr) {
+      image_ctx.exclusive_lock->assert_header_locked(&op);  // adds a lock check to the same op
+    }
+    cls_client::set_size(&op, m_new_size);
+  }
+
+  librados::AioCompletion *rados_completion =
+    this->create_callback_completion();
+  int r = image_ctx.md_ctx.aio_operate(image_ctx.header_oid,
+    				       rados_completion, &op);
+  assert(r == 0);
+  rados_completion->release();
+}
+
+template <typename I>
+void ResizeRequest<I>::compute_parent_overlap() {  // called from send() with snap_lock held
+  I &image_ctx = this->m_image_ctx;
+  RWLock::RLocker l2(image_ctx.parent_lock);
+  if (image_ctx.parent == NULL) {
+    m_new_parent_overlap = 0;
+  } else {
+    m_new_parent_overlap = MIN(m_new_size, image_ctx.parent_md.overlap);  // never exceeds the new size
+  }
+}
+
+template <typename I>
+void ResizeRequest<I>::update_size_and_overlap() {  // publishes the new size in image_ctx
+  I &image_ctx = this->m_image_ctx;
+  RWLock::WLocker snap_locker(image_ctx.snap_lock);  // lock order: snap_lock, then parent_lock
+  image_ctx.size = m_new_size;
+
+  RWLock::WLocker parent_locker(image_ctx.parent_lock);
+  if (image_ctx.parent != NULL && m_new_size < m_original_size) {  // overlap changes only on shrink
+    image_ctx.parent_md.overlap = m_new_parent_overlap;
+  }
+}
+
+} // namespace operation
+} // namespace librbd
+
+template class librbd::operation::ResizeRequest<librbd::ImageCtx>;
diff --git a/src/librbd/AsyncResizeRequest.h b/src/librbd/operation/ResizeRequest.h
similarity index 79%
rename from src/librbd/AsyncResizeRequest.h
rename to src/librbd/operation/ResizeRequest.h
index 0acad6f..dc36ae3 100644
--- a/src/librbd/AsyncResizeRequest.h
+++ b/src/librbd/operation/ResizeRequest.h
@@ -1,9 +1,9 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab
-#ifndef CEPH_LIBRBD_ASYNC_RESIZE_REQUEST_H
-#define CEPH_LIBRBD_ASYNC_RESIZE_REQUEST_H
+#ifndef CEPH_LIBRBD_OPERATION_RESIZE_REQUEST_H
+#define CEPH_LIBRBD_OPERATION_RESIZE_REQUEST_H
 
-#include "librbd/AsyncRequest.h"
+#include "librbd/operation/Request.h"
 #include "include/xlist.h"
 
 namespace librbd
@@ -12,14 +12,14 @@ namespace librbd
 class ImageCtx;
 class ProgressContext;
 
-class AsyncResizeRequest : public AsyncRequest<>
-{
-public:
-  AsyncResizeRequest(ImageCtx &image_ctx, Context *on_finish, uint64_t new_size,
-                     ProgressContext &prog_ctx);
-  virtual ~AsyncResizeRequest();
+namespace operation {
 
-  virtual void send();
+template <typename ImageCtxT = ImageCtx>
+class ResizeRequest : public Request<ImageCtxT> {
+public:
+  ResizeRequest(ImageCtxT &image_ctx, Context *on_finish, uint64_t new_size,
+                ProgressContext &prog_ctx);
+  virtual ~ResizeRequest();
 
   inline bool shrinking() const {
     return m_new_size < m_original_size;
@@ -29,6 +29,16 @@ public:
     return m_new_size;
   }
 
+  virtual void send();
+
+protected:
+  virtual void send_op();
+  virtual bool should_complete(int r);
+
+  virtual journal::Event create_event() const {
+    return journal::ResizeEvent(0, m_new_size);
+  }
+
 private:
   /**
    * Resize goes through the following state machine to resize the image
@@ -80,10 +90,7 @@ private:
   ProgressContext &m_prog_ctx;
   uint64_t m_new_parent_overlap;
 
-  xlist<AsyncResizeRequest *>::item m_xlist_item;
-
-  virtual bool safely_cancel(int r);
-  virtual bool should_complete(int r);
+  typename xlist<ResizeRequest<ImageCtxT>*>::item m_xlist_item;
 
   void send_flush();
   void send_invalidate_cache();
@@ -97,6 +104,9 @@ private:
 
 };
 
+} // namespace operation
 } // namespace librbd
 
-#endif // CEPH_LIBRBD_ASYNC_RESIZE_REQUEST_H
+extern template class librbd::operation::ResizeRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_OPERATION_RESIZE_REQUEST_H
diff --git a/src/librbd/operation/SnapshotCreateRequest.cc b/src/librbd/operation/SnapshotCreateRequest.cc
new file mode 100644
index 0000000..63f19d4
--- /dev/null
+++ b/src/librbd/operation/SnapshotCreateRequest.cc
@@ -0,0 +1,323 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/operation/SnapshotCreateRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "librbd/AioImageRequestWQ.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageWatcher.h"
+#include "librbd/ObjectMap.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::SnapshotCreateRequest: "
+
+namespace librbd {
+namespace operation {
+
+namespace {
+
+template <typename I>
+std::ostream& operator<<(std::ostream& os,
+                         const typename SnapshotCreateRequest<I>::State& state) {
+  switch(state) {
+  case SnapshotCreateRequest<I>::STATE_SUSPEND_REQUESTS:
+    os << "SUSPEND_REQUESTS";
+    break;
+  case SnapshotCreateRequest<I>::STATE_SUSPEND_AIO:
+    os << "SUSPEND_AIO";
+    break;
+  case SnapshotCreateRequest<I>::STATE_ALLOCATE_SNAP_ID:
+    os << "ALLOCATE_SNAP_ID";
+    break;
+  case SnapshotCreateRequest<I>::STATE_CREATE_SNAP:
+    os << "CREATE_SNAP";
+    break;
+  case SnapshotCreateRequest<I>::STATE_CREATE_OBJECT_MAP:
+    os << "CREATE_OBJECT_MAP";
+    break;
+  case SnapshotCreateRequest<I>::STATE_RELEASE_SNAP_ID:
+    os << "RELEASE_SNAP_ID";
+    break;
+  default:
+    os << "UNKNOWN (" << static_cast<uint32_t>(state) << ")";
+    break;
+  }
+  return os;
+}
+
+} // anonymous namespace
+
+template <typename I>
+SnapshotCreateRequest<I>::SnapshotCreateRequest(I &image_ctx,
+                                                Context *on_finish,
+                                                const std::string &snap_name)
+  : Request<I>(image_ctx, on_finish), m_snap_name(snap_name), m_ret_val(0),
+    m_aio_suspended(false), m_requests_suspended(false),
+    m_snap_id(CEPH_NOSNAP), m_snap_created(false) {
+}
+
+template <typename I>
+void SnapshotCreateRequest<I>::send_op() {
+  send_suspend_requests();
+}
+
+template <typename I>
+bool SnapshotCreateRequest<I>::should_complete(int r) {
+  I &image_ctx = this->m_image_ctx;
+  CephContext *cct = image_ctx.cct;
+  ldout(cct, 5) << this << " " << __func__ << ": state=" << m_state << ", "
+                << "r=" << r << dendl;
+  int orig_result = r;
+  r = filter_state_return_code(r);
+  if (r < 0) {
+    lderr(cct) << "encountered error: " << cpp_strerror(r) << dendl;
+    if (m_ret_val == 0) {
+      m_ret_val = r;
+    }
+  }
+
+  if (m_ret_val < 0) {
+    return should_complete_error();
+  }
+
+  RWLock::RLocker owner_lock(image_ctx.owner_lock);
+  bool finished = false;
+  switch (m_state) {
+  case STATE_SUSPEND_REQUESTS:
+    send_suspend_aio();
+    break;
+  case STATE_SUSPEND_AIO:
+    send_allocate_snap_id();
+    break;
+  case STATE_ALLOCATE_SNAP_ID:
+    send_create_snap();
+    break;
+  case STATE_CREATE_SNAP:
+    if (orig_result == 0) {
+      update_snap_context();
+      finished = send_create_object_map();
+    } else {
+      assert(orig_result == -ESTALE);
+      send_allocate_snap_id();
+    }
+    break;
+  case STATE_CREATE_OBJECT_MAP:
+    finished = true;
+    break;
+  default:
+    assert(false);
+    break;
+  }
+
+  if (finished) {
+    resume_aio();
+    resume_requests();
+  }
+  return finished;
+}
+
+template <typename I>
+bool SnapshotCreateRequest<I>::should_complete_error() {
+  I &image_ctx = this->m_image_ctx;
+  CephContext *cct = image_ctx.cct;
+  lderr(cct) << this << " " << __func__ << ": "
+             << "ret_val=" << m_ret_val << dendl;
+
+  // only valid exit points during error recovery
+  bool finished = true;
+  if (m_state != STATE_RELEASE_SNAP_ID) {
+    finished = send_release_snap_id();
+  }
+
+  if (finished) {
+    resume_aio();
+    resume_requests();
+  }
+  return finished;
+}
+
+template <typename I>
+void SnapshotCreateRequest<I>::send_suspend_requests() {
+  I &image_ctx = this->m_image_ctx;
+  assert(image_ctx.owner_lock.is_locked());
+
+  // TODO suspend (shrink) resize to ensure consistent RBD mirror
+  send_suspend_aio();
+}
+
+template <typename I>
+void SnapshotCreateRequest<I>::send_suspend_aio() {
+  I &image_ctx = this->m_image_ctx;
+  assert(image_ctx.owner_lock.is_locked());
+
+  CephContext *cct = image_ctx.cct;
+  ldout(cct, 5) << this << " " << __func__ << dendl;
+
+  m_state = STATE_SUSPEND_AIO;
+  m_aio_suspended = true;
+
+  // can issue a re-entrant callback if no IO in-progress
+  image_ctx.aio_work_queue->block_writes(this->create_async_callback_context());
+}
+
+template <typename I>
+void SnapshotCreateRequest<I>::send_allocate_snap_id() {
+  I &image_ctx = this->m_image_ctx;
+  assert(image_ctx.owner_lock.is_locked());
+
+  CephContext *cct = image_ctx.cct;
+  ldout(cct, 5) << this << " " << __func__ << dendl;
+  m_state = STATE_ALLOCATE_SNAP_ID;
+
+  // TODO create an async version of selfmanaged_snap_create
+  int r = image_ctx.md_ctx.selfmanaged_snap_create(&m_snap_id);
+  this->async_complete(r);
+}
+
+template <typename I>
+void SnapshotCreateRequest<I>::send_create_snap() {
+  I &image_ctx = this->m_image_ctx;
+  assert(image_ctx.owner_lock.is_locked());
+  RWLock::RLocker snap_locker(image_ctx.snap_lock);
+  RWLock::RLocker parent_locker(image_ctx.parent_lock);
+
+  CephContext *cct = image_ctx.cct;
+  ldout(cct, 5) << this << " " << __func__ << dendl;
+  m_state = STATE_CREATE_SNAP;
+
+  // should have been canceled prior to releasing lock
+  assert(image_ctx.exclusive_lock == nullptr ||
+         image_ctx.exclusive_lock->is_lock_owner());
+
+  // save current size / parent info for creating snapshot record in ImageCtx
+  m_size = image_ctx.size;
+  m_parent_info = image_ctx.parent_md;
+
+  librados::ObjectWriteOperation op;
+  if (image_ctx.old_format) {
+    cls_client::old_snapshot_add(&op, m_snap_id, m_snap_name);
+  } else {
+    if (image_ctx.exclusive_lock != nullptr) {
+      image_ctx.exclusive_lock->assert_header_locked(&op);
+    }
+    cls_client::snapshot_add(&op, m_snap_id, m_snap_name);
+  }
+
+  librados::AioCompletion *rados_completion =
+    this->create_callback_completion();
+  int r = image_ctx.md_ctx.aio_operate(image_ctx.header_oid,
+                                         rados_completion, &op);
+  assert(r == 0);
+  rados_completion->release();
+}
+
+template <typename I>
+bool SnapshotCreateRequest<I>::send_create_object_map() {
+  I &image_ctx = this->m_image_ctx;
+  assert(image_ctx.owner_lock.is_locked());
+
+  {
+    RWLock::RLocker snap_locker(image_ctx.snap_lock);
+    RWLock::RLocker object_map_lock(image_ctx.object_map_lock);
+    if (image_ctx.object_map != nullptr) {
+      CephContext *cct = image_ctx.cct;
+      ldout(cct, 5) << this << " " << __func__ << dendl;
+      m_state = STATE_CREATE_OBJECT_MAP;
+
+      image_ctx.object_map->snapshot_add(m_snap_id,
+                                         this->create_callback_context());
+      return false;
+    }
+  }
+  return true;
+}
+
+template <typename I>
+bool SnapshotCreateRequest<I>::send_release_snap_id() {
+  I &image_ctx = this->m_image_ctx;
+  if (m_snap_id != CEPH_NOSNAP && !m_snap_created) {
+    CephContext *cct = image_ctx.cct;
+    ldout(cct, 5) << this << " " << __func__ << ": snap_id=" << m_snap_id
+                  << dendl;
+    m_state = STATE_RELEASE_SNAP_ID;
+
+    // TODO add async version of selfmanaged_snap_remove
+    int r = image_ctx.md_ctx.selfmanaged_snap_remove(m_snap_id);
+    m_snap_id = CEPH_NOSNAP;
+
+    this->async_complete(r);
+    return false;
+  }
+  return true;
+}
+
+template <typename I>
+void SnapshotCreateRequest<I>::resume_aio() {
+  I &image_ctx = this->m_image_ctx;
+  if (m_aio_suspended) {
+    CephContext *cct = image_ctx.cct;
+    ldout(cct, 5) << this << " " << __func__ << dendl;
+
+    image_ctx.aio_work_queue->unblock_writes();
+    m_aio_suspended = false;
+  }
+}
+
+template <typename I>
+void SnapshotCreateRequest<I>::resume_requests() {
+  I &image_ctx = this->m_image_ctx;
+  if (m_requests_suspended) {
+    CephContext *cct = image_ctx.cct;
+    ldout(cct, 5) << this << " " << __func__ << dendl;
+
+    // TODO
+    m_requests_suspended = false;
+  }
+}
+
+template <typename I>
+void SnapshotCreateRequest<I>::update_snap_context() {
+  I &image_ctx = this->m_image_ctx;
+  assert(image_ctx.owner_lock.is_locked());
+  m_snap_created = true;
+
+  RWLock::WLocker snap_locker(image_ctx.snap_lock);
+  if (image_ctx.old_format) {
+    return;
+  }
+
+  if (image_ctx.get_snap_info(m_snap_id) != NULL) {
+    return;
+  }
+
+  CephContext *cct = image_ctx.cct;
+  ldout(cct, 5) << this << " " << __func__ << dendl;
+
+  // should have been canceled prior to releasing lock
+  assert(image_ctx.exclusive_lock == nullptr ||
+         image_ctx.exclusive_lock->is_lock_owner());
+
+  // immediately add a reference to the new snapshot
+  image_ctx.add_snap(m_snap_name, m_snap_id, m_size, m_parent_info,
+                     RBD_PROTECTION_STATUS_UNPROTECTED, 0);
+
+  // immediately start using the new snap context if we
+  // own the exclusive lock
+  std::vector<snapid_t> snaps;
+  snaps.push_back(m_snap_id);
+  snaps.insert(snaps.end(), image_ctx.snapc.snaps.begin(),
+               image_ctx.snapc.snaps.end());
+
+  image_ctx.snapc.seq = m_snap_id;
+  image_ctx.snapc.snaps.swap(snaps);
+  image_ctx.data_ctx.selfmanaged_snap_set_write_ctx(
+    image_ctx.snapc.seq, image_ctx.snaps);
+}
+
+} // namespace operation
+} // namespace librbd
+
+template class librbd::operation::SnapshotCreateRequest<librbd::ImageCtx>;
diff --git a/src/librbd/operation/SnapshotCreateRequest.h b/src/librbd/operation/SnapshotCreateRequest.h
new file mode 100644
index 0000000..249bcaf
--- /dev/null
+++ b/src/librbd/operation/SnapshotCreateRequest.h
@@ -0,0 +1,127 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_OPERATION_SNAPSHOT_CREATE_REQUEST_H
+#define CEPH_LIBRBD_OPERATION_SNAPSHOT_CREATE_REQUEST_H
+
+#include "librbd/operation/Request.h"
+#include "librbd/parent_types.h"
+#include <iosfwd>
+#include <string>
+
+class Context;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace operation {
+
+template <typename ImageCtxT = ImageCtx>
+class SnapshotCreateRequest : public Request<ImageCtxT> {
+public:
+  /**
+   * Snap Create goes through the following state machine:
+   *
+   * @verbatim
+   *
+   *            <start>
+   *               |
+   *               v
+   *           STATE_SUSPEND_REQUESTS
+   *               |
+   *               v
+   *           STATE_SUSPEND_AIO * * * * * * * * * * * * *
+   *               |                                     *
+   *   (retry)     v                                     *
+   *   . . . > STATE_ALLOCATE_SNAP_ID  * *               *
+   *   .           |                     *               *
+   *   .           v                     *               *
+   *   . . . . STATE_CREATE_SNAP * * * * *               *
+   *               |                     *               *
+   *               v                     *               *
+   *           STATE_CREATE_OBJECT_MAP   *               *
+   *               |                     *               *
+   *               |                     *               *
+   *               |                     v               *
+   *               |              STATE_RELEASE_SNAP_ID  *
+   *               |                     |               *
+   *               |                     v               *
+   *               \----------------> <finish> < * * * * *
+   *
+   * @endverbatim
+   *
+   * The _CREATE_SNAP state may repeat back to the _ALLOCATE_SNAP_ID state
+   * if a stale snapshot context is allocated. If the create operation needs
+   * to abort, the error path is followed to record the result in the journal
+   * (if enabled) and bubble the originating error code back to the client.
+   */
+  enum State {
+    STATE_SUSPEND_REQUESTS,
+    STATE_SUSPEND_AIO,
+    STATE_ALLOCATE_SNAP_ID,
+    STATE_CREATE_SNAP,
+    STATE_CREATE_OBJECT_MAP,
+    STATE_RELEASE_SNAP_ID
+  };
+
+  SnapshotCreateRequest(ImageCtxT &image_ctx, Context *on_finish,
+		        const std::string &snap_name);
+
+protected:
+  virtual void send_op();
+  virtual bool should_complete(int r);
+
+  virtual int filter_return_code(int r) const {
+    if (m_ret_val < 0) {
+      return m_ret_val;
+    }
+    return r;
+  }
+
+  virtual journal::Event create_event() const {
+    return journal::SnapCreateEvent(0, m_snap_name);
+  }
+
+private:
+  std::string m_snap_name;
+  State m_state;
+
+  int m_ret_val;
+
+  bool m_aio_suspended;
+  bool m_requests_suspended;
+
+  uint64_t m_snap_id;
+  bool m_snap_created;
+
+  uint64_t m_size;
+  parent_info m_parent_info;
+
+  int filter_state_return_code(int r) const {
+    if (m_state == STATE_CREATE_SNAP && r == -ESTALE) {
+      return 0;
+    }
+    return r;
+  }
+
+  bool should_complete_error();
+
+  void send_suspend_requests();
+  void send_suspend_aio();
+  void send_allocate_snap_id();
+  void send_create_snap();
+  bool send_create_object_map();
+  bool send_release_snap_id();
+
+  void resume_aio();
+  void resume_requests();
+  void update_snap_context();
+};
+
+} // namespace operation
+} // namespace librbd
+
+extern template class librbd::operation::SnapshotCreateRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_OPERATION_SNAPSHOT_CREATE_REQUEST_H
diff --git a/src/librbd/operation/SnapshotProtectRequest.cc b/src/librbd/operation/SnapshotProtectRequest.cc
new file mode 100644
index 0000000..9ba415e
--- /dev/null
+++ b/src/librbd/operation/SnapshotProtectRequest.cc
@@ -0,0 +1,115 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/operation/SnapshotProtectRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "librbd/ImageCtx.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::SnapshotProtectRequest: "
+
+namespace librbd {
+namespace operation {
+
+namespace {
+
+template <typename I>
+std::ostream& operator<<(std::ostream& os,
+                         const typename SnapshotProtectRequest<I>::State& state) {
+  switch(state) {
+  case SnapshotProtectRequest<I>::STATE_PROTECT_SNAP:
+    os << "PROTECT_SNAP";
+    break;
+  }
+  return os;
+}
+
+} // anonymous namespace
+
+template <typename I>
+SnapshotProtectRequest<I>::SnapshotProtectRequest(I &image_ctx,
+                                                  Context *on_finish,
+                                                  const std::string &snap_name)
+  : Request<I>(image_ctx, on_finish), m_snap_name(snap_name) {
+}
+
+template <typename I>
+void SnapshotProtectRequest<I>::send_op() {
+  send_protect_snap();
+}
+
+template <typename I>
+bool SnapshotProtectRequest<I>::should_complete(int r) {
+  I &image_ctx = this->m_image_ctx;
+  CephContext *cct = image_ctx.cct;
+  ldout(cct, 5) << this << " " << __func__ << ": state=" << m_state << ", "
+                << "r=" << r << dendl;
+  if (r < 0) {
+    lderr(cct) << "encountered error: " << cpp_strerror(r) << dendl;
+  }
+  return true;
+}
+
+template <typename I>
+void SnapshotProtectRequest<I>::send_protect_snap() {
+  I &image_ctx = this->m_image_ctx;
+  assert(image_ctx.owner_lock.is_locked());
+
+  CephContext *cct = image_ctx.cct;
+  ldout(cct, 5) << this << " " << __func__ << dendl;
+
+  m_state = STATE_PROTECT_SNAP;
+
+  int r = verify_and_send_protect_snap();
+  if (r < 0) {
+    this->async_complete(r);
+    return;
+  }
+}
+
+template <typename I>
+int SnapshotProtectRequest<I>::verify_and_send_protect_snap() {
+  I &image_ctx = this->m_image_ctx;
+  RWLock::RLocker md_locker(image_ctx.md_lock);
+  RWLock::RLocker snap_locker(image_ctx.snap_lock);
+
+  CephContext *cct = image_ctx.cct;
+  if ((image_ctx.features & RBD_FEATURE_LAYERING) == 0) {
+    lderr(cct) << "image must support layering" << dendl;
+    return -ENOSYS;
+  }
+
+  uint64_t snap_id = image_ctx.get_snap_id(m_snap_name);
+  if (snap_id == CEPH_NOSNAP) {
+    return -ENOENT;
+  }
+
+  bool is_protected;
+  int r = image_ctx.is_snap_protected(snap_id, &is_protected);
+  if (r < 0) {
+    return r;
+  }
+
+  if (is_protected) {
+    return -EBUSY;
+  }
+
+  librados::ObjectWriteOperation op;
+  cls_client::set_protection_status(&op, snap_id,
+                                    RBD_PROTECTION_STATUS_PROTECTED);
+
+  librados::AioCompletion *rados_completion =
+    this->create_callback_completion();
+  r = image_ctx.md_ctx.aio_operate(image_ctx.header_oid, rados_completion,
+                                     &op);
+  assert(r == 0);
+  rados_completion->release();
+  return 0;
+}
+
+} // namespace operation
+} // namespace librbd
+
+template class librbd::operation::SnapshotProtectRequest<librbd::ImageCtx>;
diff --git a/src/librbd/operation/SnapshotProtectRequest.h b/src/librbd/operation/SnapshotProtectRequest.h
new file mode 100644
index 0000000..02484e9
--- /dev/null
+++ b/src/librbd/operation/SnapshotProtectRequest.h
@@ -0,0 +1,67 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_OPERATION_SNAPSHOT_PROTECT_REQUEST_H
+#define CEPH_LIBRBD_OPERATION_SNAPSHOT_PROTECT_REQUEST_H
+
+#include "librbd/operation/Request.h"
+#include <iosfwd>
+#include <string>
+
+class Context;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace operation {
+
+template <typename ImageCtxT = ImageCtx>
+class SnapshotProtectRequest : public Request<ImageCtxT> {
+public:
+  /**
+   * Snap Protect goes through the following state machine:
+   *
+   * @verbatim
+   *
+   * <start>
+   *    |
+   *    v
+   * STATE_PROTECT_SNAP
+   *    |
+   *    v
+   * <finish>
+   *
+   * @endverbatim
+   *
+   */
+  enum State {
+    STATE_PROTECT_SNAP
+  };
+
+  SnapshotProtectRequest(ImageCtxT &image_ctx, Context *on_finish,
+		         const std::string &snap_name);
+
+protected:
+  virtual void send_op();
+  virtual bool should_complete(int r);
+
+  virtual journal::Event create_event() const {
+    return journal::SnapProtectEvent(0, m_snap_name);
+  }
+
+private:
+  std::string m_snap_name;
+  State m_state;
+
+  void send_protect_snap();
+
+  int verify_and_send_protect_snap();
+};
+
+} // namespace operation
+} // namespace librbd
+
+extern template class librbd::operation::SnapshotProtectRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_OPERATION_SNAPSHOT_PROTECT_REQUEST_H
diff --git a/src/librbd/operation/SnapshotRemoveRequest.cc b/src/librbd/operation/SnapshotRemoveRequest.cc
new file mode 100644
index 0000000..81e0b96
--- /dev/null
+++ b/src/librbd/operation/SnapshotRemoveRequest.cc
@@ -0,0 +1,236 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/operation/SnapshotRemoveRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageWatcher.h"
+#include "librbd/ObjectMap.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::SnapshotRemoveRequest: "
+
+namespace librbd {
+namespace operation {
+
+namespace {
+
+template <typename I>
+std::ostream& operator<<(std::ostream& os,
+                         const typename SnapshotRemoveRequest<I>::State& state) {
+  switch(state) {
+  case SnapshotRemoveRequest<I>::STATE_REMOVE_OBJECT_MAP:
+    os << "REMOVE_OBJECT_MAP";
+    break;
+  case SnapshotRemoveRequest<I>::STATE_REMOVE_CHILD:
+    os << "REMOVE_CHILD";
+    break;
+  case SnapshotRemoveRequest<I>::STATE_REMOVE_SNAP:
+    os << "REMOVE_SNAP";
+    break;
+  case SnapshotRemoveRequest<I>::STATE_RELEASE_SNAP_ID:
+    os << "RELEASE_SNAP_ID";
+    break;
+  default:
+    os << "UNKNOWN (" << static_cast<uint32_t>(state) << ")";
+    break;
+  }
+  return os;
+}
+
+} // anonymous namespace
+
+template <typename I>
+SnapshotRemoveRequest<I>::SnapshotRemoveRequest(I &image_ctx,
+						Context *on_finish,
+						const std::string &snap_name,
+						uint64_t snap_id)
+  : Request<I>(image_ctx, on_finish), m_snap_name(snap_name),
+    m_snap_id(snap_id) {
+}
+
+template <typename I>
+void SnapshotRemoveRequest<I>::send_op() {
+  send_remove_object_map();
+}
+
+template <typename I>
+bool SnapshotRemoveRequest<I>::should_complete(int r) {
+  I &image_ctx = this->m_image_ctx;
+  CephContext *cct = image_ctx.cct;
+  ldout(cct, 5) << this << " " << __func__ << ": state=" << m_state << ", "
+                << "r=" << r << dendl;
+  r = filter_state_return_code(r);
+  if (r < 0) {
+    return true;
+  }
+
+  RWLock::RLocker owner_lock(image_ctx.owner_lock);
+  bool finished = false;
+  switch (m_state) {
+  case STATE_REMOVE_OBJECT_MAP:
+    send_remove_child();
+    break;
+  case STATE_REMOVE_CHILD:
+    send_remove_snap();
+    break;
+  case STATE_REMOVE_SNAP:
+    remove_snap_context();
+    send_release_snap_id();
+    break;
+  case STATE_RELEASE_SNAP_ID:
+    finished = true;
+    break;
+  default:
+    assert(false);
+    break;
+  }
+
+  return finished;
+}
+
+template <typename I>
+void SnapshotRemoveRequest<I>::send_remove_object_map() {
+  I &image_ctx = this->m_image_ctx;
+  assert(image_ctx.owner_lock.is_locked());
+
+  {
+    RWLock::WLocker snap_locker(image_ctx.snap_lock);
+    RWLock::RLocker object_map_locker(image_ctx.object_map_lock);
+    if (image_ctx.object_map != nullptr) {
+      CephContext *cct = image_ctx.cct;
+      ldout(cct, 5) << this << " " << __func__ << dendl;
+      m_state = STATE_REMOVE_OBJECT_MAP;
+
+      image_ctx.object_map->snapshot_remove(
+        m_snap_id, this->create_callback_context());
+      return;
+    }
+  }
+  send_remove_child();
+}
+
+template <typename I>
+void SnapshotRemoveRequest<I>::send_remove_child() {
+  I &image_ctx = this->m_image_ctx;
+  assert(image_ctx.owner_lock.is_locked());
+
+  CephContext *cct = image_ctx.cct;
+  {
+    RWLock::RLocker snap_locker(image_ctx.snap_lock);
+    RWLock::RLocker parent_locker(image_ctx.parent_lock);
+
+    parent_spec our_pspec;
+    int r = image_ctx.get_parent_spec(m_snap_id, &our_pspec);
+    if (r < 0) {
+      lderr(cct) << "failed to retrieve parent spec" << dendl;
+      this->async_complete(r);
+      return;
+    }
+
+    if (image_ctx.parent_md.spec != our_pspec &&
+        (scan_for_parents(our_pspec) == -ENOENT)) {
+      // no other references to the parent image
+      ldout(cct, 5) << this << " " << __func__ << dendl;
+      m_state = STATE_REMOVE_CHILD;
+
+      librados::ObjectWriteOperation op;
+      cls_client::remove_child(&op, our_pspec, image_ctx.id);
+
+      librados::AioCompletion *rados_completion = this->create_callback_completion();
+      r = image_ctx.md_ctx.aio_operate(RBD_CHILDREN, rados_completion, &op);
+      assert(r == 0);
+      rados_completion->release();
+      return;
+    }
+  }
+
+  // HEAD image or other snapshots still associated with parent
+  send_remove_snap();
+}
+
+template <typename I>
+void SnapshotRemoveRequest<I>::send_remove_snap() {
+  I &image_ctx = this->m_image_ctx;
+  assert(image_ctx.owner_lock.is_locked());
+
+  CephContext *cct = image_ctx.cct;
+  ldout(cct, 5) << this << " " << __func__ << dendl;
+  m_state = STATE_REMOVE_SNAP;
+
+  librados::ObjectWriteOperation op;
+  if (image_ctx.old_format) {
+    cls_client::old_snapshot_remove(&op, m_snap_name);
+  } else {
+    if (image_ctx.exclusive_lock != nullptr &&
+        image_ctx.exclusive_lock->is_lock_owner()) {
+      image_ctx.exclusive_lock->assert_header_locked(&op);
+    }
+    cls_client::snapshot_remove(&op, m_snap_id);
+  }
+
+  librados::AioCompletion *rados_completion = this->create_callback_completion();
+  int r = image_ctx.md_ctx.aio_operate(image_ctx.header_oid,
+                                       rados_completion, &op);
+  assert(r == 0);
+  rados_completion->release();
+}
+
+template <typename I>
+void SnapshotRemoveRequest<I>::send_release_snap_id() {
+  I &image_ctx = this->m_image_ctx;
+  assert(image_ctx.owner_lock.is_locked());
+
+  CephContext *cct = image_ctx.cct;
+  ldout(cct, 5) << this << " " << __func__ << ": "
+                << "snap_name=" << m_snap_name << ", "
+                << "snap_id=" << m_snap_id << dendl;
+  m_state = STATE_RELEASE_SNAP_ID;
+
+  // TODO add async version of selfmanaged_snap_remove
+  int r = image_ctx.md_ctx.selfmanaged_snap_remove(m_snap_id);
+  this->async_complete(r);
+}
+
+template <typename I>
+void SnapshotRemoveRequest<I>::remove_snap_context() {
+  I &image_ctx = this->m_image_ctx;
+  CephContext *cct = image_ctx.cct;
+  ldout(cct, 5) << this << " " << __func__ << dendl;
+
+  RWLock::WLocker snap_locker(image_ctx.snap_lock);
+  image_ctx.rm_snap(m_snap_name, m_snap_id);
+}
+
+template <typename I>
+int SnapshotRemoveRequest<I>::scan_for_parents(parent_spec &pspec) {
+  I &image_ctx = this->m_image_ctx;
+  assert(image_ctx.snap_lock.is_locked());
+  assert(image_ctx.parent_lock.is_locked());
+
+  if (pspec.pool_id != -1) {
+    map<uint64_t, SnapInfo>::iterator it;
+    for (it = image_ctx.snap_info.begin();
+         it != image_ctx.snap_info.end(); ++it) {
+      // skip our snap id (if checking base image, CEPH_NOSNAP won't match)
+      if (it->first == m_snap_id) {
+        continue;
+      }
+      if (it->second.parent.spec == pspec) {
+        break;
+      }
+    }
+    if (it == image_ctx.snap_info.end()) {
+      return -ENOENT;
+    }
+  }
+  return 0;
+}
+
+} // namespace operation
+} // namespace librbd
+
+template class librbd::operation::SnapshotRemoveRequest<librbd::ImageCtx>;
diff --git a/src/librbd/operation/SnapshotRemoveRequest.h b/src/librbd/operation/SnapshotRemoveRequest.h
new file mode 100644
index 0000000..ea950a5
--- /dev/null
+++ b/src/librbd/operation/SnapshotRemoveRequest.h
@@ -0,0 +1,96 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_OPERATION_SNAPSHOT_REMOVE_REQUEST_H
+#define CEPH_LIBRBD_OPERATION_SNAPSHOT_REMOVE_REQUEST_H
+
+#include "librbd/operation/Request.h"
+#include "librbd/parent_types.h"
+#include <string>
+
+class Context;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace operation {
+
+template <typename ImageCtxT = ImageCtx>
+class SnapshotRemoveRequest : public Request<ImageCtxT> {
+public:
+  /**
+   * Snap Remove goes through the following state machine:
+   *
+   * @verbatim
+   *
+   * <start> ------\
+   *  .            |
+   *  .            v
+   *  .     STATE_REMOVE_OBJECT_MAP
+   *  .            |            .
+   *  .            v            .
+   *  . . > STATE_REMOVE_CHILD  .
+   *  .            |            .
+   *  .            |      . . . .
+   *  .            |      .
+   *  .            v      v
+   *  . . > STATE_REMOVE_SNAP
+   *               |
+   *               v
+   *        STATE_RELEASE_SNAP_ID
+   *               |
+   *               v
+   *           <finish>
+   *
+   * @endverbatim
+   *
+   * The _REMOVE_OBJECT_MAP state is skipped if the object map is not enabled.
+   * The _REMOVE_CHILD state is skipped if the parent is still in-use.
+   */
+  enum State {
+    STATE_REMOVE_OBJECT_MAP,
+    STATE_REMOVE_CHILD,
+    STATE_REMOVE_SNAP,
+    STATE_RELEASE_SNAP_ID
+  };
+
+  SnapshotRemoveRequest(ImageCtxT &image_ctx, Context *on_finish,
+		        const std::string &snap_name, uint64_t snap_id);
+
+protected:
+  virtual void send_op();
+  virtual bool should_complete(int r);
+
+  virtual journal::Event create_event() const {
+    return journal::SnapRemoveEvent(0, m_snap_name);
+  }
+
+private:
+  std::string m_snap_name;
+  uint64_t m_snap_id;
+  State m_state;
+
+  int filter_state_return_code(int r) const {
+    if (m_state == STATE_REMOVE_CHILD && r == -ENOENT) {
+      return 0;
+    }
+    return r;
+  }
+
+  void send_remove_object_map();
+  void send_remove_child();
+  void send_remove_snap();
+  void send_release_snap_id();
+
+  void remove_snap_context();
+  int scan_for_parents(parent_spec &pspec);
+
+};
+
+} // namespace operation
+} // namespace librbd
+
+extern template class librbd::operation::SnapshotRemoveRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_OPERATION_SNAPSHOT_REMOVE_REQUEST_H
diff --git a/src/librbd/operation/SnapshotRenameRequest.cc b/src/librbd/operation/SnapshotRenameRequest.cc
new file mode 100644
index 0000000..ec7eb65
--- /dev/null
+++ b/src/librbd/operation/SnapshotRenameRequest.cc
@@ -0,0 +1,91 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/operation/SnapshotRenameRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageWatcher.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::SnapshotRenameRequest: "
+
+namespace librbd {
+namespace operation {
+
+namespace {
+
+template <typename I>  // Streams a readable name for a SnapshotRenameRequest state (used for logging).
+std::ostream& operator<<(std::ostream& os,
+                         const typename SnapshotRenameRequest<I>::State& state) {
+  switch(state) {  // NOTE(review): I is in a non-deduced context, so "os << m_state" may not select this overload -- confirm
+  case SnapshotRenameRequest<I>::STATE_RENAME_SNAP:
+    return os << "RENAME_SNAP";
+  default:  // same fallback as the rollback/unprotect printers: never emit an empty name
+    return os << "UNKNOWN (" << static_cast<uint32_t>(state) << ")";
+  }
+}
+
+} // anonymous namespace
+
+template <typename I>  // Stores the snapshot id and name; m_state is not initialised here.
+SnapshotRenameRequest<I>::SnapshotRenameRequest(I &image_ctx,
+						Context *on_finish,
+						uint64_t snap_id,
+						const std::string &snap_name)
+  : Request<I>(image_ctx, on_finish), m_snap_id(snap_id), m_snap_name(snap_name) {
+}
+
+template <typename I>  // Entry point of the operation: starts the single rename step.
+void SnapshotRenameRequest<I>::send_op() {
+  send_rename_snap();
+}
+
+template <typename I>  // Completion hook: logs the result r and reports the request as finished.
+bool SnapshotRenameRequest<I>::should_complete(int r) {
+  I &image_ctx = this->m_image_ctx;
+  CephContext *cct = image_ctx.cct;
+  ldout(cct, 5) << this << " " << __func__ << ": state=" << m_state << ", "
+                << "r=" << r << dendl;
+  if (r < 0) {  // errors are only logged here; r itself is not modified
+    lderr(cct) << "encountered error: " << cpp_strerror(r) << dendl;
+  }
+  return true;  // always finished: STATE_RENAME_SNAP is the only state
+}
+
+template <typename I>  // Builds the rename write op and submits it asynchronously against the image header.
+void SnapshotRenameRequest<I>::send_rename_snap() {
+  I &image_ctx = this->m_image_ctx;
+  assert(image_ctx.owner_lock.is_locked());  // caller must already hold owner_lock
+  RWLock::RLocker md_locker(image_ctx.md_lock);
+  RWLock::RLocker snap_locker(image_ctx.snap_lock);
+
+  CephContext *cct = image_ctx.cct;
+  ldout(cct, 5) << this << " " << __func__ << dendl;
+
+  m_state = STATE_RENAME_SNAP;
+
+  librados::ObjectWriteOperation op;
+  if (image_ctx.old_format) {  // old-format images use a separate class method
+    cls_client::old_snapshot_rename(&op, m_snap_id, m_snap_name);
+  } else {
+    if (image_ctx.exclusive_lock != nullptr &&
+        image_ctx.exclusive_lock->is_lock_owner()) {
+      image_ctx.exclusive_lock->assert_header_locked(&op);  // only added when this client owns the lock
+    }
+    cls_client::snapshot_rename(&op, m_snap_id, m_snap_name);
+  }
+
+  librados::AioCompletion *rados_completion = this->create_callback_completion();
+  int r = image_ctx.md_ctx.aio_operate(image_ctx.header_oid,
+                                       rados_completion, &op);
+  assert(r == 0);  // submission is expected to succeed; the op result arrives via the completion
+  rados_completion->release();  // drop the local reference once the op is queued
+}
+
+} // namespace operation
+} // namespace librbd
+
+template class librbd::operation::SnapshotRenameRequest<librbd::ImageCtx>;
diff --git a/src/librbd/operation/SnapshotRenameRequest.h b/src/librbd/operation/SnapshotRenameRequest.h
new file mode 100644
index 0000000..19a72c7
--- /dev/null
+++ b/src/librbd/operation/SnapshotRenameRequest.h
@@ -0,0 +1,66 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_OPERATION_SNAPSHOT_RENAME_REQUEST_H
+#define CEPH_LIBRBD_OPERATION_SNAPSHOT_RENAME_REQUEST_H
+
+#include "librbd/operation/Request.h"
+#include <iosfwd>
+#include <string>
+
+class Context;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace operation {
+
+template <typename ImageCtxT = ImageCtx>
+class SnapshotRenameRequest : public Request<ImageCtxT> {
+public:
+  /**
+   * Snap Rename goes through the following state machine:
+   *
+   * @verbatim
+   *
+   * <start>
+   *    |
+   *    v
+   * STATE_RENAME_SNAP
+   *    |
+   *    v
+   * <finish>
+   *
+   * @endverbatim
+   * STATE_RENAME_SNAP is the only state of this request.
+   */
+  enum State {
+    STATE_RENAME_SNAP
+  };
+
+  SnapshotRenameRequest(ImageCtxT &image_ctx, Context *on_finish,
+                        uint64_t snap_id, const std::string &snap_name);
+
+  virtual journal::Event create_event() const {  // journal event carrying the snap id and name
+    return journal::SnapRenameEvent(0, m_snap_id, m_snap_name);
+  }
+
+protected:
+  virtual void send_op();  // starts the state machine
+  virtual bool should_complete(int r);  // completion hook; r is the result of the last step
+
+private:
+  uint64_t m_snap_id;  // id of the snapshot to rename
+  std::string m_snap_name;  // name passed to the rename call (presumably the new name -- confirm with callers)
+  State m_state;  // current state; not set by the constructor
+
+  void send_rename_snap();
+};
+
+} // namespace operation
+} // namespace librbd
+
+extern template class librbd::operation::SnapshotRenameRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_OPERATION_SNAPSHOT_RENAME_REQUEST_H
diff --git a/src/librbd/operation/SnapshotRollbackRequest.cc b/src/librbd/operation/SnapshotRollbackRequest.cc
new file mode 100644
index 0000000..f7df78b
--- /dev/null
+++ b/src/librbd/operation/SnapshotRollbackRequest.cc
@@ -0,0 +1,273 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/operation/SnapshotRollbackRequest.h"
+#include "include/rados/librados.hpp"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "librbd/AsyncObjectThrottle.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ObjectMap.h"
+#include "librbd/Utils.h"
+#include "librbd/operation/ResizeRequest.h"
+#include "osdc/Striper.h"
+#include <boost/lambda/bind.hpp>
+#include <boost/lambda/construct.hpp>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::SnapshotRollbackRequest: "
+
+namespace librbd {
+namespace operation {
+
+namespace {
+
+template <typename I>  // Streams a readable name for a SnapshotRollbackRequest state (used for logging).
+std::ostream& operator<<(std::ostream& os,
+                         const typename SnapshotRollbackRequest<I>::State& state) {
+  switch(state) {
+  case SnapshotRollbackRequest<I>::STATE_RESIZE_IMAGE:
+    os << "RESIZE_IMAGE";
+    break;
+  case SnapshotRollbackRequest<I>::STATE_ROLLBACK_OBJECT_MAP:
+    os << "ROLLBACK_OBJECT_MAP";
+    break;
+  case SnapshotRollbackRequest<I>::STATE_ROLLBACK_OBJECTS:
+    os << "ROLLBACK_OBJECTS";
+    break;
+  case SnapshotRollbackRequest<I>::STATE_REFRESH_OBJECT_MAP:
+    os << "REFRESH_OBJECT_MAP";
+    break;
+  case SnapshotRollbackRequest<I>::STATE_INVALIDATE_CACHE:
+    os << "INVALIDATE_CACHE";
+    break;
+  default:  // any value outside the enum is printed numerically
+    os << "UNKNOWN (" << static_cast<uint32_t>(state) << ")";
+    break;
+  }
+  return os;
+}
+
+template <typename I>  // Throttled per-object context: rolls one data object back to a snapshot.
+class C_RollbackObject : public C_AsyncObjectThrottle<I> {
+public:
+  C_RollbackObject(AsyncObjectThrottle<I> &throttle, I *image_ctx,
+                   uint64_t snap_id, uint64_t object_num)
+    : C_AsyncObjectThrottle<I>(throttle, *image_ctx), m_snap_id(snap_id),
+      m_object_num(object_num) {
+  }
+
+  virtual int send() {  // queues the rollback of object m_object_num; always returns 0
+    I &image_ctx = this->m_image_ctx;  // base is now C_AsyncObjectThrottle<I>, so this binds for any I
+    CephContext *cct = image_ctx.cct;
+    ldout(cct, 20) << "C_RollbackObject: " << __func__ << ": object_num="
+                   << m_object_num << dendl;
+
+    std::string oid = image_ctx.get_object_name(m_object_num);
+
+    librados::ObjectWriteOperation op;
+    op.selfmanaged_snap_rollback(m_snap_id);
+
+    librados::AioCompletion *rados_completion =
+      util::create_rados_safe_callback(this);  // this context is handed to the completion as its callback
+    image_ctx.data_ctx.aio_operate(oid, rados_completion, &op);
+    rados_completion->release();  // drop the local reference once the op is queued
+    return 0;
+  }
+
+private:
+  uint64_t m_snap_id;  // snapshot to roll the object back to
+  uint64_t m_object_num;  // object number within the image
+};
+
+} // anonymous namespace
+
+template <typename I>  // Stores the rollback parameters; m_object_map starts null and m_state is not initialised here.
+SnapshotRollbackRequest<I>::SnapshotRollbackRequest(I &image_ctx,
+                                                    Context *on_finish,
+                                                    const std::string &snap_name,
+                                                    uint64_t snap_id,
+                                                    uint64_t snap_size,
+                                                    ProgressContext &prog_ctx)
+  : Request<I>(image_ctx, on_finish), m_snap_name(snap_name),
+    m_snap_id(snap_id), m_snap_size(snap_size), m_prog_ctx(prog_ctx),
+    m_object_map(nullptr) {
+}
+
+template <typename I>  // Frees whatever object map this request still holds (after apply() that is the image's previous map).
+SnapshotRollbackRequest<I>::~SnapshotRollbackRequest() {
+  delete m_object_map;
+}
+
+template <typename I>  // Entry point of the operation: begins with the (possibly skipped) resize step.
+void SnapshotRollbackRequest<I>::send_op() {
+  send_resize_image();
+}
+
+template <typename I>  // Completion hook: advances the state machine; returns true once the request is finished.
+bool SnapshotRollbackRequest<I>::should_complete(int r) {
+  I &image_ctx = this->m_image_ctx;
+  CephContext *cct = image_ctx.cct;
+  ldout(cct, 5) << this << " " << __func__ << ": state=" << m_state << ", "
+                << "r=" << r << dendl;
+  if (r < 0) {  // any failed step ends the request immediately
+    lderr(cct) << "encountered error: " << cpp_strerror(r) << dendl;
+    return true;
+  }
+
+  RWLock::RLocker owner_lock(image_ctx.owner_lock);  // the send_* helpers assert that owner_lock is held
+  bool finished = false;
+  switch (m_state) {
+  case STATE_RESIZE_IMAGE:
+    send_rollback_object_map();
+    break;
+  case STATE_ROLLBACK_OBJECT_MAP:
+    send_rollback_objects();
+    break;
+  case STATE_ROLLBACK_OBJECTS:
+    finished = send_refresh_object_map();  // true when the remaining steps were all skipped
+    break;
+  case STATE_REFRESH_OBJECT_MAP:
+    finished = send_invalidate_cache();  // true when there is no cache to invalidate
+    break;
+  case STATE_INVALIDATE_CACHE:
+    finished = true;
+    break;
+  default:
+    assert(false);
+    break;
+  }
+  return finished;
+}
+
+template <typename I>  // Resizes the head image to the snapshot size, or skips ahead when the sizes already match.
+void SnapshotRollbackRequest<I>::send_resize_image() {
+  I &image_ctx = this->m_image_ctx;
+  assert(image_ctx.owner_lock.is_locked());  // caller must already hold owner_lock
+
+  uint64_t current_size;
+  {
+    RWLock::RLocker snap_locker(image_ctx.snap_lock);  // snap_lock is held only for the size lookup
+    current_size = image_ctx.get_image_size(CEPH_NOSNAP);
+  }
+
+  if (current_size == m_snap_size) {  // no resize needed: go straight to the object map step
+    send_rollback_object_map();
+    return;
+  }
+
+  CephContext *cct = image_ctx.cct;
+  ldout(cct, 5) << this << " " << __func__ << dendl;
+  m_state = STATE_RESIZE_IMAGE;
+
+  ResizeRequest<I> *req = new ResizeRequest<I>(image_ctx,
+                                               this->create_callback_context(),
+                                               m_snap_size, m_no_op_prog_ctx);  // resize gets m_no_op_prog_ctx, not the caller's m_prog_ctx
+  req->send();
+}
+
+template <typename I>  // Rolls the image's object map back to m_snap_id, or skips ahead when the image has none.
+void SnapshotRollbackRequest<I>::send_rollback_object_map() {
+  I &image_ctx = this->m_image_ctx;
+  assert(image_ctx.owner_lock.is_locked());  // caller must already hold owner_lock
+
+  {
+    RWLock::RLocker snap_locker(image_ctx.snap_lock);
+    RWLock::WLocker object_map_lock(image_ctx.object_map_lock);
+    if (image_ctx.object_map != nullptr) {
+      CephContext *cct = image_ctx.cct;
+      ldout(cct, 5) << this << " " << __func__ << dendl;
+      m_state = STATE_ROLLBACK_OBJECT_MAP;
+
+      image_ctx.object_map->rollback(m_snap_id,
+                                     this->create_callback_context());
+      return;  // should_complete() continues with the object rollback
+    }
+  }
+
+  send_rollback_objects();  // no object map: reached only after both locks above are released
+}
+
+template <typename I>  // Rolls back every data object of the image through a throttled batch of C_RollbackObject ops.
+void SnapshotRollbackRequest<I>::send_rollback_objects() {
+  I &image_ctx = this->m_image_ctx;
+  assert(image_ctx.owner_lock.is_locked());  // caller must already hold owner_lock
+
+  CephContext *cct = image_ctx.cct;
+  ldout(cct, 5) << this << " " << __func__ << dendl;
+  m_state = STATE_ROLLBACK_OBJECTS;
+
+  uint64_t num_objects;
+  {
+    RWLock::RLocker snap_locker(image_ctx.snap_lock);  // snap_lock is held only for the size lookup
+    num_objects = Striper::get_num_objects(image_ctx.layout,
+                                           image_ctx.get_current_size());
+  }
+
+  Context *ctx = this->create_callback_context();
+  typename AsyncObjectThrottle<I>::ContextFactory context_factory(
+    boost::lambda::bind(boost::lambda::new_ptr<C_RollbackObject<I> >(),
+      boost::lambda::_1, &image_ctx, m_snap_id, boost::lambda::_2));  // _1 = throttle, _2 = object number
+  AsyncObjectThrottle<I> *throttle = new AsyncObjectThrottle<I>(  // <I>, not <>: must match the declared pointer type
+    this, image_ctx, context_factory, ctx, &m_prog_ctx, 0, num_objects);
+  throttle->start_ops(image_ctx.concurrent_management_ops);
+}
+
+template <typename I>  // Opens a fresh head object map; returns true only if the whole request finished synchronously.
+bool SnapshotRollbackRequest<I>::send_refresh_object_map() {
+  I &image_ctx = this->m_image_ctx;
+  assert(image_ctx.owner_lock.is_locked());  // caller must already hold owner_lock
+
+  if (image_ctx.object_map == nullptr) {  // no object map: skip directly to the cache step
+    return send_invalidate_cache();
+  }
+
+  CephContext *cct = image_ctx.cct;
+  ldout(cct, 5) << this << " " << __func__ << dendl;
+  m_state = STATE_REFRESH_OBJECT_MAP;
+
+  m_object_map = image_ctx.create_object_map(CEPH_NOSNAP);  // kept here until apply() swaps it into the image
+
+  image_ctx.owner_lock.put_read();  // owner_lock is dropped around open() and re-taken below
+  Context *ctx = this->create_callback_context();
+  m_object_map->open(ctx);
+  image_ctx.owner_lock.get_read();
+
+  return false;  // not finished: waiting for the open() callback
+}
+
+template <typename I>  // Applies the refreshed object map, then invalidates the cache; returns true if there is no cache.
+bool SnapshotRollbackRequest<I>::send_invalidate_cache() {
+  I &image_ctx = this->m_image_ctx;
+  assert(image_ctx.owner_lock.is_locked());  // caller must already hold owner_lock
+
+  apply();  // runs whether or not a cache exists
+
+  if (image_ctx.object_cacher == NULL) {  // nothing to invalidate: the request is finished
+    return true;
+  }
+
+  CephContext *cct = image_ctx.cct;
+  ldout(cct, 5) << this << " " << __func__ << dendl;
+  m_state = STATE_INVALIDATE_CACHE;
+
+  image_ctx.invalidate_cache(this->create_callback_context());
+  return false;  // not finished: waiting for the invalidate callback
+}
+
+template <typename I>  // Swaps m_object_map with the image's object map, if the image has one.
+void SnapshotRollbackRequest<I>::apply() {
+  I &image_ctx = this->m_image_ctx;
+
+  assert(image_ctx.owner_lock.is_locked());  // caller must already hold owner_lock
+  RWLock::WLocker snap_locker(image_ctx.snap_lock);
+  if (image_ctx.object_map != nullptr) {
+    std::swap(m_object_map, image_ctx.object_map);  // the image's previous map is later deleted by the destructor
+  }
+}
+
+} // namespace operation
+} // namespace librbd
+
+template class librbd::operation::SnapshotRollbackRequest<librbd::ImageCtx>;
diff --git a/src/librbd/operation/SnapshotRollbackRequest.h b/src/librbd/operation/SnapshotRollbackRequest.h
new file mode 100644
index 0000000..54dd742
--- /dev/null
+++ b/src/librbd/operation/SnapshotRollbackRequest.h
@@ -0,0 +1,101 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_OPERATION_SNAPSHOT_ROLLBACK_REQUEST_H
+#define CEPH_LIBRBD_OPERATION_SNAPSHOT_ROLLBACK_REQUEST_H
+
+#include "librbd/operation/Request.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/internal.h"
+#include <string>
+
+class Context;
+
+namespace librbd {
+
+class ImageCtx;
+class ProgressContext;
+
+namespace operation {
+
+template <typename ImageCtxT = ImageCtx>
+class SnapshotRollbackRequest : public Request<ImageCtxT> {
+public:
+  /**
+   * Snap Rollback goes through the following state machine:
+   *
+   * @verbatim
+   *
+   * <start> ---------\
+   *  .               |
+   *  .               v
+   *  .         STATE_RESIZE_IMAGE
+   *  .               |
+   *  . (skip path)   v
+   *  . . . . > STATE_ROLLBACK_OBJECT_MAP
+   *  .               |
+   *  .               v
+   *  . . . . > STATE_ROLLBACK_OBJECTS . . . . . . . . . . .
+   *                  |                                    .
+   *                  v                                    .
+   *            STATE_REFRESH_OBJECT_MAP  (skip if object  .
+   *                  |                    map disabled)   .
+   *                  v                                    .
+   *            STATE_INVALIDATE_CACHE                     .
+   *                  |                                    .
+   *                  v                                    .
+   *              <finish> < . . . . . . . . . . . . . . . .
+   *
+   * @endverbatim
+   *
+   * The _RESIZE_IMAGE state is skipped if the image doesn't need to be resized.
+   * The _ROLLBACK_OBJECT_MAP state is skipped if the object map isn't enabled.
+   * The _INVALIDATE_CACHE state is skipped if the cache isn't enabled.
+   */
+  enum State {
+    STATE_RESIZE_IMAGE,
+    STATE_ROLLBACK_OBJECT_MAP,
+    STATE_ROLLBACK_OBJECTS,
+    STATE_REFRESH_OBJECT_MAP,
+    STATE_INVALIDATE_CACHE
+  };
+
+  SnapshotRollbackRequest(ImageCtxT &image_ctx, Context *on_finish,
+                          const std::string &snap_name, uint64_t snap_id,
+                          uint64_t snap_size, ProgressContext &prog_ctx);
+  virtual ~SnapshotRollbackRequest();  // deletes m_object_map
+
+protected:
+  virtual void send_op();  // starts the state machine
+  virtual bool should_complete(int r);  // completion hook; returns true when the request is finished
+
+  virtual journal::Event create_event() const {  // journal event carrying the snapshot name
+    return journal::SnapRollbackEvent(0, m_snap_name);
+  }
+
+private:
+  std::string m_snap_name;  // snapshot name, used for the journal event
+  uint64_t m_snap_id;  // snapshot the object map and objects are rolled back to
+  uint64_t m_snap_size;  // target size for the resize step
+  ProgressContext &m_prog_ctx;  // caller-owned progress sink; stored by reference
+
+  NoOpProgressContext m_no_op_prog_ctx;  // handed to the nested resize request
+  State m_state;  // current state; not set by the constructor
+
+  decltype(ImageCtxT::object_map) m_object_map;  // same type as the image's object_map member; owned by this request
+
+  void send_resize_image();
+  void send_rollback_object_map();
+  void send_rollback_objects();
+  bool send_refresh_object_map();  // returns true if the request finished synchronously
+  bool send_invalidate_cache();  // returns true if the request finished synchronously
+
+  void apply();  // swaps m_object_map into the image
+};
+
+} // namespace operation
+} // namespace librbd
+
+extern template class librbd::operation::SnapshotRollbackRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_OPERATION_SNAPSHOT_ROLLBACK_REQUEST_H
diff --git a/src/librbd/operation/SnapshotUnprotectRequest.cc b/src/librbd/operation/SnapshotUnprotectRequest.cc
new file mode 100644
index 0000000..65b7167
--- /dev/null
+++ b/src/librbd/operation/SnapshotUnprotectRequest.cc
@@ -0,0 +1,350 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/operation/SnapshotUnprotectRequest.h"
+#include "include/rados/librados.hpp"
+#include "include/stringify.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "librbd/AsyncObjectThrottle.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/internal.h"
+#include "librbd/parent_types.h"
+#include "librbd/Utils.h"
+#include <list>
+#include <set>
+#include <vector>
+#include <boost/lambda/bind.hpp>
+#include <boost/lambda/construct.hpp>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::SnapshotUnprotectRequest: "
+
+namespace librbd {
+namespace operation {
+
+namespace {
+
+typedef std::pair<int64_t, std::string> Pool;
+typedef std::vector<Pool> Pools;
+
+template <typename I>  // Streams a readable name for a SnapshotUnprotectRequest state (used for logging).
+std::ostream& operator<<(std::ostream& os,
+                         const typename SnapshotUnprotectRequest<I>::State& state) {
+  switch(state) {
+  case SnapshotUnprotectRequest<I>::STATE_UNPROTECT_SNAP_START:
+    os << "UNPROTECT_SNAP_START";
+    break;
+  case SnapshotUnprotectRequest<I>::STATE_SCAN_POOL_CHILDREN:
+    os << "SCAN_POOL_CHILDREN";
+    break;
+  case SnapshotUnprotectRequest<I>::STATE_UNPROTECT_SNAP_FINISH:
+    os << "UNPROTECT_SNAP_FINISH";
+    break;
+  case SnapshotUnprotectRequest<I>::STATE_UNPROTECT_SNAP_ROLLBACK:
+    os << "UNPROTECT_SNAP_ROLLBACK";
+    break;
+  default:  // any value outside the enum is printed numerically
+    os << "UNKNOWN (" << static_cast<uint32_t>(state) << ")";
+    break;
+  }
+  return os;
+}
+
+template <typename I>  // Throttled per-pool context: checks one pool for children of the given parent spec.
+class C_ScanPoolChildren : public C_AsyncObjectThrottle<I> {
+public:
+  C_ScanPoolChildren(AsyncObjectThrottle<I> &throttle, I *image_ctx,
+                     const parent_spec &pspec, const Pools &pools,
+                     size_t pool_idx)
+    : C_AsyncObjectThrottle<I>(throttle, *image_ctx), m_pspec(pspec),
+      m_pool(pools[pool_idx]) {  // copies the (id, name) pair, so pools need not outlive this object
+  }
+
+  virtual int send() {  // returns 1 when the pool is skipped, <0 on error, 0 once the async read is queued
+    I &image_ctx = this->m_image_ctx;
+    assert(image_ctx.owner_lock.is_locked());
+
+    CephContext *cct = image_ctx.cct;
+    ldout(cct, 10) << this << " scanning pool '" << m_pool.second << "'"
+                   << dendl;
+
+    librados::Rados rados(image_ctx.md_ctx);
+    int64_t base_tier;
+    int r = rados.pool_get_base_tier(m_pool.first, &base_tier);
+    if (r == -ENOENT) {  // a pool that disappeared is skipped, not treated as an error
+      ldout(cct, 1) << "pool '" << m_pool.second << "' no longer exists"
+                    << dendl;
+      return 1;
+    } else if (r < 0) {
+      lderr(cct) << "error retrieving base tier for pool '"
+                 << m_pool.second << "'" << dendl;
+      return r;
+    }
+    if (m_pool.first != base_tier) {
+      // pool is a cache; skip it
+      return 1;
+    }
+
+    r = rados.ioctx_create2(m_pool.first, m_pool_ioctx);
+    if (r == -ENOENT) {  // same skip rule if the pool vanished between the two calls
+      ldout(cct, 1) << "pool '" << m_pool.second << "' no longer exists"
+                    << dendl;
+      return 1;
+    } else if (r < 0) {
+      lderr(cct) << "can't create ioctx for pool '" << m_pool.second
+                 << "'" << dendl;
+      return r;
+    }
+
+    librados::ObjectReadOperation op;
+    cls_client::get_children_start(&op, m_pspec);
+
+    librados::AioCompletion *rados_completion =
+      util::create_rados_ack_callback(this);
+    r = m_pool_ioctx.aio_operate(RBD_CHILDREN, rados_completion, &op,
+                                 &m_children_bl);  // the reply is decoded from m_children_bl in finish()
+    assert(r == 0);
+    rados_completion->release();
+    return 0;
+  }
+
+protected:
+  virtual void finish(int r) {  // maps the read result: no children -> 0, children found -> -EBUSY
+    I &image_ctx = this->m_image_ctx;
+    CephContext *cct = image_ctx.cct;
+
+    if (r == 0) {  // decode only when the read itself succeeded
+      bufferlist::iterator it = m_children_bl.begin();
+      r= cls_client::get_children_finish(&it, &m_children);
+    }
+
+    ldout(cct, 10) << this << " retrieved children: r=" << r << dendl;
+    if (r == -ENOENT) {
+      // no children -- proceed with unprotect
+      r = 0;
+    } else if (r < 0) {  // other errors are logged and passed on unchanged
+      lderr(cct) << "cannot get children for pool '" << m_pool.second << "'"
+                 << dendl;
+    } else {  // a successful lookup is treated as "children exist", which blocks the unprotect
+      lderr(cct) << "cannot unprotect: at least " << m_children.size() << " "
+                 << "child(ren) [" << joinify(m_children.begin(),
+                                              m_children.end(),
+                                              std::string(",")) << "] "
+                 << "in pool '" << m_pool.second << "'" << dendl;
+      r = -EBUSY;
+    }
+    C_AsyncObjectThrottle<I>::finish(r);
+  }
+
+private:
+  parent_spec m_pspec;  // parent spec whose children are looked up
+  Pool m_pool;  // (pool id, pool name) of the pool being scanned
+
+  IoCtx m_pool_ioctx;  // io context for m_pool, created in send()
+  std::set<std::string> m_children;  // decoded children, filled in finish()
+  bufferlist m_children_bl;  // raw reply buffer of the async read
+};
+
+} // anonymous namespace
+
+template <typename I>  // Stores the snapshot name; no error recorded yet, snap id unresolved; m_state is not initialised here.
+SnapshotUnprotectRequest<I>::SnapshotUnprotectRequest(I &image_ctx,
+                                                      Context *on_finish,
+                                                      const std::string &snap_name)
+  : Request<I>(image_ctx, on_finish), m_snap_name(snap_name), m_ret_val(0),
+    m_snap_id(CEPH_NOSNAP) {
+}
+
+template <typename I>  // Entry point of the operation: starts with the UNPROTECTING status update.
+void SnapshotUnprotectRequest<I>::send_op() {
+  send_unprotect_snap_start();
+}
+
+template <typename I>  // Completion hook: advances the state machine; returns true once the request is finished.
+bool SnapshotUnprotectRequest<I>::should_complete(int r) {
+  I &image_ctx = this->m_image_ctx;
+  CephContext *cct = image_ctx.cct;
+  ldout(cct, 5) << this << " " << __func__ << ": state=" << m_state << ", "
+                << "r=" << r << dendl;
+  if (r < 0) {
+    lderr(cct) << "encountered error: " << cpp_strerror(r) << dendl;
+    if (m_ret_val == 0) {  // keep only the first error; later failures do not overwrite it
+      m_ret_val = r;
+    }
+  }
+
+  // use a different state machine once an error is encountered
+  if (m_ret_val < 0) {
+    return should_complete_error();
+  }
+
+  RWLock::RLocker owner_lock(image_ctx.owner_lock);  // the send_* helpers assert that owner_lock is held
+  bool finished = false;
+  switch (m_state) {
+  case STATE_UNPROTECT_SNAP_START:
+    send_scan_pool_children();
+    break;
+  case STATE_SCAN_POOL_CHILDREN:
+    send_unprotect_snap_finish();
+    break;
+  case STATE_UNPROTECT_SNAP_FINISH:
+    finished = true;
+    break;
+  default:  // STATE_UNPROTECT_SNAP_ROLLBACK is reached only with m_ret_val < 0, handled above
+    assert(false);
+    break;
+  }
+  return finished;
+}
+
+template <typename I>  // Error path: starts the rollback if needed; returns true when nothing is left to undo.
+bool SnapshotUnprotectRequest<I>::should_complete_error() {
+  I &image_ctx = this->m_image_ctx;
+  RWLock::RLocker owner_locker(image_ctx.owner_lock);
+  CephContext *cct = image_ctx.cct;
+  lderr(cct) << this << " " << __func__ << ": "
+             << "ret_val=" << m_ret_val << dendl;
+
+  bool finished = true;
+  if (m_state == STATE_SCAN_POOL_CHILDREN ||
+      m_state == STATE_UNPROTECT_SNAP_FINISH) {  // only failures in these two states trigger the rollback
+    send_unprotect_snap_rollback();
+    finished = false;
+  }
+  return finished;  // true for START and ROLLBACK failures: no further step is issued
+}
+
+template <typename I>  // Enters STATE_UNPROTECT_SNAP_START and issues the validated UNPROTECTING update.
+void SnapshotUnprotectRequest<I>::send_unprotect_snap_start() {
+  I &image_ctx = this->m_image_ctx;
+  assert(image_ctx.owner_lock.is_locked());  // caller must already hold owner_lock
+
+  CephContext *cct = image_ctx.cct;
+  ldout(cct, 5) << this << " " << __func__ << dendl;
+
+  m_state = STATE_UNPROTECT_SNAP_START;
+
+  int r = verify_and_send_unprotect_snap_start();
+  if (r < 0) {  // validation failed and no op was queued: hand the error to async_complete()
+    this->async_complete(r);
+    return;
+  }
+}
+
+template <typename I>  // Scans every pool, throttled, for children that depend on this snapshot.
+void SnapshotUnprotectRequest<I>::send_scan_pool_children() {
+  I &image_ctx = this->m_image_ctx;
+  assert(image_ctx.owner_lock.is_locked());  // caller must already hold owner_lock
+
+  CephContext *cct = image_ctx.cct;
+  ldout(cct, 5) << this << " " << __func__ << dendl;
+  m_state = STATE_SCAN_POOL_CHILDREN;
+
+  // search all pools for children depending on this snapshot
+  // TODO add async version of wait_for_latest_osdmap
+  librados::Rados rados(image_ctx.md_ctx);
+  rados.wait_for_latest_osdmap();
+
+  // protect against pools being renamed/deleted
+  std::list<Pool> pool_list;
+  rados.pool_list2(pool_list);
+
+  parent_spec pspec(image_ctx.md_ctx.get_id(), image_ctx.id, m_snap_id);
+  Pools pools(pool_list.begin(), pool_list.end());  // vector copy so the factory can index pools by position
+
+  Context *ctx = this->create_callback_context();
+  typename AsyncObjectThrottle<I>::ContextFactory context_factory(
+    boost::lambda::bind(boost::lambda::new_ptr<C_ScanPoolChildren<I> >(),
+      boost::lambda::_1, &image_ctx, pspec, pools, boost::lambda::_2));  // _1 = throttle, _2 = index into pools
+  AsyncObjectThrottle<I> *throttle = new AsyncObjectThrottle<I>(
+    this, image_ctx, context_factory, ctx, NULL, 0, pools.size());  // NULL: no progress context is passed
+  throttle->start_ops(image_ctx.concurrent_management_ops);
+}
+
+template <typename I>  // Final step: sets the snapshot's protection status to UNPROTECTED.
+void SnapshotUnprotectRequest<I>::send_unprotect_snap_finish() {
+  I &image_ctx = this->m_image_ctx;
+  assert(image_ctx.owner_lock.is_locked());  // caller must already hold owner_lock
+
+  CephContext *cct = image_ctx.cct;
+  ldout(cct, 5) << this << " " << __func__ << dendl;
+
+  m_state = STATE_UNPROTECT_SNAP_FINISH;
+
+  librados::ObjectWriteOperation op;
+  cls_client::set_protection_status(&op, m_snap_id,
+                                    RBD_PROTECTION_STATUS_UNPROTECTED);
+
+  librados::AioCompletion *comp = this->create_callback_completion();
+  int r = image_ctx.md_ctx.aio_operate(image_ctx.header_oid, comp, &op);
+  assert(r == 0);  // submission is expected to succeed; the op result arrives via the completion
+  comp->release();  // drop the local reference once the op is queued
+}
+
+template <typename I>  // Error path: restores the snapshot's protection status to PROTECTED.
+void SnapshotUnprotectRequest<I>::send_unprotect_snap_rollback() {
+  I &image_ctx = this->m_image_ctx;
+  assert(image_ctx.owner_lock.is_locked());  // caller must already hold owner_lock
+
+  CephContext *cct = image_ctx.cct;
+  ldout(cct, 5) << this << " " << __func__ << dendl;
+
+  m_state = STATE_UNPROTECT_SNAP_ROLLBACK;
+
+  librados::ObjectWriteOperation op;
+  cls_client::set_protection_status(&op, m_snap_id,
+                                    RBD_PROTECTION_STATUS_PROTECTED);
+
+  librados::AioCompletion *comp = this->create_callback_completion();
+  int r = image_ctx.md_ctx.aio_operate(image_ctx.header_oid, comp, &op);
+  assert(r == 0);  // submission is expected to succeed; the op result arrives via the completion
+  comp->release();  // drop the local reference once the op is queued
+}
+
+template <typename I>  // Validates the request and queues the UNPROTECTING update; returns 0 or a negative errno.
+int SnapshotUnprotectRequest<I>::verify_and_send_unprotect_snap_start() {
+  I &image_ctx = this->m_image_ctx;
+  RWLock::RLocker md_locker(image_ctx.md_lock);
+  RWLock::RLocker snap_locker(image_ctx.snap_lock);
+
+  CephContext *cct = image_ctx.cct;
+  if ((image_ctx.features & RBD_FEATURE_LAYERING) == 0) {  // -ENOSYS without the layering feature
+    lderr(cct) << "image must support layering" << dendl;
+    return -ENOSYS;
+  }
+
+  m_snap_id = image_ctx.get_snap_id(m_snap_name);  // resolved once here; later states reuse m_snap_id
+  if (m_snap_id == CEPH_NOSNAP) {  // -ENOENT when the name does not resolve to a snapshot
+    return -ENOENT;
+  }
+
+  bool is_unprotected;
+  int r = image_ctx.is_snap_unprotected(m_snap_id, &is_unprotected);
+  if (r < 0) {
+    return r;
+  }
+
+  if (is_unprotected) {  // -EINVAL: nothing to do
+    lderr(cct) << "snapshot is already unprotected" << dendl;
+    return -EINVAL;
+  }
+
+  librados::ObjectWriteOperation op;
+  cls_client::set_protection_status(&op, m_snap_id,
+                                    RBD_PROTECTION_STATUS_UNPROTECTING);
+
+  librados::AioCompletion *comp = this->create_callback_completion();
+  r = image_ctx.md_ctx.aio_operate(image_ctx.header_oid, comp, &op);
+  assert(r == 0);  // submission is expected to succeed; the op result arrives via the completion
+  comp->release();  // drop the local reference once the op is queued
+
+  // TODO legacy code threw a notification post UNPROTECTING update -- required?
+  return 0;
+}
+
+} // namespace operation
+} // namespace librbd
+
+template class librbd::operation::SnapshotUnprotectRequest<librbd::ImageCtx>;
diff --git a/src/librbd/operation/SnapshotUnprotectRequest.h b/src/librbd/operation/SnapshotUnprotectRequest.h
new file mode 100644
index 0000000..3b940c3
--- /dev/null
+++ b/src/librbd/operation/SnapshotUnprotectRequest.h
@@ -0,0 +1,93 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_OPERATION_SNAPSHOT_UNPROTECT_REQUEST_H
+#define CEPH_LIBRBD_OPERATION_SNAPSHOT_UNPROTECT_REQUEST_H
+
+#include "librbd/operation/Request.h"
+#include <iosfwd>
+#include <string>
+
+class Context;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace operation {
+
+template <typename ImageCtxT = ImageCtx>
+class SnapshotUnprotectRequest : public Request<ImageCtxT> {
+public:
+  /**
+   * Snap Unprotect goes through the following state machine:
+   *
+   * @verbatim
+   *
+   * <start>
+   *    |
+   *    v
+   * STATE_UNPROTECT_SNAP_START
+   *    |
+   *    v
+   * STATE_SCAN_POOL_CHILDREN * * * * > STATE_UNPROTECT_SNAP_ROLLBACK
+   *    |                                  |
+   *    v                                  |
+   * STATE_UNPROTECT_SNAP_FINISH           |
+   *    |                                  |
+   *    v                                  |
+   * <finish> <----------------------------/
+   *
+   * @endverbatim
+   *
+   * If the unprotect operation needs to abort, the error path is followed
+   * to rollback the unprotect in-progress status on the image.
+   */
+  enum State {
+    STATE_UNPROTECT_SNAP_START,
+    STATE_SCAN_POOL_CHILDREN,
+    STATE_UNPROTECT_SNAP_FINISH,
+    STATE_UNPROTECT_SNAP_ROLLBACK
+  };
+
+  SnapshotUnprotectRequest(ImageCtxT &image_ctx, Context *on_finish,
+		           const std::string &snap_name);
+
+protected:
+  virtual void send_op();  // starts the state machine
+  virtual bool should_complete(int r);  // completion hook; returns true when the request is finished
+
+  virtual int filter_return_code(int r) const {  // ignores r: yields the first recorded error, else 0
+    if (m_ret_val < 0) {
+      return m_ret_val;
+    }
+    return 0;
+  }
+
+  virtual journal::Event create_event() const {  // journal event carrying the snapshot name
+    return journal::SnapUnprotectEvent(0, m_snap_name);
+  }
+
+private:
+  std::string m_snap_name;  // name of the snapshot to unprotect
+  State m_state;  // current state; not set by the constructor
+
+  int m_ret_val;  // first error seen by should_complete(), or 0
+  uint64_t m_snap_id;  // CEPH_NOSNAP until the name is resolved
+
+  bool should_complete_error();  // error-path counterpart of should_complete()
+
+  void send_unprotect_snap_start();
+  void send_scan_pool_children();
+  void send_unprotect_snap_finish();
+  void send_unprotect_snap_rollback();
+
+  int verify_and_send_unprotect_snap_start();  // returns 0 or a negative errno
+};
+
+} // namespace operation
+} // namespace librbd
+
+extern template class librbd::operation::SnapshotUnprotectRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_OPERATION_SNAPSHOT_UNPROTECT_REQUEST_H
diff --git a/src/librbd/operation/TrimRequest.cc b/src/librbd/operation/TrimRequest.cc
new file mode 100644
index 0000000..8e32f3e
--- /dev/null
+++ b/src/librbd/operation/TrimRequest.cc
@@ -0,0 +1,385 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/operation/TrimRequest.h"
+#include "librbd/AsyncObjectThrottle.h"
+#include "librbd/AioObjectRequest.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageWatcher.h"
+#include "librbd/internal.h"
+#include "librbd/ObjectMap.h"
+#include "librbd/Utils.h"
+#include "common/ContextCompletion.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "osdc/Striper.h"
+
+#include <boost/bind.hpp>
+#include <boost/lambda/bind.hpp>
+#include <boost/lambda/construct.hpp>
+#include <boost/scope_exit.hpp>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::TrimRequest: "
+
+namespace librbd {
+namespace operation {
+
+template <typename I>
+class C_CopyupObject : public C_AsyncObjectThrottle<I> { // throttled op: trim one object
+public:
+  C_CopyupObject(AsyncObjectThrottle<I> &throttle, I *image_ctx,
+                 ::SnapContext snapc, uint64_t object_no)
+    : C_AsyncObjectThrottle<I>(throttle, *image_ctx), m_snapc(snapc),
+      m_object_no(object_no)
+  {
+  }
+  // Sends an AioObjectTrim for object m_object_no; always returns 0.
+  virtual int send() {
+    I &image_ctx = this->m_image_ctx;
+    assert(image_ctx.owner_lock.is_locked());
+    assert(image_ctx.exclusive_lock == nullptr ||
+           image_ctx.exclusive_lock->is_lock_owner());
+
+    string oid = image_ctx.get_object_name(m_object_no);
+    ldout(image_ctx.cct, 10) << "removing (with copyup) " << oid << dendl;
+    // 'this' is handed to the request -- presumably as its completion (TODO confirm)
+    AioObjectRequest *req = new AioObjectTrim(&image_ctx, oid, m_object_no,
+                                              m_snapc, this);
+    req->send();
+    return 0;
+  }
+private:
+  ::SnapContext m_snapc;    // snapshot context passed to the trim request
+  uint64_t m_object_no;     // index of the object this op trims
+};
+
+template <typename I>
+class C_RemoveObject : public C_AsyncObjectThrottle<I> { // throttled op: remove one object
+public:
+  C_RemoveObject(AsyncObjectThrottle<I> &throttle, I *image_ctx,
+                 uint64_t object_no)
+    : C_AsyncObjectThrottle<I>(throttle, *image_ctx), m_object_no(object_no)
+  {
+  }
+  // Returns 1 if the object map rules the object out (nothing sent), else 0.
+  virtual int send() {
+    I &image_ctx = this->m_image_ctx;
+    assert(image_ctx.owner_lock.is_locked());
+    assert(image_ctx.exclusive_lock == nullptr ||
+           image_ctx.exclusive_lock->is_lock_owner());
+
+    {
+      RWLock::RLocker snap_locker(image_ctx.snap_lock);
+      if (image_ctx.object_map != nullptr &&
+          !image_ctx.object_map->object_may_exist(m_object_no)) {
+        return 1;  // object map says it cannot exist: skip the remove
+      }
+    }
+
+    string oid = image_ctx.get_object_name(m_object_no);
+    ldout(image_ctx.cct, 10) << "removing " << oid << dendl;
+    // completion is bound to this context; release() drops our reference after submit
+    librados::AioCompletion *rados_completion =
+      util::create_rados_safe_callback(this);
+    int r = image_ctx.data_ctx.aio_remove(oid, rados_completion);
+    assert(r == 0);
+    rados_completion->release();
+    return 0;
+  }
+
+private:
+  uint64_t m_object_no;  // index of the object this op removes
+};
+
+template <typename I>
+TrimRequest<I>::TrimRequest(I &image_ctx, Context *on_finish,
+                            uint64_t original_size, uint64_t new_size,
+                            ProgressContext &prog_ctx)
+  : AsyncRequest<I>(image_ctx, on_finish), m_new_size(new_size),
+    m_prog_ctx(prog_ctx)
+{
+  // round the new size up to whole stripe periods; objects past that point
+  // can be deleted outright, the partial tail is discarded separately
+  const uint64_t stripe_period = image_ctx.get_stripe_period();
+  const uint64_t kept_periods = (m_new_size + stripe_period - 1) / stripe_period;
+  m_num_objects = Striper::get_num_objects(image_ctx.layout, original_size);
+  m_delete_start = kept_periods * image_ctx.get_stripe_count();
+  m_delete_off = MIN(kept_periods * stripe_period, original_size);
+
+  ldout(image_ctx.cct, 10) << this << " trim image " << original_size << " -> "
+                           << m_new_size << " periods " << kept_periods
+                           << " discard to offset " << m_delete_off
+                           << " delete objects " << m_delete_start
+                           << " to " << m_num_objects << dendl;
+}
+
+template <typename I>
+bool TrimRequest<I>::should_complete(int r)
+{
+  I &image_ctx = this->m_image_ctx;
+  CephContext *cct = image_ctx.cct;
+  ldout(cct, 5) << this << " should_complete: r=" << r << dendl;
+  if (r < 0) {
+    lderr(cct) << "trim encountered an error: " << cpp_strerror(r) << dendl;
+    return true;  // stop on the first error
+  }
+  // success: dispatch on the state that just finished (owner_lock held for send_*)
+  RWLock::RLocker owner_lock(image_ctx.owner_lock);
+  switch (m_state) {
+  case STATE_COPYUP_OBJECTS:
+    ldout(cct, 5) << " COPYUP_OBJECTS" << dendl;
+    send_pre_remove();
+    break;
+
+  case STATE_PRE_REMOVE:
+    ldout(cct, 5) << " PRE_REMOVE" << dendl;
+    send_remove_objects();
+    break;
+
+  case STATE_REMOVE_OBJECTS:
+    ldout(cct, 5) << " REMOVE_OBJECTS" << dendl;
+    send_post_remove();
+    break;
+
+  case STATE_POST_REMOVE:
+    ldout(cct, 5) << " POST_REMOVE" << dendl;  // label now matches the state name
+    send_clean_boundary();
+    break;
+
+  case STATE_CLEAN_BOUNDARY:
+    ldout(cct, 5) << "CLEAN_BOUNDARY" << dendl;
+    send_finish(0);
+    break;
+
+  case STATE_FINISHED:
+    ldout(cct, 5) << "FINISHED" << dendl;
+    return true;
+
+  default:
+    lderr(cct) << "invalid state: " << m_state << dendl;
+    assert(false);
+    break;
+  }
+  return false;  // a follow-up step was issued above
+}
+
+template <typename I>
+void TrimRequest<I>::send() {
+  send_copyup_objects();  // entry step; later steps are chained from should_complete()
+}
+
+template <typename I>
+void TrimRequest<I>::send_copyup_objects() { // first step, entered from send()
+  I &image_ctx = this->m_image_ctx;
+  assert(image_ctx.owner_lock.is_locked());
+  assert(image_ctx.exclusive_lock == nullptr ||
+         image_ctx.exclusive_lock->is_lock_owner());
+  // no whole objects lie past the new end: only the boundary needs cleaning
+  if (m_delete_start >= m_num_objects) {
+    send_clean_boundary();
+    return;
+  }
+
+  ::SnapContext snapc;
+  bool has_snapshots;
+  uint64_t parent_overlap;
+  {
+    RWLock::RLocker snap_locker(image_ctx.snap_lock);
+    RWLock::RLocker parent_locker(image_ctx.parent_lock);
+    // copy what is needed so both locks are released before any request is issued
+    snapc = image_ctx.snapc;
+    has_snapshots = !image_ctx.snaps.empty();
+    int r = image_ctx.get_parent_overlap(image_ctx.get_copyup_snap_id(),
+                                           &parent_overlap);
+    assert(r == 0);
+  }
+
+  // copyup is only required for portion of image that overlaps parent
+  uint64_t copyup_end = Striper::get_num_objects(image_ctx.layout,
+                                                 parent_overlap);
+  // TODO: protect against concurrent shrink and snap create?
+  if (copyup_end <= m_delete_start || !has_snapshots) {
+    send_pre_remove();
+    return;
+  }
+  // objects from copyup_start to copyup_end are handled here; plain removal resumes at copyup_end
+  uint64_t copyup_start = m_delete_start;
+  m_delete_start = copyup_end;
+
+  ldout(image_ctx.cct, 5) << this << " send_copyup_objects: "
+			    << " start object=" << copyup_start << ", "
+			    << " end object=" << copyup_end << dendl;
+  m_state = STATE_COPYUP_OBJECTS;
+  // the factory builds one C_CopyupObject(throttle, &image_ctx, snapc, object_no) per object
+  Context *ctx = this->create_callback_context();
+  typename AsyncObjectThrottle<I>::ContextFactory context_factory(
+    boost::lambda::bind(boost::lambda::new_ptr<C_CopyupObject<I> >(),
+      boost::lambda::_1, &image_ctx, snapc, boost::lambda::_2));
+  AsyncObjectThrottle<I> *throttle = new AsyncObjectThrottle<I>(
+    this, image_ctx, context_factory, ctx, &m_prog_ctx, copyup_start,
+    copyup_end);
+  throttle->start_ops(image_ctx.concurrent_management_ops);
+}
+
+template <typename I>
+void TrimRequest<I>::send_remove_objects() { // removes objects m_delete_start .. m_num_objects
+  I &image_ctx = this->m_image_ctx;
+  assert(image_ctx.owner_lock.is_locked());
+
+  ldout(image_ctx.cct, 5) << this << " send_remove_objects: "
+			    << " delete_start=" << m_delete_start
+			    << " num_objects=" << m_num_objects << dendl;
+  m_state = STATE_REMOVE_OBJECTS;
+  // the factory builds one C_RemoveObject(throttle, &image_ctx, object_no) per object
+  Context *ctx = this->create_callback_context();
+  typename AsyncObjectThrottle<I>::ContextFactory context_factory(
+    boost::lambda::bind(boost::lambda::new_ptr<C_RemoveObject<I> >(),
+      boost::lambda::_1, &image_ctx, boost::lambda::_2));
+  AsyncObjectThrottle<I> *throttle = new AsyncObjectThrottle<I>(
+    this, image_ctx, context_factory, ctx, &m_prog_ctx, m_delete_start,
+    m_num_objects);
+  throttle->start_ops(image_ctx.concurrent_management_ops);
+}
+
+template <typename I>
+void TrimRequest<I>::send_pre_remove() { // object-map step that precedes object removal
+  I &image_ctx = this->m_image_ctx;
+  assert(image_ctx.owner_lock.is_locked());
+  if (m_delete_start >= m_num_objects) {
+    send_clean_boundary();  // nothing left to remove as whole objects
+    return;
+  }
+
+  bool remove_objects = false;
+  {
+    RWLock::RLocker snap_locker(image_ctx.snap_lock);
+    if (image_ctx.object_map == nullptr) {
+      remove_objects = true;  // no object map: go straight to removal
+    } else {
+      ldout(image_ctx.cct, 5) << this << " send_pre_remove: "
+				<< " delete_start=" << m_delete_start
+				<< " num_objects=" << m_num_objects << dendl;
+      m_state = STATE_PRE_REMOVE;
+
+      assert(image_ctx.exclusive_lock->is_lock_owner());
+
+      // flag the objects as pending deletion
+      Context *ctx = this->create_callback_context();
+      RWLock::WLocker object_map_locker(image_ctx.object_map_lock);
+      if (!image_ctx.object_map->aio_update(m_delete_start, m_num_objects,
+					    OBJECT_PENDING, OBJECT_EXISTS,
+                                            ctx)) {
+        delete ctx;  // aio_update returned false: continue synchronously below
+        remove_objects = true;
+      }
+    }
+  }
+
+  // avoid possible recursive lock attempts
+  if (remove_objects) {
+    // no object map update required
+    send_remove_objects();
+  }
+}
+
+template <typename I>
+void TrimRequest<I>::send_post_remove() { // object-map step that follows object removal
+  I &image_ctx = this->m_image_ctx;
+  assert(image_ctx.owner_lock.is_locked());
+
+  bool clean_boundary = false;
+  {
+    RWLock::RLocker snap_locker(image_ctx.snap_lock);
+    if (image_ctx.object_map == nullptr) {
+      clean_boundary = true;  // no object map: go straight to the boundary step
+    } else {
+      ldout(image_ctx.cct, 5) << this << " send_post_remove: "
+          		        << " delete_start=" << m_delete_start
+          		        << " num_objects=" << m_num_objects << dendl;
+      m_state = STATE_POST_REMOVE;
+
+      assert(image_ctx.exclusive_lock->is_lock_owner());
+
+      // flag the pending objects as removed
+      Context *ctx = this->create_callback_context();
+      RWLock::WLocker object_map_locker(image_ctx.object_map_lock);
+      if (!image_ctx.object_map->aio_update(m_delete_start, m_num_objects,
+					    OBJECT_NONEXISTENT,
+					    OBJECT_PENDING, ctx)) {
+        delete ctx;  // aio_update returned false: continue synchronously below
+	clean_boundary = true;
+      }
+    }
+  }
+
+  // avoid possible recursive lock attempts
+  if (clean_boundary) {
+    // no object map update required
+    send_clean_boundary();
+  }
+}
+
+template <typename I>
+void TrimRequest<I>::send_clean_boundary() { // discards bytes m_new_size .. m_delete_off
+  I &image_ctx = this->m_image_ctx;
+  assert(image_ctx.owner_lock.is_locked());
+  CephContext *cct = image_ctx.cct;
+  if (m_delete_off <= m_new_size) {
+    send_finish(0);  // no partial range to discard
+    return;
+  }
+
+  // should have been canceled prior to releasing lock
+  assert(image_ctx.exclusive_lock == nullptr ||
+         image_ctx.exclusive_lock->is_lock_owner());
+  uint64_t delete_len = m_delete_off - m_new_size;
+  ldout(image_ctx.cct, 5) << this << " send_clean_boundary: "
+			    << " delete_off=" << m_delete_off
+			    << " length=" << delete_len << dendl;
+  m_state = STATE_CLEAN_BOUNDARY;
+
+  ::SnapContext snapc;
+  {
+    RWLock::RLocker snap_locker(image_ctx.snap_lock);
+    snapc = image_ctx.snapc;
+  }
+
+  // discard the weird boundary
+  std::vector<ObjectExtent> extents;
+  Striper::file_to_extents(cct, image_ctx.format_string,
+			   &image_ctx.layout, m_new_size, delete_len, 0,
+                           extents);
+  // every per-extent request below reports into this single ContextCompletion
+  ContextCompletion *completion =
+    new ContextCompletion(this->create_async_callback_context(), true);
+  for (vector<ObjectExtent>::iterator p = extents.begin();
+       p != extents.end(); ++p) {
+    ldout(cct, 20) << " ex " << *p << dendl;
+    Context *req_comp = new C_ContextCompletion(*completion);
+    // offset 0: issue a whole-object trim; otherwise truncate the object at p->offset
+    AioObjectRequest *req;
+    if (p->offset == 0) {
+      req = new AioObjectTrim(&image_ctx, p->oid.name, p->objectno, snapc,
+                              req_comp);
+    } else {
+      req = new AioObjectTruncate(&image_ctx, p->oid.name, p->objectno,
+                                  p->offset, snapc, req_comp);
+    }
+    req->send();
+  }
+  completion->finish_adding_requests();
+}
+
+template <typename I>
+void TrimRequest<I>::send_finish(int r) {
+  m_state = STATE_FINISHED;  // should_complete() returns true for this state
+  this->async_complete(r);   // r is passed through unchanged
+}
+
+} // namespace operation
+} // namespace librbd
+
+template class librbd::operation::TrimRequest<librbd::ImageCtx>;
diff --git a/src/librbd/AsyncTrimRequest.h b/src/librbd/operation/TrimRequest.h
similarity index 83%
rename from src/librbd/AsyncTrimRequest.h
rename to src/librbd/operation/TrimRequest.h
index 2160c40..6e6c50c 100644
--- a/src/librbd/AsyncTrimRequest.h
+++ b/src/librbd/operation/TrimRequest.h
@@ -1,7 +1,7 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab
-#ifndef CEPH_LIBRBD_ASYNC_TRIM_REQUEST_H
-#define CEPH_LIBRBD_ASYNC_TRIM_REQUEST_H
+#ifndef CEPH_LIBRBD_OPERATION_TRIM_REQUEST_H
+#define CEPH_LIBRBD_OPERATION_TRIM_REQUEST_H
 
 #include "librbd/AsyncRequest.h"
 
@@ -11,12 +11,15 @@ namespace librbd
 class ImageCtx;
 class ProgressContext;
 
-class AsyncTrimRequest : public AsyncRequest<>
+namespace operation {
+
+template <typename ImageCtxT = ImageCtx>
+class TrimRequest : public AsyncRequest<ImageCtxT>
 {
 public:
-  AsyncTrimRequest(ImageCtx &image_ctx, Context *on_finish,
-		   uint64_t original_size, uint64_t new_size,
-		   ProgressContext &prog_ctx);
+  TrimRequest(ImageCtxT &image_ctx, Context *on_finish,
+	      uint64_t original_size, uint64_t new_size,
+	      ProgressContext &prog_ctx);
 
   virtual void send();
 
@@ -82,9 +85,12 @@ private:
   void send_pre_remove();
   void send_post_remove();
   void send_clean_boundary();
-  void finish(int r);
+  void send_finish(int r);
 };
 
+} // namespace operation
 } // namespace librbd
 
-#endif // CEPH_LIBRBD_ASYNC_TRIM_REQUEST_H
+extern template class librbd::operation::TrimRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_OPERATION_TRIM_REQUEST_H
diff --git a/src/librbd/parent_types.h b/src/librbd/parent_types.h
index 5c54953..5e26794 100644
--- a/src/librbd/parent_types.h
+++ b/src/librbd/parent_types.h
@@ -3,16 +3,20 @@
 #ifndef CEPH_LIBRBD_PARENT_TYPES_H
 #define CEPH_LIBRBD_PARENT_TYPES_H
 
+#include "include/int_types.h"
+#include "include/types.h"
+#include <string>
+
 namespace librbd {
   /** @brief Unique identification of a parent in clone relationship.
    * Cloning an image creates a child image that keeps a reference
    * to its parent. This allows copy-on-write images. */
   struct parent_spec {
     int64_t pool_id;
-    string image_id;
+    std::string image_id;
     snapid_t snap_id;
     parent_spec() : pool_id(-1), snap_id(CEPH_NOSNAP) {}
-    parent_spec(uint64_t pool_id, string image_id, snapid_t snap_id) :
+    parent_spec(uint64_t pool_id, std::string image_id, snapid_t snap_id) :
       pool_id(pool_id), image_id(image_id), snap_id(snap_id) {}
     bool operator==(const parent_spec &other) {
       return ((this->pool_id == other.pool_id) &&
diff --git a/src/log/Entry.h b/src/log/Entry.h
index 7cdf116..1b589e1 100644
--- a/src/log/Entry.h
+++ b/src/log/Entry.h
@@ -9,7 +9,6 @@
 #include <pthread.h>
 #include <string>
 
-#define CEPH_LOG_ENTRY_PREALLOC 80
 
 namespace ceph {
 namespace log {
@@ -20,19 +19,38 @@ struct Entry {
   short m_prio, m_subsys;
   Entry *m_next;
 
-  char m_static_buf[CEPH_LOG_ENTRY_PREALLOC];
   PrebufferedStreambuf m_streambuf;
+  size_t m_buf_len;
+  size_t* m_exp_len;
+  char m_static_buf[1];
 
   Entry()
     : m_thread(0), m_prio(0), m_subsys(0),
       m_next(NULL),
-      m_streambuf(m_static_buf, sizeof(m_static_buf))
+      m_streambuf(m_static_buf, sizeof(m_static_buf)),
+      m_buf_len(sizeof(m_static_buf)),
+      m_exp_len(NULL)
   {}
   Entry(utime_t s, pthread_t t, short pr, short sub,
+  const char *msg = NULL)
+      : m_stamp(s), m_thread(t), m_prio(pr), m_subsys(sub),
+        m_next(NULL),
+        m_streambuf(m_static_buf, sizeof(m_static_buf)),
+        m_buf_len(sizeof(m_static_buf)),
+        m_exp_len(NULL)
+    {
+      if (msg) {
+        ostream os(&m_streambuf);
+        os << msg;
+      }
+    }
+  Entry(utime_t s, pthread_t t, short pr, short sub, char* buf, size_t buf_len, size_t* exp_len,
 	const char *msg = NULL)
     : m_stamp(s), m_thread(t), m_prio(pr), m_subsys(sub),
       m_next(NULL),
-      m_streambuf(m_static_buf, sizeof(m_static_buf))
+      m_streambuf(buf, buf_len),
+      m_buf_len(buf_len),
+      m_exp_len(exp_len)
   {
     if (msg) {
       ostream os(&m_streambuf);
@@ -40,6 +58,21 @@ struct Entry {
     }
   }
 
+  // Updates *m_exp_len, the caller-supplied expected-size estimate (no-op when it is NULL).
+  void hint_size() {
+    if (m_exp_len != NULL) {
+      size_t size = m_streambuf.size();
+      if (size > __atomic_load_n(m_exp_len, __ATOMIC_RELAXED)) {
+        // log larger than expected, just expand (to size + 10)
+        __atomic_store_n(m_exp_len, size + 10, __ATOMIC_RELAXED);
+      }
+      else {
+        // asymptotically adapt: weight 31/32 on this entry's buffer length, 1/32 on size + 10
+        __atomic_store_n(m_exp_len, (size + 10 + m_buf_len*31) / 32, __ATOMIC_RELAXED);
+      }
+    }
+  }
+
   void set_str(const std::string &s) {
     ostream os(&m_streambuf);
     os << s;
@@ -48,6 +81,16 @@ struct Entry {
   std::string get_str() const {
     return m_streambuf.get_str();
   }
+
+  // returns current size of content
+  size_t size() const {
+    return m_streambuf.size();
+  }
+
+  // extracts up to avail chars of content
+  int snprintf(char* dst, size_t avail) const {
+    return m_streambuf.snprintf(dst, avail);
+  }
 };
 
 }
diff --git a/src/log/Log.cc b/src/log/Log.cc
index 3dc6c63..860b2c7 100644
--- a/src/log/Log.cc
+++ b/src/log/Log.cc
@@ -21,6 +21,7 @@
 
 #define PREALLOC 1000000
 
+
 namespace ceph {
 namespace log {
 
@@ -167,6 +168,7 @@ void Log::submit_entry(Entry *e)
   pthread_mutex_unlock(&m_queue_mutex);
 }
 
+
 Entry *Log::create_entry(int level, int subsys)
 {
   if (true) {
@@ -184,6 +186,25 @@ Entry *Log::create_entry(int level, int subsys)
   }
 }
 
+Entry *Log::create_entry(int level, int subsys, size_t* expected_size)
+{
+  if (true) {
+    size_t size = __atomic_load_n(expected_size, __ATOMIC_RELAXED);  // current size estimate
+    void *ptr = ::operator new(sizeof(Entry) + size);  // Entry plus a trailing size-byte buffer
+    return new(ptr) Entry(ceph_clock_now(NULL),
+       pthread_self(), level, subsys,
+       reinterpret_cast<char*>(ptr) + sizeof(Entry), size, expected_size);
+  } else {
+    // kludge for perf testing (unreachable while the condition above is the literal true)
+    Entry *e = m_recent.dequeue();
+    e->m_stamp = ceph_clock_now(NULL);
+    e->m_thread = pthread_self();
+    e->m_prio = level;
+    e->m_subsys = subsys;
+    return e;
+  }
+}
+
 void Log::flush()
 {
   pthread_mutex_lock(&m_flush_mutex);
@@ -209,7 +230,6 @@ void Log::flush()
 void Log::_flush(EntryQueue *t, EntryQueue *requeue, bool crash)
 {
   Entry *e;
-  char buf[80];
   while ((e = t->dequeue()) != NULL) {
     unsigned sub = e->m_subsys;
 
@@ -218,8 +238,10 @@ void Log::_flush(EntryQueue *t, EntryQueue *requeue, bool crash)
     bool do_syslog = m_syslog_crash >= e->m_prio && should_log;
     bool do_stderr = m_stderr_crash >= e->m_prio && should_log;
 
+    e->hint_size();
     if (do_fd || do_syslog || do_stderr) {
-      int buflen = 0;
+      size_t buflen = 0;
+      char buf[80 + e->size()];
 
       if (crash)
 	buflen += snprintf(buf, sizeof(buf), "%6d> ", -t->m_len);
@@ -227,25 +249,24 @@ void Log::_flush(EntryQueue *t, EntryQueue *requeue, bool crash)
       buflen += snprintf(buf + buflen, sizeof(buf)-buflen, " %lx %2d ",
 			(unsigned long)e->m_thread, e->m_prio);
 
-      // FIXME: this is slow
-      string s = e->get_str();
-
-      if (do_fd) {
-	int r = safe_write(m_fd, buf, buflen);
-	if (r >= 0)
-	  r = safe_write(m_fd, s.data(), s.size());
-	if (r >= 0)
-	  r = write(m_fd, "\n", 1);
-	if (r < 0)
-	  cerr << "problem writing to " << m_log_file << ": " << cpp_strerror(r) << std::endl;
+      buflen += e->snprintf(buf + buflen, sizeof(buf) - buflen - 1);
+      if (buflen > sizeof(buf) - 1) { //paranoid check, buf was declared to hold everything
+        buflen = sizeof(buf) - 1;
+        buf[buflen] = 0;
       }
 
       if (do_syslog) {
-	syslog(LOG_USER, "%s%s", buf, s.c_str());
+        syslog(LOG_USER|LOG_DEBUG, "%s", buf);
       }
 
       if (do_stderr) {
-	cerr << buf << s << std::endl;
+        cerr << buf << std::endl;
+      }
+      if (do_fd) {
+        buf[buflen] = '\n';
+        int r = safe_write(m_fd, buf, buflen+1);
+        if (r < 0)
+          cerr << "problem writing to " << m_log_file << ": " << cpp_strerror(r) << std::endl;
       }
     }
 
@@ -263,7 +284,7 @@ void Log::_log_message(const char *s, bool crash)
       cerr << "problem writing to " << m_log_file << ": " << cpp_strerror(r) << std::endl;
   }
   if ((crash ? m_syslog_crash : m_syslog_log) >= 0) {
-    syslog(LOG_USER, "%s", s);
+    syslog(LOG_USER|LOG_DEBUG, "%s", s);
   }
   
   if ((crash ? m_stderr_crash : m_stderr_log) >= 0) {
@@ -288,7 +309,7 @@ void Log::dump_recent()
 
   EntryQueue old;
   _log_message("--- begin dump of recent events ---", true);
-  _flush(&m_recent, &old, true);  
+  _flush(&m_recent, &old, true);
 
   char buf[4096];
   _log_message("--- logging levels ---", true);
diff --git a/src/log/Log.h b/src/log/Log.h
index 04cadd7..57727d3 100644
--- a/src/log/Log.h
+++ b/src/log/Log.h
@@ -69,6 +69,7 @@ public:
   void set_stderr_level(int log, int crash);
 
   Entry *create_entry(int level, int subsys);
+  Entry *create_entry(int level, int subsys, size_t* expected_size);
   void submit_entry(Entry *e);
 
   void start();
diff --git a/src/log/test.cc b/src/log/test.cc
index 6e4704b..a2df608 100644
--- a/src/log/test.cc
+++ b/src/log/test.cc
@@ -160,9 +160,10 @@ TEST(Log, ManyGatherLogPrebufOverflow)
     int l = 10;
     if (subs.should_gather(1, l)) {
       Entry *e = new Entry(ceph_clock_now(NULL), pthread_self(), l, 1);
-      PrebufferedStreambuf psb(e->m_static_buf, 20);
+      PrebufferedStreambuf psb(e->m_static_buf, sizeof(e->m_static_buf));
       ostream oss(&psb);
-      oss << "this i a long stream asdf asdf asdf asdf asdf asdf asdf asdf asdf as fd";
+      oss << "this i a long stream asdf asdf asdf asdf asdf asdf asdf asdf asdf as fd"
+          << std::string(sizeof(e->m_static_buf) * 2, '-') ;
       //e->m_str = oss.str();
       log.submit_entry(e);
     }
diff --git a/src/mds/CDentry.h b/src/mds/CDentry.h
index 171fb3b..7bf21d6 100644
--- a/src/mds/CDentry.h
+++ b/src/mds/CDentry.h
@@ -21,7 +21,7 @@
 #include <set>
 
 #include "include/types.h"
-#include "include/buffer.h"
+#include "include/buffer_fwd.h"
 #include "include/lru.h"
 #include "include/elist.h"
 #include "include/filepath.h"
diff --git a/src/mds/CDir.h b/src/mds/CDir.h
index e161d5b..44034b8 100644
--- a/src/mds/CDir.h
+++ b/src/mds/CDir.h
@@ -18,7 +18,7 @@
 #define CEPH_CDIR_H
 
 #include "include/types.h"
-#include "include/buffer.h"
+#include "include/buffer_fwd.h"
 #include "mdstypes.h"
 #include "common/config.h"
 #include "common/DecayCounter.h"
diff --git a/src/mds/CInode.cc b/src/mds/CInode.cc
index c3ee1e0..1c95a0d 100644
--- a/src/mds/CInode.cc
+++ b/src/mds/CInode.cc
@@ -2695,7 +2695,7 @@ void CInode::choose_lock_state(SimpleLock *lock, int allissued)
     if (lock->is_xlocked()) {
       // do nothing here
     } else if (lock->get_state() != LOCK_MIX) {
-      if (issued & CEPH_CAP_GEXCL)
+      if (issued & (CEPH_CAP_GEXCL | CEPH_CAP_GBUFFER))
 	lock->set_state(LOCK_EXCL);
       else if (issued & CEPH_CAP_GWR)
 	lock->set_state(LOCK_MIX);
@@ -2714,9 +2714,9 @@ void CInode::choose_lock_state(SimpleLock *lock, int allissued)
   }
 }
  
-void CInode::choose_lock_states()
+void CInode::choose_lock_states(int dirty_caps)
 {
-  int issued = get_caps_issued();
+  int issued = get_caps_issued() | dirty_caps;
   if (is_auth() && (issued & (CEPH_CAP_ANY_EXCL|CEPH_CAP_ANY_WR)) &&
       choose_ideal_loner() >= 0)
     try_set_loner();
diff --git a/src/mds/CInode.h b/src/mds/CInode.h
index f032e5f..15fef42 100644
--- a/src/mds/CInode.h
+++ b/src/mds/CInode.h
@@ -895,7 +895,7 @@ public:
 
   // choose new lock state during recovery, based on issued caps
   void choose_lock_state(SimpleLock *lock, int allissued);
-  void choose_lock_states();
+  void choose_lock_states(int dirty_caps);
 
   int count_nonstale_caps() {
     int n = 0;
diff --git a/src/mds/Capability.h b/src/mds/Capability.h
index 55922ab..7f1b5b8 100644
--- a/src/mds/Capability.h
+++ b/src/mds/Capability.h
@@ -16,7 +16,7 @@
 #ifndef CEPH_CAPABILITY_H
 #define CEPH_CAPABILITY_H
 
-#include "include/buffer.h"
+#include "include/buffer_fwd.h"
 #include "include/xlist.h"
 
 #include "common/config.h"
diff --git a/src/mds/InoTable.h b/src/mds/InoTable.h
index e63f3f6..02656b0 100644
--- a/src/mds/InoTable.h
+++ b/src/mds/InoTable.h
@@ -83,6 +83,27 @@ class InoTable : public MDSTable {
       return false;
     }
   }
+
+  /**
+   * If this ino is currently in the free set, consume the range from the
+   * start of the free set up to and including it.  For use in tools, when we
+   * know the max ino in use and want to allocate new inodes only above it.
+   *
+   * @return true if the table was modified, false if ino was not free
+   */
+  bool force_consume_to(inodeno_t ino)
+  {
+    if (free.contains(ino)) {
+      inodeno_t min = free.begin().get_start();
+      std::cerr << "Erasing 0x" << std::hex << min << " to 0x" << ino << std::dec << std::endl;
+      free.erase(min, ino - min + 1);  // NOTE(review): assumes [min, ino] is entirely free - confirm
+      projected_free = free;
+      projected_version = ++version;
+      return true;
+    } else {
+      return false;
+    }
+  }
 };
 
 #endif
diff --git a/src/mds/Locker.cc b/src/mds/Locker.cc
index 0cbb5e3..eb83299 100644
--- a/src/mds/Locker.cc
+++ b/src/mds/Locker.cc
@@ -2463,6 +2463,11 @@ void Locker::handle_client_caps(MClientCaps *m)
 	  << " op " << ceph_cap_op_name(m->get_op()) << dendl;
 
   if (!mds->is_clientreplay() && !mds->is_active() && !mds->is_stopping()) {
+    if (mds->is_reconnect() &&
+	m->get_dirty() && m->get_client_tid() > 0 &&
+	session->have_completed_flush(m->get_client_tid())) {
+      mdcache->set_reconnect_dirty_caps(m->get_ino(), m->get_dirty());
+    }
     mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
     return;
   }
diff --git a/src/mds/LogEvent.h b/src/mds/LogEvent.h
index b3fabf4..26bf279 100644
--- a/src/mds/LogEvent.h
+++ b/src/mds/LogEvent.h
@@ -42,7 +42,7 @@
 #define EVENT_NOOP        51
 
 
-#include "include/buffer.h"
+#include "include/buffer_fwd.h"
 #include "include/Context.h"
 #include "include/utime.h"
 
diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
index 4f99f7d..eddfcd4 100644
--- a/src/mds/MDCache.cc
+++ b/src/mds/MDCache.cc
@@ -5375,7 +5375,11 @@ void MDCache::choose_lock_states_and_reconnect_caps()
     if (in->is_auth() && !in->is_base() && in->inode.is_dirty_rstat())
       in->mark_dirty_rstat();
 
-    in->choose_lock_states();
+    int dirty_caps = 0;
+    map<inodeno_t, int>::iterator it = cap_imports_dirty.find(in->ino());
+    if (it != cap_imports_dirty.end())
+      dirty_caps = it->second;
+    in->choose_lock_states(dirty_caps);
     dout(15) << " chose lock states on " << *in << dendl;
 
     SnapRealm *realm = in->find_snaprealm();
@@ -5521,6 +5525,7 @@ void MDCache::export_remaining_imported_caps()
   }
 
   cap_imports.clear();
+  cap_imports_dirty.clear();
 
   if (warn_str.peek() != EOF) {
     mds->clog->warn() << "failed to reconnect caps for missing inodes:" << "\n";
@@ -5543,7 +5548,11 @@ void MDCache::try_reconnect_cap(CInode *in, Session *session)
     if (in->is_replicated()) {
       mds->locker->try_eval(in, CEPH_CAP_LOCKS);
     } else {
-      in->choose_lock_states();
+      int dirty_caps = 0;
+      map<inodeno_t, int>::iterator it = cap_imports_dirty.find(in->ino());
+      if (it != cap_imports_dirty.end())
+	dirty_caps = it->second;
+      in->choose_lock_states(dirty_caps);
       dout(15) << " chose lock states on " << *in << dendl;
     }
   }
diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h
index 93eb697..921b03a 100644
--- a/src/mds/MDCache.h
+++ b/src/mds/MDCache.h
@@ -491,7 +491,7 @@ protected:
   map<inodeno_t,mds_rank_t> cap_export_targets; // ino -> auth mds
 
   map<inodeno_t,map<client_t,map<mds_rank_t,ceph_mds_cap_reconnect> > > cap_imports;  // ino -> client -> frommds -> capex
-  map<inodeno_t,filepath> cap_import_paths;
+  map<inodeno_t,int> cap_imports_dirty;
   set<inodeno_t> cap_imports_missing;
   int cap_imports_num_opening;
   
@@ -534,7 +534,6 @@ public:
   void rejoin_recovered_caps(inodeno_t ino, client_t client, cap_reconnect_t& icr, 
 			     mds_rank_t frommds=MDS_RANK_NONE) {
     cap_imports[ino][client][frommds] = icr.capinfo;
-    cap_import_paths[ino] = filepath(icr.path, (uint64_t)icr.capinfo.pathbase);
   }
   ceph_mds_cap_reconnect *get_replay_cap_reconnect(inodeno_t ino, client_t client) {
     if (cap_imports.count(ino) &&
@@ -549,6 +548,9 @@ public:
     assert(cap_imports[ino][client].size() == 1);
     cap_imports.erase(ino);
   }
+  void set_reconnect_dirty_caps(inodeno_t ino, int dirty) {
+    cap_imports_dirty[ino] |= dirty;  // OR into any bits already recorded for this inode
+  }
 
   // [reconnect/rejoin caps]
   map<CInode*,map<client_t, inodeno_t> >  reconnected_caps;   // inode -> client -> realmino
diff --git a/src/mds/MDSAuthCaps.cc b/src/mds/MDSAuthCaps.cc
index 42d01af..ccabe00 100644
--- a/src/mds/MDSAuthCaps.cc
+++ b/src/mds/MDSAuthCaps.cc
@@ -70,11 +70,13 @@ struct MDSCapParser : qi::grammar<Iterator, MDSAuthCaps()>
 
     // capspec = * | r[w]
     capspec = spaces >> (
-        lit("*")[_val = MDSCapSpec(true, true, true)]
+        lit("*")[_val = MDSCapSpec(true, true, true, true)]
         |
-        (lit("rw"))[_val = MDSCapSpec(true, true, false)]
+        (lit("rwp"))[_val = MDSCapSpec(true, true, false, true)]
         |
-        (lit("r"))[_val = MDSCapSpec(true, false, false)]
+        (lit("rw"))[_val = MDSCapSpec(true, true, false, false)]
+        |
+        (lit("r"))[_val = MDSCapSpec(true, false, false, false)]
         );
 
     grant = lit("allow") >> (capspec >> match)[_val = phoenix::construct<MDSCapGrant>(_1, _2)];
@@ -159,6 +161,13 @@ bool MDSAuthCaps::is_capable(const std::string &inode_path,
     if (i->match.match(inode_path, caller_uid, caller_gid) &&
 	i->spec.allows(mask & (MAY_READ|MAY_EXECUTE), mask & MAY_WRITE)) {
 
+      // Spec is non-allowing if caller asked for set pool but spec forbids it
+      if (mask & MAY_SET_POOL) {
+        if (!i->spec.allows_set_pool()) {
+          continue;
+        }
+      }
+
       // check unix permissions?
       if (i->match.uid == MDSCapMatch::MDS_AUTH_UID_ANY) {
         return true;
@@ -209,7 +218,9 @@ bool MDSAuthCaps::is_capable(const std::string &inode_path,
 void MDSAuthCaps::set_allow_all()
 {
     grants.clear();
-    grants.push_back(MDSCapGrant(MDSCapSpec(true, true, true), MDSCapMatch()));
+    grants.push_back(MDSCapGrant(
+                       MDSCapSpec(true, true, true, true),
+                       MDSCapMatch()));
 }
 
 bool MDSAuthCaps::parse(CephContext *c, const std::string& str, ostream *err)
@@ -217,7 +228,7 @@ bool MDSAuthCaps::parse(CephContext *c, const std::string& str, ostream *err)
   // Special case for legacy caps
   if (str == "allow") {
     grants.clear();
-    grants.push_back(MDSCapGrant(MDSCapSpec(true, true, false), MDSCapMatch()));
+    grants.push_back(MDSCapGrant(MDSCapSpec(true, true, false, true), MDSCapMatch()));
     return true;
   }
 
diff --git a/src/mds/MDSAuthCaps.h b/src/mds/MDSAuthCaps.h
index 112a7fb..e75e7e7 100644
--- a/src/mds/MDSAuthCaps.h
+++ b/src/mds/MDSAuthCaps.h
@@ -28,7 +28,8 @@ enum {
   MAY_WRITE = 2,
   MAY_EXECUTE = 4,
   MAY_CHOWN = 16,
-  MAY_CHGRP = 32
+  MAY_CHGRP = 32,
+  MAY_SET_POOL = 64,
 };
 
 class CephContext;
@@ -37,12 +38,17 @@ class CephContext;
 struct MDSCapSpec {
   bool read, write, any;
 
-  MDSCapSpec() : read(false), write(false), any(false) {}
-  MDSCapSpec(bool r, bool w, bool a) : read(r), write(w), any(a) {}
+  // True if the capability permits modifying the pool on file layouts
+  bool layout_pool;
+
+  MDSCapSpec() : read(false), write(false), any(false), layout_pool(false) {}
+  MDSCapSpec(bool r, bool w, bool a, bool lop)
+    : read(r), write(w), any(a), layout_pool(lop) {}
 
   bool allow_all() const {
     return any;
   }
+
   bool allows(bool r, bool w) const {
     if (any)
       return true;
@@ -52,6 +58,10 @@ struct MDSCapSpec {
       return false;
     return true;
   }
+
+  bool allows_set_pool() const {
+    return layout_pool;
+  }
 };
 
 // conditions before we are allowed to do it
diff --git a/src/mds/MDSDaemon.cc b/src/mds/MDSDaemon.cc
index 9fcff34..4e1f0ff 100644
--- a/src/mds/MDSDaemon.cc
+++ b/src/mds/MDSDaemon.cc
@@ -571,6 +571,21 @@ void MDSDaemon::handle_command(MCommand *m)
     r = _handle_command(cmdmap, m->get_data(), &outbl, &outs, &run_after);
   }
 
+  // If someone is using a closed session for sending commands (e.g.
+  // the ceph CLI) then we should feel free to clean up this connection
+  // as soon as we've sent them a response.
+  const bool live_session = mds_rank &&
+    mds_rank->sessionmap.get_session(session->info.inst.name) != nullptr
+    && session->get_state_seq() > 0;
+
+  if (!live_session) {
+    // This session only existed to issue commands, so terminate it
+    // as soon as we can.
+    assert(session->is_closed());
+    session->connection->mark_disposable();
+    session->put();
+  }
+
   MCommandReply *reply = new MCommandReply(r, outs);
   reply->set_tid(m->get_tid());
   reply->set_data(outbl);
diff --git a/src/mds/MDSMap.cc b/src/mds/MDSMap.cc
index 796aad3..73c2df9 100644
--- a/src/mds/MDSMap.cc
+++ b/src/mds/MDSMap.cc
@@ -638,3 +638,55 @@ void MDSMap::decode(bufferlist::iterator& p)
   }
   DECODE_FINISH(p);
 }
+
+MDSMap::availability_t MDSMap::is_cluster_available() const
+{
+  if (epoch == 0) {
+    // This is ambiguous between "mds map was never initialized on mons" and
+    // "we never got an mdsmap from the mons".  Treat it like the latter.
+    return TRANSIENT_UNAVAILABLE;
+  }
+
+
+  // If a rank is marked damaged (unavailable until operator intervenes)
+  if (damaged.size()) {
+    return STUCK_UNAVAILABLE;
+  }
+
+  // If no ranks are created (filesystem not initialized)
+  if (in.empty()) {
+    return STUCK_UNAVAILABLE;
+  }
+
+  for (const auto rank : in) {
+    std::string name;
+    if (up.count(rank) != 0) {
+      name = mds_info.at(up.at(rank)).name;
+    }
+    const mds_gid_t replacement = find_replacement_for(rank, name, false);
+    const bool standby_avail = (replacement != MDS_GID_NONE);
+
+    // If the rank is unfilled, and there are no standbys, we're unavailable
+    if (up.count(rank) == 0 && !standby_avail) {
+      return STUCK_UNAVAILABLE;
+    } else if (up.count(rank) && mds_info.at(up.at(rank)).laggy() && !standby_avail) {
+      // If the daemon is laggy and there are no standbys, we're unavailable.
+      // It would be nice to give it some grace here, but to do so callers
+      // would have to poll this time-wise, vs. just waiting for updates
+      // to mdsmap, so it's not worth the complexity.
+      return STUCK_UNAVAILABLE;
+    }
+  }
+
+  if (get_num_mds(CEPH_MDS_STATE_ACTIVE) > 0) {
+    // Nobody looks stuck, so indicate to client they should go ahead
+    // and try mounting if anybody is active.  This may include e.g.
+    // one MDS failing over and another active: the client should
+    // proceed to start talking to the active one and let the
+    // transiently-unavailable guy catch up later.
+    return AVAILABLE;
+  } else {
+    // Nothing indicating we were stuck, but nobody active (yet)
+    return TRANSIENT_UNAVAILABLE;
+  }
+}
diff --git a/src/mds/MDSMap.h b/src/mds/MDSMap.h
index f4b369b..16249b0 100644
--- a/src/mds/MDSMap.h
+++ b/src/mds/MDSMap.h
@@ -409,7 +409,7 @@ public:
     return NULL;
   }
 
-  mds_gid_t find_standby_for(mds_rank_t mds, std::string& name) {
+  mds_gid_t find_standby_for(mds_rank_t mds, std::string& name) const {
     std::map<mds_gid_t, mds_info_t>::const_iterator generic_standby
       = mds_info.end();
     for (std::map<mds_gid_t, mds_info_t>::const_iterator p = mds_info.begin();
@@ -429,7 +429,8 @@ public:
     return MDS_GID_NONE;
   }
 
-  mds_gid_t find_unused_for(mds_rank_t mds, std::string& name) const {
+  mds_gid_t find_unused_for(mds_rank_t mds, std::string& name,
+                            bool force_standby_active) const {
     for (std::map<mds_gid_t,mds_info_t>::const_iterator p = mds_info.begin();
          p != mds_info.end();
          ++p) {
@@ -439,24 +440,50 @@ public:
         continue;
       if ((p->second.standby_for_rank == MDS_NO_STANDBY_PREF ||
            p->second.standby_for_rank == MDS_MATCHED_ACTIVE ||
-           (p->second.standby_for_rank == MDS_STANDBY_ANY && g_conf->mon_force_standby_active))) {
+           (p->second.standby_for_rank == MDS_STANDBY_ANY && force_standby_active))) {
         return p->first;
       }
     }
     return MDS_GID_NONE;
   }
 
-  mds_gid_t find_replacement_for(mds_rank_t mds, std::string& name) {
+  mds_gid_t find_replacement_for(mds_rank_t mds, std::string& name,
+                                 bool force_standby_active) const {
     const mds_gid_t standby = find_standby_for(mds, name);
     if (standby)
       return standby;
     else
-      return find_unused_for(mds, name);
+      return find_unused_for(mds, name, force_standby_active);
   }
 
   void get_health(list<pair<health_status_t,std::string> >& summary,
 		  list<pair<health_status_t,std::string> > *detail) const;
 
+  typedef enum
+  {
+    AVAILABLE = 0,
+    TRANSIENT_UNAVAILABLE = 1,
+    STUCK_UNAVAILABLE = 2
+
+  } availability_t;
+
+  /**
+   * Return indication of whether cluster is available.  This is a
+   * heuristic for clients to see if they should bother waiting to talk to
+   * MDSs, or whether they should error out at startup/mount.
+   *
+   * A TRANSIENT_UNAVAILABLE result indicates that the cluster is in a
+   * transition state like replaying, or is potentially about to fail over.
+   * Clients should wait for an updated map before making a final decision
+   * about whether the filesystem is mountable.
+   *
+   * A STUCK_UNAVAILABLE result indicates that we can't see a way that
+   * the cluster is about to recover on its own, so it'll probably require
+   * administrator intervention: clients should probably not bother trying
+   * to mount.
+   */
+  availability_t is_cluster_available() const;
+
   // mds states
   bool is_down(mds_rank_t m) const { return up.count(m) == 0; }
   bool is_up(mds_rank_t m) const { return up.count(m); }
diff --git a/src/mds/MDSTable.h b/src/mds/MDSTable.h
index 3eeb8a2..904ecfd 100644
--- a/src/mds/MDSTable.h
+++ b/src/mds/MDSTable.h
@@ -17,7 +17,7 @@
 
 #include "mdstypes.h"
 #include "mds_table_types.h"
-#include "include/buffer.h"
+#include "include/buffer_fwd.h"
 
 class MDSRank;
 class Context;
diff --git a/src/mds/Server.cc b/src/mds/Server.cc
index dbba8c6..3887faac 100644
--- a/src/mds/Server.cc
+++ b/src/mds/Server.cc
@@ -3063,6 +3063,9 @@ void Server::handle_client_openc(MDRequestRef& mdr)
   else
     layout = mdcache->default_file_layout;
 
+  // What kind of client caps are required to complete this operation
+  uint64_t access = MAY_WRITE;
+
   // fill in any special params from client
   if (req->head.args.open.stripe_unit)
     layout.fl_stripe_unit = req->head.args.open.stripe_unit;
@@ -3074,6 +3077,17 @@ void Server::handle_client_openc(MDRequestRef& mdr)
       (__s32)req->head.args.open.pool >= 0) {
     layout.fl_pg_pool = req->head.args.open.pool;
 
+    // If client doesn't have capability to modify layout pools, then
+    // only permit this request if the requested pool matches what the
+    // file would have inherited anyway from its parent.
+    CDir *parent = dn->get_dir();
+    CInode *parent_in = parent->get_inode();
+    int64_t parent_pool = parent_in->inode.layout.fl_pg_pool;
+
+    if (layout.fl_pg_pool != parent_pool) {
+      access |= MAY_SET_POOL;
+    }
+
     // make sure we have as new a map as the client
     if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
       mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
@@ -3097,7 +3111,7 @@ void Server::handle_client_openc(MDRequestRef& mdr)
   if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
     return;
 
-  if (!check_access(mdr, diri, MAY_WRITE))
+  if (!check_access(mdr, diri, access))
     return;
 
   CDentry::linkage_t *dnl = dn->get_projected_linkage();
@@ -3778,6 +3792,8 @@ void Server::handle_client_setlayout(MDRequestRef& mdr)
   // save existing layout for later
   int64_t old_pool = layout.fl_pg_pool;
 
+  int access = MAY_WRITE;
+
   if (req->head.args.setlayout.layout.fl_object_size > 0)
     layout.fl_object_size = req->head.args.setlayout.layout.fl_object_size;
   if (req->head.args.setlayout.layout.fl_stripe_unit > 0)
@@ -3791,6 +3807,10 @@ void Server::handle_client_setlayout(MDRequestRef& mdr)
   if (req->head.args.setlayout.layout.fl_pg_pool > 0) {
     layout.fl_pg_pool = req->head.args.setlayout.layout.fl_pg_pool;
 
+    if (layout.fl_pg_pool != old_pool) {
+      access |= MAY_SET_POOL;
+    }
+
     // make sure we have as new a map as the client
     if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
       mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
@@ -3812,7 +3832,7 @@ void Server::handle_client_setlayout(MDRequestRef& mdr)
   if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
     return;
 
-  if (!check_access(mdr, cur, MAY_WRITE))
+  if (!check_access(mdr, cur, access))
     return;
 
   // project update
@@ -3856,9 +3876,6 @@ void Server::handle_client_setdirlayout(MDRequestRef& mdr)
   if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
     return;
 
-  if (!check_access(mdr, cur, MAY_WRITE))
-    return;
-
   // validate layout
   const inode_t *old_pi = cur->get_projected_inode();
   ceph_file_layout layout;
@@ -3869,6 +3886,9 @@ void Server::handle_client_setdirlayout(MDRequestRef& mdr)
   else
     layout = mdcache->default_file_layout;
 
+  // Level of access required to complete
+  int access = MAY_WRITE;
+
   if (req->head.args.setlayout.layout.fl_object_size > 0)
     layout.fl_object_size = req->head.args.setlayout.layout.fl_object_size;
   if (req->head.args.setlayout.layout.fl_stripe_unit > 0)
@@ -3880,6 +3900,9 @@ void Server::handle_client_setdirlayout(MDRequestRef& mdr)
   if (req->head.args.setlayout.layout.fl_object_stripe_unit > 0)
     layout.fl_object_stripe_unit = req->head.args.setlayout.layout.fl_object_stripe_unit;
   if (req->head.args.setlayout.layout.fl_pg_pool > 0) {
+    if (req->head.args.setlayout.layout.fl_pg_pool != layout.fl_pg_pool) {
+      access |= MAY_SET_POOL;
+    }
     layout.fl_pg_pool = req->head.args.setlayout.layout.fl_pg_pool;
     // make sure we have as new a map as the client
     if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
@@ -3898,6 +3921,9 @@ void Server::handle_client_setdirlayout(MDRequestRef& mdr)
     return;
   }
 
+  if (!check_access(mdr, cur, access))
+    return;
+
   inode_t *pi = cur->project_inode();
   pi->layout = layout;
   pi->version = cur->pre_dirty();
@@ -4086,6 +4112,12 @@ void Server::handle_set_vxattr(MDRequestRef& mdr, CInode *cur,
       if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
 	return;
 
+      if (cur->inode.layout.fl_pg_pool != layout.fl_pg_pool) {
+        if (!check_access(mdr, cur, MAY_SET_POOL)) {
+          return;
+        }
+      }
+
       pi = cur->project_inode();
       pi->layout = layout;
     } else if (name.find("ceph.file.layout") == 0) {
@@ -4129,6 +4161,12 @@ void Server::handle_set_vxattr(MDRequestRef& mdr, CInode *cur,
       if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
 	return;
 
+      if (cur->inode.layout.fl_pg_pool != layout.fl_pg_pool) {
+        if (!check_access(mdr, cur, MAY_SET_POOL)) {
+          return;
+        }
+      }
+
       pi = cur->project_inode();
       int64_t old_pool = pi->layout.fl_pg_pool;
       pi->add_old_pool(old_pool);
diff --git a/src/messages/MOSDOp.h b/src/messages/MOSDOp.h
index 259baa5..1a9e647 100755
--- a/src/messages/MOSDOp.h
+++ b/src/messages/MOSDOp.h
@@ -344,7 +344,14 @@ struct ceph_osd_request_head {
     assert(partial_decode_needed && final_decode_needed);
     p = payload.begin();
 
-    if (header.version < 2) {
+    // Always keep here the newest version of decoding order/rule
+    if (header.version == HEAD_VERSION) {
+	  ::decode(pgid, p);
+	  ::decode(osdmap_epoch, p);
+	  ::decode(flags, p);
+	  ::decode(reassert_version, p);
+	  ::decode(reqid, p);
+    } else if (header.version < 2) {
       // old decode
       ::decode(client_inc, p);
 
@@ -446,13 +453,6 @@ struct ceph_osd_request_head {
       // put client_inc in reqid.inc for get_reqid()'s benefit
       if (reqid.name == entity_name_t() && reqid.tid == 0)
 	reqid.inc = client_inc;
-    } else {
-      // new, v7 decode, splitted to partial and final
-      ::decode(pgid, p);
-      ::decode(osdmap_epoch, p);
-      ::decode(flags, p);
-      ::decode(reassert_version, p);
-      ::decode(reqid, p);
     }
 
     partial_decode_needed = false;
diff --git a/src/messages/MOSDOpReply.h b/src/messages/MOSDOpReply.h
index 087f165..eee12b5 100644
--- a/src/messages/MOSDOpReply.h
+++ b/src/messages/MOSDOpReply.h
@@ -194,7 +194,32 @@ public:
   }
   virtual void decode_payload() {
     bufferlist::iterator p = payload.begin();
-    if (header.version < 2) {
+
+    // Always keep here the newest version of decoding order/rule
+    if (header.version == HEAD_VERSION) {
+	::decode(oid, p);
+	::decode(pgid, p);
+	::decode(flags, p);
+	::decode(result, p);
+	::decode(bad_replay_version, p);
+	::decode(osdmap_epoch, p);
+
+	__u32 num_ops = ops.size();
+	::decode(num_ops, p);
+	ops.resize(num_ops);
+	for (unsigned i = 0; i < num_ops; i++)
+	::decode(ops[i].op, p);
+	::decode(retry_attempt, p);
+
+	for (unsigned i = 0; i < num_ops; ++i)
+	::decode(ops[i].rval, p);
+
+	OSDOp::split_osd_op_vector_out_data(ops, data);
+
+	::decode(replay_version, p);
+	::decode(user_version, p);
+	::decode(redirect, p);
+    } else if (header.version < 2) {
       ceph_osd_reply_head head;
       ::decode(head, p);
       ops.resize(head.num_ops);
diff --git a/src/messages/MOSDRepOp.h b/src/messages/MOSDRepOp.h
index a4b0883..f80405a 100644
--- a/src/messages/MOSDRepOp.h
+++ b/src/messages/MOSDRepOp.h
@@ -34,9 +34,14 @@ public:
   // metadata from original request
   osd_reqid_t reqid;
 
+  spg_t pgid;
+
+  bufferlist::iterator p;
+  // Decoding flags. Decoding is only needed for messages caught by pipe reader.
+  bool final_decode_needed;
+
   // subop
   pg_shard_t from;
-  spg_t pgid;
   hobject_t poid;
 
   __u8 acks_wanted;
@@ -64,10 +69,16 @@ public:
   }
 
   virtual void decode_payload() {
-    bufferlist::iterator p = payload.begin();
+    p = payload.begin();
+    // split into partial and final
     ::decode(map_epoch, p);
     ::decode(reqid, p);
     ::decode(pgid, p);
+  }
+
+  void finish_decode() {
+    if (!final_decode_needed)
+      return; // Message is already final decoded
     ::decode(poid, p);
 
     ::decode(acks_wanted, p);
@@ -83,6 +94,7 @@ public:
     ::decode(from, p);
     ::decode(updated_hit_set_history, p);
     ::decode(pg_trim_rollback_to, p);
+    final_decode_needed = false;
   }
 
   virtual void encode_payload(uint64_t features) {
@@ -105,15 +117,17 @@ public:
 
   MOSDRepOp()
     : Message(MSG_OSD_REPOP, HEAD_VERSION, COMPAT_VERSION),
-      map_epoch(0), acks_wanted (0) {}
+      map_epoch(0),
+      final_decode_needed(true), acks_wanted (0) {}
   MOSDRepOp(osd_reqid_t r, pg_shard_t from,
 	    spg_t p, const hobject_t& po, int aw,
 	    epoch_t mape, ceph_tid_t rtid, eversion_t v)
     : Message(MSG_OSD_REPOP, HEAD_VERSION, COMPAT_VERSION),
       map_epoch(mape),
       reqid(r),
-      from(from),
       pgid(p),
+      final_decode_needed(false),
+      from(from),
       poid(po),
       acks_wanted(aw),
       version(v) {
@@ -126,11 +140,12 @@ public:
   const char *get_type_name() const { return "osd_repop"; }
   void print(ostream& out) const {
     out << "osd_repop(" << reqid
-	<< " " << pgid
-	<< " " << poid;
-    out << " v " << version;
-    if (updated_hit_set_history)
-      out << ", has_updated_hit_set_history";
+          << " " << pgid;
+    if (!final_decode_needed) {
+        out << " " << poid << " v " << version;
+      if (updated_hit_set_history)
+        out << ", has_updated_hit_set_history";
+    }
     out << ")";
   }
 };
diff --git a/src/messages/MOSDRepOpReply.h b/src/messages/MOSDRepOpReply.h
index f0faa4c..1632ffb 100644
--- a/src/messages/MOSDRepOpReply.h
+++ b/src/messages/MOSDRepOpReply.h
@@ -46,18 +46,26 @@ public:
   // piggybacked osd state
   eversion_t last_complete_ondisk;
 
+  bufferlist::iterator p;
+  // Decoding flags. Decoding is only needed for messages caught by pipe reader.
+  bool final_decode_needed;
 
   virtual void decode_payload() {
-    bufferlist::iterator p = payload.begin();
+    p = payload.begin();
     ::decode(map_epoch, p);
     ::decode(reqid, p);
     ::decode(pgid, p);
+  }
 
+  void finish_decode() {
+    if (!final_decode_needed)
+      return; // Message is already final decoded
     ::decode(ack_type, p);
     ::decode(result, p);
     ::decode(last_complete_ondisk, p);
 
     ::decode(from, p);
+    final_decode_needed = false;
   }
   virtual void encode_payload(uint64_t features) {
     ::encode(map_epoch, payload);
@@ -91,12 +99,14 @@ public:
     from(from),
     pgid(req->pgid.pgid, req->from.shard),
     ack_type(at),
-    result(result_) {
+    result(result_),
+    final_decode_needed(false) {
     set_tid(req->get_tid());
   }
   MOSDRepOpReply() 
     : Message(MSG_OSD_REPOPREPLY), map_epoch(0),  
-      ack_type(0), result(0) {}
+      ack_type(0), result(0),
+      final_decode_needed(true) {}
 private:
   ~MOSDRepOpReply() {}
 
@@ -105,14 +115,16 @@ public:
 
   void print(ostream& out) const {
     out << "osd_repop_reply(" << reqid
-	<< " " << pgid;
-    if (ack_type & CEPH_OSD_FLAG_ONDISK)
-      out << " ondisk";
-    if (ack_type & CEPH_OSD_FLAG_ONNVRAM)
-      out << " onnvram";
-    if (ack_type & CEPH_OSD_FLAG_ACK)
-      out << " ack";
-    out << ", result = " << result;
+        << " " << pgid;
+    if (!final_decode_needed) {
+      if (ack_type & CEPH_OSD_FLAG_ONDISK)
+        out << " ondisk";
+      if (ack_type & CEPH_OSD_FLAG_ONNVRAM)
+        out << " onnvram";
+      if (ack_type & CEPH_OSD_FLAG_ACK)
+        out << " ack";
+      out << ", result = " << result;
+    }
     out << ")";
   }
 
diff --git a/src/messages/MOSDSubOp.h b/src/messages/MOSDSubOp.h
index f746568..38c303c 100644
--- a/src/messages/MOSDSubOp.h
+++ b/src/messages/MOSDSubOp.h
@@ -172,6 +172,8 @@ public:
     }
   }
 
+  void finish_decode() { }
+
   virtual void encode_payload(uint64_t features) {
     ::encode(map_epoch, payload);
     ::encode(reqid, payload);
diff --git a/src/messages/MOSDSubOpReply.h b/src/messages/MOSDSubOpReply.h
index a084246..81d1b28 100644
--- a/src/messages/MOSDSubOpReply.h
+++ b/src/messages/MOSDSubOpReply.h
@@ -85,6 +85,9 @@ public:
       pgid.shard = shard_id_t::NO_SHARD;
     }
   }
+
+  void finish_decode() { }
+
   virtual void encode_payload(uint64_t features) {
     ::encode(map_epoch, payload);
     ::encode(reqid, payload);
diff --git a/src/mon/MDSMonitor.cc b/src/mon/MDSMonitor.cc
index 2530628..664af7b 100644
--- a/src/mon/MDSMonitor.cc
+++ b/src/mon/MDSMonitor.cc
@@ -1906,7 +1906,8 @@ void MDSMonitor::tick()
     string name;
     while (pending_mdsmap.is_in(mds))
       mds++;
-    mds_gid_t newgid = pending_mdsmap.find_replacement_for(mds, name);
+    mds_gid_t newgid = pending_mdsmap.find_replacement_for(mds, name,
+                         g_conf->mon_force_standby_active);
     if (!newgid)
       break;
 
@@ -1976,7 +1977,8 @@ void MDSMonitor::tick()
       if (info.rank >= 0 &&
 	  info.state != MDSMap::STATE_STANDBY &&
 	  info.state != MDSMap::STATE_STANDBY_REPLAY &&
-	  (sgid = pending_mdsmap.find_replacement_for(info.rank, info.name)) != MDS_GID_NONE) {
+	  (sgid = pending_mdsmap.find_replacement_for(info.rank, info.name, 
+                    g_conf->mon_force_standby_active)) != MDS_GID_NONE) {
 	MDSMap::mds_info_t& si = pending_mdsmap.mds_info[sgid];
 	dout(10) << " replacing " << gid << " " << info.addr << " mds." << info.rank << "." << info.inc
 		 << " " << ceph_mds_state_name(info.state)
@@ -2063,7 +2065,8 @@ void MDSMonitor::tick()
     while (p != failed.end()) {
       mds_rank_t f = *p++;
       string name;  // FIXME
-      mds_gid_t sgid = pending_mdsmap.find_replacement_for(f, name);
+      mds_gid_t sgid = pending_mdsmap.find_replacement_for(f, name,
+          g_conf->mon_force_standby_active);
       if (sgid) {
 	MDSMap::mds_info_t& si = pending_mdsmap.mds_info[sgid];
 	dout(0) << " taking over failed mds." << f << " with " << sgid << "/" << si.name << " " << si.addr << dendl;
diff --git a/src/mon/MonCommands.h b/src/mon/MonCommands.h
index 67936d5..8d09f91 100644
--- a/src/mon/MonCommands.h
+++ b/src/mon/MonCommands.h
@@ -674,11 +674,11 @@ COMMAND("osd pool rename " \
 	"rename <srcpool> to <destpool>", "osd", "rw", "cli,rest")
 COMMAND("osd pool get " \
 	"name=pool,type=CephPoolname " \
-	"name=var,type=CephChoices,strings=size|min_size|crash_replay_interval|pg_num|pgp_num|crush_ruleset|hashpspool|nodelete|nopgchange|nosizechange|write_fadvise_dontneed|noscrub|nodeep-scrub|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|auid|target_max_objects|target_max_bytes|cache_target_dirty_ratio|cache_target_dirty_high_ratio|cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|erasure_code_profile|min_read_recency_for_promote|all|min_write_recency_for_promote|fast_ [...]
+	"name=var,type=CephChoices,strings=size|min_size|crash_replay_interval|pg_num|pgp_num|crush_ruleset|hashpspool|nodelete|nopgchange|nosizechange|write_fadvise_dontneed|noscrub|nodeep-scrub|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|auid|target_max_objects|target_max_bytes|cache_target_dirty_ratio|cache_target_dirty_high_ratio|cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|erasure_code_profile|min_read_recency_for_promote|all|min_write_recency_for_promote|fast_ [...]
 	"get pool parameter <var>", "osd", "r", "cli,rest")
 COMMAND("osd pool set " \
 	"name=pool,type=CephPoolname " \
-	"name=var,type=CephChoices,strings=size|min_size|crash_replay_interval|pg_num|pgp_num|crush_ruleset|hashpspool|nodelete|nopgchange|nosizechange|write_fadvise_dontneed|noscrub|nodeep-scrub|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|use_gmt_hitset|debug_fake_ec_pool|target_max_bytes|target_max_objects|cache_target_dirty_ratio|cache_target_dirty_high_ratio|cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|auid|min_read_recency_for_promote|min_write_recency_for_prom [...]
+	"name=var,type=CephChoices,strings=size|min_size|crash_replay_interval|pg_num|pgp_num|crush_ruleset|hashpspool|nodelete|nopgchange|nosizechange|write_fadvise_dontneed|noscrub|nodeep-scrub|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|use_gmt_hitset|debug_fake_ec_pool|target_max_bytes|target_max_objects|cache_target_dirty_ratio|cache_target_dirty_high_ratio|cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|auid|min_read_recency_for_promote|min_write_recency_for_prom [...]
 	"name=val,type=CephString " \
 	"name=force,type=CephChoices,strings=--yes-i-really-mean-it,req=false", \
 	"set pool parameter <var> to <val>", "osd", "rw", "cli,rest")
diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc
index 040332c..3e7cd35 100644
--- a/src/mon/OSDMonitor.cc
+++ b/src/mon/OSDMonitor.cc
@@ -1429,7 +1429,7 @@ bool OSDMonitor::preprocess_failure(MonOpRequestRef op)
     int from = m->get_orig_source().num();
     if (!osdmap.exists(from) ||
 	osdmap.get_addr(from) != m->get_orig_source_inst().addr ||
-	osdmap.is_down(from)) {
+	(osdmap.is_down(from) && m->if_osd_failed())) {
       dout(5) << "preprocess_failure from dead osd." << from << ", ignoring" << dendl;
       send_incremental(op, m->get_epoch()+1);
       goto didit;
@@ -1635,6 +1635,8 @@ void OSDMonitor::check_failures(utime_t now)
 
 bool OSDMonitor::check_failure(utime_t now, int target_osd, failure_info_t& fi)
 {
+  set<string> reporters_by_subtree;
+  string reporter_subtree_level = g_conf->mon_osd_reporter_subtree_level;
   utime_t orig_grace(g_conf->osd_heartbeat_grace, 0);
   utime_t max_failed_since = fi.get_failed_since();
   utime_t failed_for = now - max_failed_since;
@@ -1663,6 +1665,16 @@ bool OSDMonitor::check_failure(utime_t now, int target_osd, failure_info_t& fi)
     for (map<int,failure_reporter_t>::iterator p = fi.reporters.begin();
 	 p != fi.reporters.end();
 	 ++p) {
+      // get the parent bucket whose type matches with "reporter_subtree_level".
+      // fall back to OSD if the level doesn't exist.
+      map<string, string> reporter_loc = osdmap.crush->get_full_location(p->first);
+      map<string, string>::iterator iter = reporter_loc.find(reporter_subtree_level);
+      if (iter == reporter_loc.end()) {
+	reporters_by_subtree.insert("osd." + to_string(p->first));
+      } else {
+	reporters_by_subtree.insert(iter->second);
+      }
+
       const osd_xinfo_t& xi = osdmap.get_xinfo(p->first);
       utime_t elapsed = now - xi.down_stamp;
       double decay = exp((double)elapsed * decay_k);
@@ -1685,15 +1697,17 @@ bool OSDMonitor::check_failure(utime_t now, int target_osd, failure_info_t& fi)
     return true;
   }
 
+
   if (failed_for >= grace &&
-      ((int)fi.reporters.size() >= g_conf->mon_osd_min_down_reporters)) {
+      (int)reporters_by_subtree.size() >= g_conf->mon_osd_min_down_reporters) {
     dout(1) << " we have enough reporters to mark osd." << target_osd
 	    << " down" << dendl;
     pending_inc.new_state[target_osd] = CEPH_OSD_UP;
 
     mon->clog->info() << osdmap.get_inst(target_osd) << " failed ("
-		     << (int)fi.reporters.size() << " reporters after "
-		     << failed_for << " >= grace " << grace << ")\n";
+		      << (int)reporters_by_subtree.size() << " reporters from different "
+		      << reporter_subtree_level << " after "
+		      << failed_for << " >= grace " << grace << ")\n";
     return true;
   }
   return false;
@@ -2907,7 +2921,9 @@ namespace {
     CACHE_MIN_FLUSH_AGE, CACHE_MIN_EVICT_AGE,
     ERASURE_CODE_PROFILE, MIN_READ_RECENCY_FOR_PROMOTE,
     MIN_WRITE_RECENCY_FOR_PROMOTE, FAST_READ,
-    HIT_SET_GRADE_DECAY_RATE, HIT_SET_SEARCH_LAST_N};
+    HIT_SET_GRADE_DECAY_RATE, HIT_SET_SEARCH_LAST_N,
+    SCRUB_MIN_INTERVAL, SCRUB_MAX_INTERVAL, DEEP_SCRUB_INTERVAL,
+    RECOVERY_PRIORITY, RECOVERY_OP_PRIORITY};
 
   std::set<osd_pool_get_choices>
     subtract_second_from_first(const std::set<osd_pool_get_choices>& first,
@@ -3383,7 +3399,12 @@ bool OSDMonitor::preprocess_command(MonOpRequestRef op)
       ("min_write_recency_for_promote", MIN_WRITE_RECENCY_FOR_PROMOTE)
       ("fast_read", FAST_READ)
       ("hit_set_grade_decay_rate", HIT_SET_GRADE_DECAY_RATE)
-      ("hit_set_search_last_n", HIT_SET_SEARCH_LAST_N);
+      ("hit_set_search_last_n", HIT_SET_SEARCH_LAST_N)
+      ("scrub_min_interval", SCRUB_MIN_INTERVAL)
+      ("scrub_max_interval", SCRUB_MAX_INTERVAL)
+      ("deep_scrub_interval", DEEP_SCRUB_INTERVAL)
+      ("recovery_priority", RECOVERY_PRIORITY)
+      ("recovery_op_priority", RECOVERY_OP_PRIORITY);
 
     typedef std::set<osd_pool_get_choices> choices_set_t;
 
@@ -3562,6 +3583,18 @@ bool OSDMonitor::preprocess_command(MonOpRequestRef op)
 	    f->dump_int("hit_set_search_last_n",
 			p->hit_set_search_last_n);
 	    break;
+	  case SCRUB_MIN_INTERVAL:
+	  case SCRUB_MAX_INTERVAL:
+	  case DEEP_SCRUB_INTERVAL:
+          case RECOVERY_PRIORITY:
+          case RECOVERY_OP_PRIORITY:
+	    for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
+	      if (i->second == *it)
+		break;
+	    }
+	    assert(i != ALL_CHOICES.end());
+	    p->opts.dump(i->first, f.get());
+            break;
 	}
 	f->close_section();
 	f->flush(rdata);
@@ -3683,6 +3716,23 @@ bool OSDMonitor::preprocess_command(MonOpRequestRef op)
           case FAST_READ:
             ss << "fast_read: " << p->fast_read << "\n";
             break;
+	  case SCRUB_MIN_INTERVAL:
+	  case SCRUB_MAX_INTERVAL:
+	  case DEEP_SCRUB_INTERVAL:
+          case RECOVERY_PRIORITY:
+          case RECOVERY_OP_PRIORITY:
+	    for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
+	      if (i->second == *it)
+		break;
+	    }
+	    assert(i != ALL_CHOICES.end());
+	    {
+	      pool_opts_t::key_t key = pool_opts_t::get_opt_desc(i->first).key;
+	      if (p->opts.is_set(key)) {
+		ss << i->first << ": " << p->opts.get(key) << "\n";
+	      }
+	    }
+	    break;
 	}
 	rdata.append(ss.str());
 	ss.str("");
@@ -5088,6 +5138,41 @@ int OSDMonitor::prepare_command_pool_set(map<string,cmd_vartype> &cmdmap,
     } else if (val == "false" || (interr.empty() && n == 0)) {
       p.fast_read = false;
     }
+  } else if (pool_opts_t::is_opt_name(var)) {
+    pool_opts_t::opt_desc_t desc = pool_opts_t::get_opt_desc(var);
+    switch (desc.type) {
+    case pool_opts_t::STR:
+      if (val.empty()) {
+	p.opts.unset(desc.key);
+      } else {
+	p.opts.set(desc.key, static_cast<std::string>(val));
+      }
+      break;
+    case pool_opts_t::INT:
+      if (interr.length()) {
+	ss << "error parsing integer value '" << val << "': " << interr;
+	return -EINVAL;
+      }
+      if (n == 0) {
+	p.opts.unset(desc.key);
+      } else {
+	p.opts.set(desc.key, static_cast<int>(n));
+      }
+      break;
+    case pool_opts_t::DOUBLE:
+      if (floaterr.length()) {
+	ss << "error parsing floating point value '" << val << "': " << floaterr;
+	return -EINVAL;
+      }
+      if (f == 0) {
+	p.opts.unset(desc.key);
+      } else {
+	p.opts.set(desc.key, static_cast<double>(f));
+      }
+      break;
+    default:
+      assert(!"unknown type");
+    }
   } else {
     ss << "unrecognized variable '" << var << "'";
     return -EINVAL;
diff --git a/src/mount/mtab.c b/src/mount/mtab.c
index d17f7a5..4dc82bb 100644
--- a/src/mount/mtab.c
+++ b/src/mount/mtab.c
@@ -289,4 +289,6 @@ update_mtab_entry(const char *spec, const char *node, const char *type,
 
 	free(mnt.mnt_fsname);
 	free(mnt.mnt_dir);
+	free(mnt.mnt_type);
+	free(mnt.mnt_opts);
 }
diff --git a/src/msg/Message.h b/src/msg/Message.h
index ddba0e5..a53f282 100644
--- a/src/msg/Message.h
+++ b/src/msg/Message.h
@@ -294,8 +294,7 @@ protected:
   virtual ~Message() {
     if (byte_throttler)
       byte_throttler->put(payload.length() + middle.length() + data.length());
-    if (msg_throttler)
-      msg_throttler->put();
+    release_message_throttle();
     /* call completion hooks (if any) */
     if (completion_hook)
       completion_hook->complete(0);
@@ -347,6 +346,11 @@ public:
     data.clear();
     clear_buffers(); // let subclass drop buffers as well
   }
+  void release_message_throttle() {
+    if (msg_throttler)
+      msg_throttler->put();
+    msg_throttler = nullptr;
+  }
 
   bool empty_payload() const { return payload.length() == 0; }
   bufferlist& get_payload() { return payload; }
diff --git a/src/msg/Messenger.h b/src/msg/Messenger.h
index 9e70eb7..8acfd68 100644
--- a/src/msg/Messenger.h
+++ b/src/msg/Messenger.h
@@ -547,6 +547,7 @@ public:
    * of one reference to it.
    */
   void ms_fast_dispatch(Message *m) {
+    m->set_dispatch_stamp(ceph_clock_now(cct));
     for (list<Dispatcher*>::iterator p = fast_dispatchers.begin();
 	 p != fast_dispatchers.end();
 	 ++p) {
diff --git a/src/ocf/Makefile.in b/src/ocf/Makefile.in
index 2c085f2..ac86dc8 100644
--- a/src/ocf/Makefile.in
+++ b/src/ocf/Makefile.in
@@ -187,6 +187,7 @@ CXXCPP = @CXXCPP@
 CXXDEPMODE = @CXXDEPMODE@
 CXXFLAGS = @CXXFLAGS@
 CYGPATH_W = @CYGPATH_W@
+CYTHON_CHECK = @CYTHON_CHECK@
 DEFS = @DEFS@
 DEPDIR = @DEPDIR@
 DLLTOOL = @DLLTOOL@
@@ -273,6 +274,7 @@ PYTHON_PLATFORM = @PYTHON_PLATFORM@
 PYTHON_PREFIX = @PYTHON_PREFIX@
 PYTHON_VERSION = @PYTHON_VERSION@
 RANLIB = @RANLIB@
+RDYNAMIC_FLAG = @RDYNAMIC_FLAG@
 RESOLV_LIBS = @RESOLV_LIBS@
 RPM_RELEASE = @RPM_RELEASE@
 SED = @SED@
diff --git a/src/os/DBObjectMap.h b/src/os/DBObjectMap.h
index 00ce46e..4b81acb 100644
--- a/src/os/DBObjectMap.h
+++ b/src/os/DBObjectMap.h
@@ -2,7 +2,7 @@
 #ifndef DBOBJECTMAP_DB_H
 #define DBOBJECTMAP_DB_H
 
-#include "include/buffer.h"
+#include "include/buffer_fwd.h"
 #include <set>
 #include <map>
 #include <string>
diff --git a/src/os/FileJournal.cc b/src/os/FileJournal.cc
index 86d269b..34c2f44 100644
--- a/src/os/FileJournal.cc
+++ b/src/os/FileJournal.cc
@@ -1405,7 +1405,6 @@ void FileJournal::do_aio_write(bufferlist& bl)
  */
 int FileJournal::write_aio_bl(off64_t& pos, bufferlist& bl, uint64_t seq)
 {
-  Mutex::Locker locker(aio_lock);
   align_bl(pos, bl);
 
   dout(20) << "write_aio_bl " << pos << "~" << bl.length() << " seq " << seq << dendl;
@@ -1427,6 +1426,9 @@ int FileJournal::write_aio_bl(off64_t& pos, bufferlist& bl, uint64_t seq)
     bufferlist tbl;
     bl.splice(0, len, &tbl);  // move bytes from bl -> tbl
 
+    // lock only aio_queue, current aio, aio_num, aio_bytes, which may be 
+    // modified in check_aio_completion
+    aio_lock.Lock();
     aio_queue.push_back(aio_info(tbl, pos, bl.length() > 0 ? 0 : seq));
     aio_info& aio = aio_queue.back();
     aio.iov = iov;
@@ -1438,13 +1440,20 @@ int FileJournal::write_aio_bl(off64_t& pos, bufferlist& bl, uint64_t seq)
 
     aio_num++;
     aio_bytes += aio.len;
+   
+    // need to save current aio len to update write_pos later because current
+    // aio could be erased from aio_queue once it is done
+    uint64_t cur_len = aio.len;
+    // unlock aio_lock because following io_submit might take time to return
+    aio_lock.Unlock();
 
     iocb *piocb = &aio.iocb;
     int attempts = 10;
     do {
       int r = io_submit(aio_ctx, 1, &piocb);
+      dout(20) << "write_aio_bl io_submit return value: " << r << dendl;
       if (r < 0) {
-	derr << "io_submit to " << aio.off << "~" << aio.len
+	derr << "io_submit to " << aio.off << "~" << cur_len
 	     << " got " << cpp_strerror(r) << dendl;
 	if (r == -EAGAIN && attempts-- > 0) {
 	  usleep(500);
@@ -1455,9 +1464,11 @@ int FileJournal::write_aio_bl(off64_t& pos, bufferlist& bl, uint64_t seq)
 	break;
       }
     } while (true);
-    pos += aio.len;
+    pos += cur_len;
   }
+  aio_lock.Lock();
   write_finish_cond.Signal();
+  aio_lock.Unlock();
   return 0;
 }
 #endif
diff --git a/src/os/FileStore.cc b/src/os/FileStore.cc
index 3cfb13f..d58fa5f 100644
--- a/src/os/FileStore.cc
+++ b/src/os/FileStore.cc
@@ -123,13 +123,12 @@ static CompatSet get_fs_supported_compat_set() {
   return compat;
 }
 
-
-int FileStore::peek_journal_fsid(uuid_d *fsid)
+int FileStore::get_block_device_fsid(const string& path, uuid_d *fsid)
 {
   // make sure we don't try to use aio or direct_io (and get annoying
   // error messages from failing to do so); performance implications
   // should be irrelevant for this use
-  FileJournal j(*fsid, 0, 0, journalpath.c_str(), false, false);
+  FileJournal j(*fsid, 0, 0, path.c_str(), false, false);
   return j.peek_fsid(*fsid);
 }
 
@@ -927,6 +926,10 @@ int FileStore::mkfs()
   if (ret)
     goto close_fsid_fd;
 
+  ret = write_meta("type", "filestore");
+  if (ret)
+    goto close_fsid_fd;
+
   dout(1) << "mkfs done in " << basedir << dendl;
   ret = 0;
 
@@ -1849,7 +1852,6 @@ void FileStore::op_queue_reserve_throttle(Op *o, ThreadPool::TPHandle *handle)
   logger->set(l_os_oq_max_ops, max_ops);
   logger->set(l_os_oq_max_bytes, max_bytes);
 
-  utime_t start = ceph_clock_now(g_ceph_context);
   if (handle)
     handle->suspend_tp_timeout();
   if (throttle_ops.should_wait(1) || 
@@ -1863,9 +1865,6 @@ void FileStore::op_queue_reserve_throttle(Op *o, ThreadPool::TPHandle *handle)
   if (handle)
     handle->reset_tp_timeout();
 
-  utime_t end = ceph_clock_now(g_ceph_context);
-  logger->tinc(l_os_queue_lat, end - start);
-
   logger->set(l_os_oq_ops, throttle_ops.get_current());
   logger->set(l_os_oq_bytes, throttle_bytes.get_current());
 }
@@ -1960,6 +1959,7 @@ int FileStore::queue_transactions(Sequencer *posr, list<Transaction*> &tls,
     return 0;
   }
 
+  utime_t start = ceph_clock_now(g_ceph_context);
   // set up the sequencer
   OpSequencer *osr;
   assert(posr);
@@ -2011,6 +2011,8 @@ int FileStore::queue_transactions(Sequencer *posr, list<Transaction*> &tls,
       assert(0);
     }
     submit_manager.op_submit_finish(op_num);
+    utime_t end = ceph_clock_now(g_ceph_context);
+    logger->tinc(l_os_queue_lat, end - start);
     return 0;
   }
 
@@ -2031,6 +2033,8 @@ int FileStore::queue_transactions(Sequencer *posr, list<Transaction*> &tls,
     if (ondisk)
       apply_manager.add_waiter(op_num, ondisk);
     submit_manager.op_submit_finish(op_num);
+    utime_t end = ceph_clock_now(g_ceph_context);
+    logger->tinc(l_os_queue_lat, end - start);
     return 0;
   }
 
@@ -2066,6 +2070,8 @@ int FileStore::queue_transactions(Sequencer *posr, list<Transaction*> &tls,
   submit_manager.op_submit_finish(op);
   apply_manager.op_apply_finish(op);
 
+  utime_t end = ceph_clock_now(g_ceph_context);
+  logger->tinc(l_os_queue_lat, end - start);
   return r;
 }
 
diff --git a/src/os/FileStore.h b/src/os/FileStore.h
index c972ebe..be5e668 100644
--- a/src/os/FileStore.h
+++ b/src/os/FileStore.h
@@ -52,14 +52,14 @@ using namespace std;
 
 #if defined(__linux__)
 # ifndef BTRFS_SUPER_MAGIC
-static const __SWORD_TYPE BTRFS_SUPER_MAGIC(0x9123683E);
+#define BTRFS_SUPER_MAGIC 0x9123683E
 # endif
 # ifndef XFS_SUPER_MAGIC
-static const __SWORD_TYPE XFS_SUPER_MAGIC(0x58465342);
+#define XFS_SUPER_MAGIC 0x58465342
+# endif
+# ifndef ZFS_SUPER_MAGIC
+#define ZFS_SUPER_MAGIC 0x2fc12fc1
 # endif
-#ifndef ZFS_SUPER_MAGIC
-static const __SWORD_TYPE ZFS_SUPER_MAGIC(0x2fc12fc1);
-#endif
 #endif
 
 
@@ -96,7 +96,7 @@ public:
     return target_version;
   }
 
-  int peek_journal_fsid(uuid_d *fsid);
+  static int get_block_device_fsid(const string& path, uuid_d *fsid);
 
   struct FSPerfTracker {
     PerfCounters::avg_tracker<uint64_t> os_commit_latency;
diff --git a/src/os/HashIndex.h b/src/os/HashIndex.h
index cacdbc8..6f5a22d 100644
--- a/src/os/HashIndex.h
+++ b/src/os/HashIndex.h
@@ -15,7 +15,7 @@
 #ifndef CEPH_HASHINDEX_H
 #define CEPH_HASHINDEX_H
 
-#include "include/buffer.h"
+#include "include/buffer_fwd.h"
 #include "include/encoding.h"
 #include "LFNIndex.h"
 
diff --git a/src/os/Journal.h b/src/os/Journal.h
index d5b9186..400b1ea 100644
--- a/src/os/Journal.h
+++ b/src/os/Journal.h
@@ -18,7 +18,7 @@
 
 #include <errno.h>
 
-#include "include/buffer.h"
+#include "include/buffer_fwd.h"
 #include "include/Context.h"
 #include "common/Finisher.h"
 #include "common/TrackedOp.h"
diff --git a/src/os/JournalingObjectStore.h b/src/os/JournalingObjectStore.h
index 42d13f6..bba3767 100644
--- a/src/os/JournalingObjectStore.h
+++ b/src/os/JournalingObjectStore.h
@@ -132,7 +132,7 @@ public:
   JournalingObjectStore(const std::string& path)
     : ObjectStore(path),
       journal(NULL),
-      finisher(g_ceph_context),
+      finisher(g_ceph_context, "JournalObjectStore"),
       apply_manager(journal, finisher),
       replaying(false) {}
 
diff --git a/src/os/KeyValueStore.cc b/src/os/KeyValueStore.cc
index 71a97ad..81b07d7 100644
--- a/src/os/KeyValueStore.cc
+++ b/src/os/KeyValueStore.cc
@@ -718,6 +718,10 @@ int KeyValueStore::mkfs()
     delete store;
   }
 
+  ret = write_meta("type", "keyvaluestore");
+  if (ret < 0)
+    goto close_fsid_fd;
+
   dout(1) << "mkfs done in " << basedir << dendl;
   ret = 0;
 
@@ -1040,6 +1044,7 @@ int KeyValueStore::queue_transactions(Sequencer *posr, list<Transaction*> &tls,
                                       TrackedOpRef osd_op,
                                       ThreadPool::TPHandle *handle)
 {
+  utime_t start = ceph_clock_now(g_ceph_context);
   Context *onreadable;
   Context *ondisk;
   Context *onreadable_sync;
@@ -1067,6 +1072,8 @@ int KeyValueStore::queue_transactions(Sequencer *posr, list<Transaction*> &tls,
   dout(5) << "queue_transactions (trailing journal) " << " " << tls <<dendl;
   queue_op(osr, o);
 
+  utime_t end = ceph_clock_now(g_ceph_context);
+  perf_logger->tinc(l_os_queue_lat, end - start);
   return 0;
 }
 
@@ -1122,7 +1129,6 @@ void KeyValueStore::op_queue_reserve_throttle(Op *o, ThreadPool::TPHandle *handl
   perf_logger->set(l_os_oq_max_ops, max_ops);
   perf_logger->set(l_os_oq_max_bytes, max_bytes);
 
-  utime_t start = ceph_clock_now(g_ceph_context);
   if (handle)
     handle->suspend_tp_timeout();
   if (throttle_ops.should_wait(1) ||
@@ -1136,9 +1142,6 @@ void KeyValueStore::op_queue_reserve_throttle(Op *o, ThreadPool::TPHandle *handl
   if (handle)
     handle->reset_tp_timeout();
 
-  utime_t end = ceph_clock_now(g_ceph_context);
-  perf_logger->tinc(l_os_queue_lat, end - start);
-
   perf_logger->set(l_os_oq_ops, throttle_ops.get_current());
   perf_logger->set(l_os_oq_bytes, throttle_bytes.get_current());
 }
@@ -1190,8 +1193,10 @@ void KeyValueStore::_finish_op(OpSequencer *osr)
   if (o->onreadable_sync) {
     o->onreadable_sync->complete(0);
   }
-  op_finisher.queue(o->onreadable);
-  op_finisher.queue(to_queue);
+  if (o->onreadable)
+    op_finisher.queue(o->onreadable);
+  if (!to_queue.empty())
+    op_finisher.queue(to_queue);
   delete o;
 }
 
diff --git a/src/os/KeyValueStore.h b/src/os/KeyValueStore.h
index 4307901..914ce1d 100644
--- a/src/os/KeyValueStore.h
+++ b/src/os/KeyValueStore.h
@@ -494,10 +494,6 @@ class KeyValueStore : public ObjectStore,
   uint32_t get_target_version() {
     return target_version;
   }
-  int peek_journal_fsid(uuid_d *id) {
-    *id = fsid;
-    return 0;
-  }
 
   int write_version_stamp();
   int mount();
diff --git a/src/os/MemStore.cc b/src/os/MemStore.cc
index 080f731..f87e628 100644
--- a/src/os/MemStore.cc
+++ b/src/os/MemStore.cc
@@ -41,12 +41,6 @@ bool operator>(const MemStore::CollectionRef& l,
 }
 
 
-int MemStore::peek_journal_fsid(uuid_d *fsid)
-{
-  *fsid = uuid_d();
-  return 0;
-}
-
 int MemStore::mount()
 {
   int r = _load();
@@ -230,6 +224,10 @@ int MemStore::mkfs()
   if (r < 0)
     return r;
 
+  r = write_meta("type", "memstore");
+  if (r < 0)
+    return r;
+
   return 0;
 }
 
diff --git a/src/os/MemStore.h b/src/os/MemStore.h
index efaa2cf..fc047c0 100644
--- a/src/os/MemStore.h
+++ b/src/os/MemStore.h
@@ -348,8 +348,6 @@ public:
       sharded(false) {}
   ~MemStore() { }
 
-  int peek_journal_fsid(uuid_d *fsid);
-
   bool test_mount_in_use() {
     return false;
   }
diff --git a/src/os/ObjectStore.cc b/src/os/ObjectStore.cc
index aa62fbd..e47a94c 100644
--- a/src/os/ObjectStore.cc
+++ b/src/os/ObjectStore.cc
@@ -85,6 +85,20 @@ ObjectStore *ObjectStore::create(CephContext *cct,
   return NULL;
 }
 
+int ObjectStore::probe_block_device_fsid(
+  const string& path,
+  uuid_d *fsid)
+{
+  int r;
+
+  // okay, try FileStore (journal).
+  r = FileStore::get_block_device_fsid(path, fsid);
+  if (r == 0)
+    return r;
+
+  return -EINVAL;
+}
+
 int ObjectStore::write_meta(const std::string& key,
 			    const std::string& value)
 {
diff --git a/src/os/ObjectStore.h b/src/os/ObjectStore.h
index 3b00ac7..8f2b749 100644
--- a/src/os/ObjectStore.h
+++ b/src/os/ObjectStore.h
@@ -114,6 +114,16 @@ public:
 			     const string& journal,
 			     osflagbits_t flags = 0);
 
+  /**
+   * probe a block device to learn the uuid of the owning OSD
+   *
+   * @param path path to device
+   * @param fsid [out] osd uuid
+   * @return 0 on success, -EINVAL if the device could not be identified
+   */
+  static int probe_block_device_fsid(const string& path,
+				     uuid_d *fsid);
+
   Logger *logger;
 
   /**
@@ -1827,11 +1837,6 @@ public:
   virtual void collect_metadata(map<string,string> *pm) { }
 
   /**
-   * check the journal uuid/fsid, without opening
-   */
-  virtual int peek_journal_fsid(uuid_d *fsid) = 0;
-
-  /**
    * write_meta - write a simple configuration key out-of-band
    *
    * Write a simple key/value pair for basic store configuration
diff --git a/src/os/WBThrottle.cc b/src/os/WBThrottle.cc
index af3a888..04c6922 100644
--- a/src/os/WBThrottle.cc
+++ b/src/os/WBThrottle.cc
@@ -262,7 +262,6 @@ void WBThrottle::clear_object(const ghobject_t &hoid)
 void WBThrottle::throttle()
 {
   Mutex::Locker l(lock);
-  while (!stopping && beyond_limit()) {
+  while (!stopping && need_flush())
     cond.Wait(lock);
-  }
 }
diff --git a/src/os/WBThrottle.h b/src/os/WBThrottle.h
index d951943..f06ec87 100644
--- a/src/os/WBThrottle.h
+++ b/src/os/WBThrottle.h
@@ -18,7 +18,6 @@
 #include "include/unordered_map.h"
 #include <boost/tuple/tuple.hpp>
 #include "include/memory.h"
-#include "include/buffer.h"
 #include "common/Formatter.h"
 #include "common/hobject.h"
 #include "include/interval_set.h"
@@ -137,6 +136,14 @@ private:
     else
       return true;
   }
+  bool need_flush() const {
+    if (cur_ios < io_limits.second &&
+	pending_wbs.size() < fd_limits.second &&
+	cur_size < size_limits.second)
+      return false;
+    else
+      return true;
+  }
 
 public:
   WBThrottle(CephContext *cct);
diff --git a/src/os/fs/XFS.h b/src/os/fs/XFS.h
index 1c3c3c4..7262f76 100644
--- a/src/os/fs/XFS.h
+++ b/src/os/fs/XFS.h
@@ -18,7 +18,7 @@
 #include "FS.h"
 
 # ifndef XFS_SUPER_MAGIC
-static const __SWORD_TYPE XFS_SUPER_MAGIC(0x58465342);
+#define XFS_SUPER_MAGIC 0x58465342
 # endif
 
 class XFS : public FS {
diff --git a/src/osd/ECUtil.cc b/src/osd/ECUtil.cc
index 46a16c3..53fe1a2 100644
--- a/src/osd/ECUtil.cc
+++ b/src/osd/ECUtil.cc
@@ -137,6 +137,22 @@ int ECUtil::encode(
   return 0;
 }
 
+void ECUtil::HashInfo::append(uint64_t old_size,
+			      map<int, bufferlist> &to_append) {
+  assert(to_append.size() == cumulative_shard_hashes.size());
+  assert(old_size == total_chunk_size);
+  uint64_t size_to_append = to_append.begin()->second.length();
+  for (map<int, bufferlist>::iterator i = to_append.begin();
+       i != to_append.end();
+       ++i) {
+    assert(size_to_append == i->second.length());
+    assert((unsigned)i->first < cumulative_shard_hashes.size());
+    uint32_t new_hash = i->second.crc32c(cumulative_shard_hashes[i->first]);
+    cumulative_shard_hashes[i->first] = new_hash;
+  }
+  total_chunk_size += size_to_append;
+}
+
 void ECUtil::HashInfo::encode(bufferlist &bl) const
 {
   ENCODE_START(1, 1, bl);
diff --git a/src/osd/ECUtil.h b/src/osd/ECUtil.h
index 08b7f87..8e1261c 100644
--- a/src/osd/ECUtil.h
+++ b/src/osd/ECUtil.h
@@ -20,7 +20,7 @@
 
 #include "include/memory.h"
 #include "erasure-code/ErasureCodeInterface.h"
-#include "include/buffer.h"
+#include "include/buffer_fwd.h"
 #include "include/assert.h"
 #include "include/encoding.h"
 #include "common/Formatter.h"
@@ -112,20 +112,7 @@ public:
   HashInfo(unsigned num_chunks)
   : total_chunk_size(0),
     cumulative_shard_hashes(num_chunks, -1) {}
-  void append(uint64_t old_size, map<int, bufferlist> &to_append) {
-    assert(to_append.size() == cumulative_shard_hashes.size());
-    assert(old_size == total_chunk_size);
-    uint64_t size_to_append = to_append.begin()->second.length();
-    for (map<int, bufferlist>::iterator i = to_append.begin();
-	 i != to_append.end();
-	 ++i) {
-      assert(size_to_append == i->second.length());
-      assert((unsigned)i->first < cumulative_shard_hashes.size());
-      uint32_t new_hash = i->second.crc32c(cumulative_shard_hashes[i->first]);
-      cumulative_shard_hashes[i->first] = new_hash;
-    }
-    total_chunk_size += size_to_append;
-  }
+  void append(uint64_t old_size, map<int, bufferlist> &to_append);
   void clear() {
     total_chunk_size = 0;
     cumulative_shard_hashes = vector<uint32_t>(
diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc
index c2267ef..9911548 100644
--- a/src/osd/OSD.cc
+++ b/src/osd/OSD.cc
@@ -2177,6 +2177,8 @@ void OSD::create_logger()
       "Latency of client operations (including queue time)", "lat");       // client op latency
   osd_plb.add_time_avg(l_osd_op_process_lat, "op_process_latency", 
       "Latency of client operations (excluding queue time)");   // client op process latency
+  osd_plb.add_time_avg(l_osd_op_prepare_lat, "op_prepare_latency",
+      "Latency of client operations (excluding queue time and wait for finished)"); // client op prepare latency
 
   osd_plb.add_u64_counter(l_osd_op_r,      "op_r", 
       "Client read operations");        // client reads
@@ -2186,6 +2188,8 @@ void OSD::create_logger()
       "Latency of read operation (including queue time)");    // client read latency
   osd_plb.add_time_avg(l_osd_op_r_process_lat, "op_r_process_latency", 
       "Latency of read operation (excluding queue time)");   // client read process latency
+  osd_plb.add_time_avg(l_osd_op_r_prepare_lat, "op_r_prepare_latency",
+      "Latency of read operations (excluding queue time and wait for finished)"); // client read prepare latency
   osd_plb.add_u64_counter(l_osd_op_w,      "op_w", 
       "Client write operations");        // client writes
   osd_plb.add_u64_counter(l_osd_op_w_inb,  "op_w_in_bytes", 
@@ -2196,6 +2200,8 @@ void OSD::create_logger()
       "Latency of write operation (including queue time)");    // client write latency
   osd_plb.add_time_avg(l_osd_op_w_process_lat, "op_w_process_latency", 
       "Latency of write operation (excluding queue time)");   // client write process latency
+  osd_plb.add_time_avg(l_osd_op_w_prepare_lat, "op_w_prepare_latency",
+      "Latency of write operations (excluding queue time and wait for finished)"); // client write prepare latency
   osd_plb.add_u64_counter(l_osd_op_rw,     "op_rw", 
       "Client read-modify-write operations");       // client rmw
   osd_plb.add_u64_counter(l_osd_op_rw_inb, "op_rw_in_bytes", 
@@ -2208,6 +2214,8 @@ void OSD::create_logger()
       "Latency of read-modify-write operation (including queue time)");   // client rmw latency
   osd_plb.add_time_avg(l_osd_op_rw_process_lat, "op_rw_process_latency", 
       "Latency of read-modify-write operation (excluding queue time)");   // client rmw process latency
+  osd_plb.add_time_avg(l_osd_op_rw_prepare_lat, "op_rw_prepare_latency", 
+      "Latency of read-modify-write operations (excluding queue time and wait for finished)"); // client rmw prepare latency
 
   osd_plb.add_u64_counter(l_osd_sop,       "subop", "Suboperations");         // subops
   osd_plb.add_u64_counter(l_osd_sop_inb,   "subop_in_bytes", "Suboperations total size");     // subop in bytes
@@ -2234,6 +2242,10 @@ void OSD::create_logger()
 
   osd_plb.add_u64(l_osd_loadavg, "loadavg", "CPU load");
   osd_plb.add_u64(l_osd_buf, "buffer_bytes", "Total allocated buffer size");       // total ceph::buffer bytes
+  osd_plb.add_u64(l_osd_history_alloc_bytes, "history_alloc_Mbytes");       // total ceph::buffer bytes in history
+  osd_plb.add_u64(l_osd_history_alloc_num, "history_alloc_num");       // total ceph::buffer num in history
+  osd_plb.add_u64(l_osd_cached_crc, "cached_crc", "Total number getting crc from crc_cache"); // total ceph::buffer buffer_cached_crc
+  osd_plb.add_u64(l_osd_cached_crc_adjusted, "cached_crc_adjusted", "Total number getting crc from crc_cache with adjusting"); // total ceph::buffer buffer_cached_crc_adjusted
 
   osd_plb.add_u64(l_osd_pg, "numpg", "Placement groups");   // num pgs
   osd_plb.add_u64(l_osd_pg_primary, "numpg_primary", "Placement groups for which this osd is primary"); // num primary pgs
@@ -3950,6 +3962,10 @@ void OSD::tick()
   dout(5) << "tick" << dendl;
 
   logger->set(l_osd_buf, buffer::get_total_alloc());
+  logger->set(l_osd_history_alloc_bytes, SHIFT_ROUND_UP(buffer::get_history_alloc_bytes(), 20));
+  logger->set(l_osd_history_alloc_num, buffer::get_history_alloc_num());
+  logger->set(l_osd_cached_crc, buffer::get_cached_crc());
+  logger->set(l_osd_cached_crc_adjusted, buffer::get_cached_crc_adjusted());
 
   if (is_active() || is_waiting_for_healthy()) {
     map_lock.get_read();
@@ -5933,6 +5949,10 @@ void OSD::_dispatch(Message *m)
   dout(20) << "_dispatch " << m << " " << *m << dendl;
 
   logger->set(l_osd_buf, buffer::get_total_alloc());
+  logger->set(l_osd_history_alloc_bytes, SHIFT_ROUND_UP(buffer::get_history_alloc_bytes(), 20));
+  logger->set(l_osd_history_alloc_num, buffer::get_history_alloc_num());
+  logger->set(l_osd_cached_crc, buffer::get_cached_crc());
+  logger->set(l_osd_cached_crc_adjusted, buffer::get_cached_crc_adjusted());
 
   switch (m->get_type()) {
 
@@ -5985,6 +6005,8 @@ void OSD::_dispatch(Message *m)
   }
 
   logger->set(l_osd_buf, buffer::get_total_alloc());
+  logger->set(l_osd_history_alloc_bytes, SHIFT_ROUND_UP(buffer::get_history_alloc_bytes(), 20));
+  logger->set(l_osd_history_alloc_num, buffer::get_history_alloc_num());
 
 }
 
@@ -6051,19 +6073,25 @@ bool OSD::scrub_random_backoff()
   return false;
 }
 
-OSDService::ScrubJob::ScrubJob(const spg_t& pg, const utime_t& timestamp, bool must)
+OSDService::ScrubJob::ScrubJob(const spg_t& pg, const utime_t& timestamp,
+			       double pool_scrub_min_interval,
+			       double pool_scrub_max_interval, bool must)
   : pgid(pg),
     sched_time(timestamp),
     deadline(timestamp)
 {
   // if not explicitly requested, postpone the scrub with a random delay
   if (!must) {
-    sched_time += g_conf->osd_scrub_min_interval;
-    if (g_conf->osd_scrub_interval_randomize_ratio > 0) {
-      sched_time += rand() % (int)(g_conf->osd_scrub_min_interval *
-				   g_conf->osd_scrub_interval_randomize_ratio);
-    }
-    deadline += g_conf->osd_scrub_max_interval;
+    double scrub_min_interval = pool_scrub_min_interval > 0 ?
+      pool_scrub_min_interval : g_conf->osd_scrub_min_interval;
+    double scrub_max_interval = pool_scrub_max_interval > 0 ?
+      pool_scrub_max_interval : g_conf->osd_scrub_max_interval;
+
+    sched_time += scrub_min_interval;
+    double r = static_cast<double>(rand()) / RAND_MAX;  // in [0, 1]; cast avoids integer division
+    sched_time +=
+      scrub_min_interval * g_conf->osd_scrub_interval_randomize_ratio * r;
+    deadline += scrub_max_interval;
   }
 }
 
@@ -6154,7 +6182,7 @@ void OSD::sched_scrub()
 
       if (scrub.sched_time > now) {
 	// save ourselves some effort
-	dout(10) << "sched_scrub " << scrub.pgid << " schedued at " << scrub.sched_time
+	dout(10) << "sched_scrub " << scrub.pgid << " scheduled at " << scrub.sched_time
 		 << " > " << now << dendl;
 	break;
       }
@@ -6488,6 +6516,7 @@ void OSD::handle_osd_map(MOSDMap *m)
 
   bool do_shutdown = false;
   bool do_restart = false;
+  bool network_error = false;
   if (osdmap->get_epoch() > 0 &&
       is_active()) {
     if (!osdmap->exists(whoami)) {
@@ -6539,16 +6568,22 @@ void OSD::handle_osd_map(MOSDMap *m)
 	avoid_ports.insert(hb_front_server_messenger->get_myaddr().get_port());
 
 	int r = cluster_messenger->rebind(avoid_ports);
-	if (r != 0)
+	if (r != 0) {
 	  do_shutdown = true;  // FIXME: do_restart?
+          network_error = true;
+        }
 
 	r = hb_back_server_messenger->rebind(avoid_ports);
-	if (r != 0)
+	if (r != 0) {
 	  do_shutdown = true;  // FIXME: do_restart?
+          network_error = true;
+        }
 
 	r = hb_front_server_messenger->rebind(avoid_ports);
-	if (r != 0)
+	if (r != 0) {
 	  do_shutdown = true;  // FIXME: do_restart?
+          network_error = true;
+        }
 
 	hbclient_messenger->mark_down_all();
 
@@ -6598,6 +6633,14 @@ void OSD::handle_osd_map(MOSDMap *m)
   else if (do_shutdown) {
     osd_lock.Unlock();
     shutdown();
+    if (network_error) {
+      map<int,pair<utime_t,entity_inst_t>>::iterator it = failure_pending.begin();
+      while (it != failure_pending.end()) {
+        dout(10) << "handle_osd_ping canceling in-flight failure report for osd." << it->first << dendl;
+        send_still_alive(osdmap->get_epoch(), it->second.second);
+        failure_pending.erase(it++);
+      }
+    }
     osd_lock.Lock();
   }
   else if (is_preboot()) {
@@ -6609,7 +6652,6 @@ void OSD::handle_osd_map(MOSDMap *m)
   else if (do_restart)
     start_boot();
 
-
   m->put();
 }
 
diff --git a/src/osd/OSD.h b/src/osd/OSD.h
index 8c0cd8e..367d236 100644
--- a/src/osd/OSD.h
+++ b/src/osd/OSD.h
@@ -67,21 +67,25 @@ enum {
   l_osd_op_outb,
   l_osd_op_lat,
   l_osd_op_process_lat,
+  l_osd_op_prepare_lat,
   l_osd_op_r,
   l_osd_op_r_outb,
   l_osd_op_r_lat,
   l_osd_op_r_process_lat,
+  l_osd_op_r_prepare_lat,
   l_osd_op_w,
   l_osd_op_w_inb,
   l_osd_op_w_rlat,
   l_osd_op_w_lat,
   l_osd_op_w_process_lat,
+  l_osd_op_w_prepare_lat,
   l_osd_op_rw,
   l_osd_op_rw_inb,
   l_osd_op_rw_outb,
   l_osd_op_rw_rlat,
   l_osd_op_rw_lat,
   l_osd_op_rw_process_lat,
+  l_osd_op_rw_prepare_lat,
 
   l_osd_sop,
   l_osd_sop_inb,
@@ -106,6 +110,10 @@ enum {
 
   l_osd_loadavg,
   l_osd_buf,
+  l_osd_history_alloc_bytes,
+  l_osd_history_alloc_num,
+  l_osd_cached_crc,
+  l_osd_cached_crc_adjusted,
 
   l_osd_pg,
   l_osd_pg_primary,
@@ -590,15 +598,19 @@ public:
     /// the hard upper bound of scrub time
     utime_t deadline;
     ScrubJob() {}
-    explicit ScrubJob(const spg_t& pg, const utime_t& timestamp, bool must = true);
+    explicit ScrubJob(const spg_t& pg, const utime_t& timestamp,
+		      double pool_scrub_min_interval = 0,
+		      double pool_scrub_max_interval = 0, bool must = true);
     /// order the jobs by sched_time
     bool operator<(const ScrubJob& rhs) const;
   };
   set<ScrubJob> sched_scrub_pg;
 
   /// @returns the scrub_reg_stamp used for unregister the scrub job
-  utime_t reg_pg_scrub(spg_t pgid, utime_t t, bool must) {
-    ScrubJob scrub(pgid, t, must);
+  utime_t reg_pg_scrub(spg_t pgid, utime_t t, double pool_scrub_min_interval,
+		       double pool_scrub_max_interval, bool must) {
+    ScrubJob scrub(pgid, t, pool_scrub_min_interval, pool_scrub_max_interval,
+		   must);
     Mutex::Locker l(sched_scrub_lock);
     sched_scrub_pg.insert(scrub);
     return scrub.sched_time;
@@ -1617,9 +1629,9 @@ private:
       PrioritizedQueue< pair<PGRef, PGQueueable>, entity_inst_t> pqueue;
       ShardData(
 	string lock_name, string ordering_lock,
-	uint64_t max_tok_per_prio, uint64_t min_cost)
-	: sdata_lock(lock_name.c_str()),
-	  sdata_op_ordering_lock(ordering_lock.c_str()),
+	uint64_t max_tok_per_prio, uint64_t min_cost, CephContext *cct)
+	: sdata_lock(lock_name.c_str(), false, true, false, cct),
+	  sdata_op_ordering_lock(ordering_lock.c_str(), false, true, false, cct),
 	  pqueue(max_tok_per_prio, min_cost) {}
     };
     
@@ -1641,7 +1653,7 @@ private:
 	ShardData* one_shard = new ShardData(
 	  lock_name, order_lock,
 	  osd->cct->_conf->osd_op_pq_max_tokens_per_priority, 
-	  osd->cct->_conf->osd_op_pq_min_cost);
+	  osd->cct->_conf->osd_op_pq_min_cost, osd->cct);
 	shard_list.push_back(one_shard);
       }
     }
diff --git a/src/osd/OpRequest.cc b/src/osd/OpRequest.cc
index 60fff4c..8805d1a 100644
--- a/src/osd/OpRequest.cc
+++ b/src/osd/OpRequest.cc
@@ -81,6 +81,7 @@ void OpRequest::_dump_op_descriptor_unlocked(ostream& stream) const
 void OpRequest::_unregistered() {
   request->clear_data();
   request->clear_payload();
+  request->release_message_throttle();
 }
 
 bool OpRequest::check_rmw(int flag) {
diff --git a/src/osd/PG.cc b/src/osd/PG.cc
index 24145f4..ffbd6ff 100644
--- a/src/osd/PG.cc
+++ b/src/osd/PG.cc
@@ -222,7 +222,8 @@ PG::PG(OSDService *o, OSDMapRef curmap,
   peer_features(CEPH_FEATURES_SUPPORTED_DEFAULT),
   acting_features(CEPH_FEATURES_SUPPORTED_DEFAULT),
   upacting_features(CEPH_FEATURES_SUPPORTED_DEFAULT),
-  do_sort_bitwise(false)
+  do_sort_bitwise(false),
+  last_epoch(0)
 {
 #ifdef PG_DEBUG_REFS
   osd->add_pgid(p, this);
@@ -292,7 +293,8 @@ void PG::proc_master_log(
   if (oinfo.last_epoch_started > info.last_epoch_started)
     info.last_epoch_started = oinfo.last_epoch_started;
   info.history.merge(oinfo.history);
-  assert(info.last_epoch_started >= info.history.last_epoch_started);
+  assert(cct->_conf->osd_find_best_info_ignore_history_les ||
+	 info.last_epoch_started >= info.history.last_epoch_started);
 
   peer_missing[from].swap(omissing);
 }
@@ -2078,26 +2080,33 @@ void PG::mark_clean()
 unsigned PG::get_recovery_priority()
 {
   // a higher value -> a higher priority
-  return OSD_RECOVERY_PRIORITY_MAX;
+
+  int pool_recovery_priority = 0;
+  pool.info.opts.get(pool_opts_t::RECOVERY_PRIORITY, &pool_recovery_priority);
+
+  unsigned ret = OSD_RECOVERY_PRIORITY_BASE + pool_recovery_priority;
+  if (ret > OSD_RECOVERY_PRIORITY_MAX)
+    ret = OSD_RECOVERY_PRIORITY_MAX;
+  return ret;
 }
 
 unsigned PG::get_backfill_priority()
 {
   // a higher value -> a higher priority
 
-  // undersized: 200 + num missing replicas
+  unsigned ret = OSD_BACKFILL_PRIORITY_BASE;
   if (is_undersized()) {
+    // undersized: OSD_BACKFILL_DEGRADED_PRIORITY_BASE + num missing replicas
     assert(pool.info.size > actingset.size());
-    return 200 + (pool.info.size - actingset.size());
-  }
+    ret = OSD_BACKFILL_DEGRADED_PRIORITY_BASE + (pool.info.size - actingset.size());
 
-  // degraded: baseline degraded
-  if (is_degraded()) {
-    return 200;
+  } else if (is_degraded()) {
+    // degraded: baseline degraded
+    ret = OSD_BACKFILL_DEGRADED_PRIORITY_BASE;
   }
+  assert (ret < OSD_RECOVERY_PRIORITY_MAX);
 
-  // baseline
-  return 1;
+  return ret;
 }
 
 void PG::finish_recovery(list<Context*>& tfin)
@@ -2441,6 +2450,19 @@ void PG::_update_calc_stats()
       pg_log.get_missing().num_missing();
     degraded += pg_log.get_missing().num_missing();
 
+    // num_objects_missing on each peer
+    for (map<pg_shard_t, pg_info_t>::iterator pi =
+        peer_info.begin();
+        pi != peer_info.end();
+        ++pi) {
+      map<pg_shard_t, pg_missing_t>::const_iterator pm =
+        peer_missing.find(pi->first);
+      if (pm != peer_missing.end()) {
+        pi->second.stats.stats.sum.num_objects_missing =
+          pm->second.num_missing();
+      }
+    }
+
     assert(!acting.empty());
     for (set<pg_shard_t>::iterator i = actingset.begin();
 	 i != actingset.end();
@@ -2703,11 +2725,13 @@ int PG::_prepare_write_info(map<string,bufferlist> *km,
 			    pg_info_t &info, coll_t coll,
 			    map<epoch_t,pg_interval_t> &past_intervals,
 			    ghobject_t &pgmeta_oid,
-			    bool dirty_big_info)
+			    bool dirty_big_info,
+			    bool dirty_epoch)
 {
   // info.  store purged_snaps separately.
   interval_set<snapid_t> purged_snaps;
-  ::encode(epoch, (*km)[epoch_key]);
+  if (dirty_epoch)
+    ::encode(epoch, (*km)[epoch_key]);
   purged_snaps.swap(info.purged_snaps);
   ::encode(info, (*km)[info_key]);
   purged_snaps.swap(info.purged_snaps);
@@ -2757,10 +2781,13 @@ void PG::prepare_write_info(map<string,bufferlist> *km)
   info.stats.stats.add(unstable_stats);
   unstable_stats.clear();
 
+  bool need_update_epoch = last_epoch < get_osdmap()->get_epoch();
   int ret = _prepare_write_info(km, get_osdmap()->get_epoch(), info, coll,
 				past_intervals, pgmeta_oid,
-				dirty_big_info);
+				dirty_big_info, need_update_epoch);
   assert(ret == 0);
+  if (need_update_epoch)
+    last_epoch = get_osdmap()->get_epoch();
   last_persisted_osdmap_ref = osdmap_ref;
 
   dirty_info = false;
@@ -2871,7 +2898,7 @@ void PG::write_if_dirty(ObjectStore::Transaction& t)
   map<string,bufferlist> km;
   if (dirty_big_info || dirty_info)
     prepare_write_info(&km);
-  pg_log.write_log(t, &km, coll, pgmeta_oid);
+  pg_log.write_log(t, &km, coll, pgmeta_oid, pool.info.require_rollback());
   if (!km.empty())
     t.omap_setkeys(coll, pgmeta_oid, km);
 }
@@ -3245,8 +3272,13 @@ bool PG::sched_scrub()
     return false;
   }
 
-  bool time_for_deep = (ceph_clock_now(cct) >=
-    info.history.last_deep_scrub_stamp + cct->_conf->osd_deep_scrub_interval);
+  double deep_scrub_interval = 0;
+  pool.info.opts.get(pool_opts_t::DEEP_SCRUB_INTERVAL, &deep_scrub_interval);
+  if (deep_scrub_interval <= 0) {
+    deep_scrub_interval = cct->_conf->osd_deep_scrub_interval;
+  }
+  bool time_for_deep = ceph_clock_now(cct) >=
+    info.history.last_deep_scrub_stamp + deep_scrub_interval;
 
   bool deep_coin_flip = false;
   // Only add random deep scrubs when NOT user initiated scrub
@@ -3334,8 +3366,13 @@ void PG::reg_next_scrub()
   }
   // note down the sched_time, so we can locate this scrub, and remove it
   // later on.
+  double scrub_min_interval = 0, scrub_max_interval = 0;
+  pool.info.opts.get(pool_opts_t::SCRUB_MIN_INTERVAL, &scrub_min_interval);
+  pool.info.opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &scrub_max_interval);
   scrubber.scrub_reg_stamp = osd->reg_pg_scrub(info.pgid,
 					       reg_stamp,
+					       scrub_min_interval,
+					       scrub_max_interval,
 					       scrubber.must_scrub);
 }
 
@@ -3762,7 +3799,6 @@ void PG::replica_scrub(
   if (last_update_applied < msg->scrub_to) {
     dout(10) << "waiting for last_update_applied to catch up" << dendl;
     scrubber.active_rep_scrub = op;
-    msg->get();
     return;
   }
 
@@ -6513,11 +6549,11 @@ boost::statechart::result PG::RecoveryState::Active::react(const ActMap&)
   if (unfound > 0 &&
       pg->all_unfound_are_queried_or_lost(pg->get_osdmap())) {
     if (pg->cct->_conf->osd_auto_mark_unfound_lost) {
-      pg->osd->clog->error() << pg->info.pgid << " has " << unfound
+      pg->osd->clog->error() << pg->info.pgid.pgid << " has " << unfound
 			    << " objects unfound and apparently lost, would automatically marking lost but NOT IMPLEMENTED\n";
       //pg->mark_all_unfound_lost(*context< RecoveryMachine >().get_cur_transaction());
     } else
-      pg->osd->clog->error() << pg->info.pgid << " has " << unfound << " objects unfound and apparently lost\n";
+      pg->osd->clog->error() << pg->info.pgid.pgid << " has " << unfound << " objects unfound and apparently lost\n";
   }
 
   if (!pg->snap_trimq.empty() &&
@@ -6637,6 +6673,12 @@ boost::statechart::result PG::RecoveryState::Active::react(const QueryState& q)
     q.f->open_object_section("scrub");
     q.f->dump_stream("scrubber.epoch_start") << pg->scrubber.epoch_start;
     q.f->dump_int("scrubber.active", pg->scrubber.active);
+    q.f->dump_string("scrubber.state", Scrubber::state_string(pg->scrubber.state));
+    q.f->dump_stream("scrubber.start") << pg->scrubber.start;
+    q.f->dump_stream("scrubber.end") << pg->scrubber.end;
+    q.f->dump_stream("scrubber.subset_last_update") << pg->scrubber.subset_last_update;
+    q.f->dump_bool("scrubber.deep", pg->scrubber.deep);
+    q.f->dump_int("scrubber.seed", pg->scrubber.seed);
     q.f->dump_int("scrubber.waiting_on", pg->scrubber.waiting_on);
     {
       q.f->open_array_section("scrubber.waiting_on_whom");
diff --git a/src/osd/PG.h b/src/osd/PG.h
index 1c2c31c..5ab8d59 100644
--- a/src/osd/PG.h
+++ b/src/osd/PG.h
@@ -31,7 +31,7 @@
 #include "include/types.h"
 #include "include/stringify.h"
 #include "osd_types.h"
-#include "include/buffer.h"
+#include "include/buffer_fwd.h"
 #include "include/xlist.h"
 #include "include/atomic.h"
 #include "SnapMapper.h"
@@ -2055,6 +2055,7 @@ public:
   uint64_t upacting_features;
 
   bool do_sort_bitwise;
+  epoch_t last_epoch;
 
  public:
   const spg_t&      get_pgid() const { return pg_id; }
@@ -2173,7 +2174,8 @@ public:
     pg_info_t &info, coll_t coll,
     map<epoch_t,pg_interval_t> &past_intervals,
     ghobject_t &pgmeta_oid,
-    bool dirty_big_info);
+    bool dirty_big_info,
+    bool dirty_epoch);
   void write_if_dirty(ObjectStore::Transaction& t);
 
   eversion_t get_next_version() const {
diff --git a/src/osd/PGBackend.cc b/src/osd/PGBackend.cc
index 7a7f6df..806a4a0 100644
--- a/src/osd/PGBackend.cc
+++ b/src/osd/PGBackend.cc
@@ -396,7 +396,7 @@ enum scrub_error_type PGBackend::be_compare_scrub_objects(
         errorstream << ", ";
       error = DEEP_ERROR;
       bool known = okseed && auth_oi.is_omap_digest() &&
-	auth.digest == auth_oi.omap_digest;
+	auth.omap_digest == auth_oi.omap_digest;
       errorstream << "omap_digest 0x" << std::hex << candidate.omap_digest
 		  << " != "
 		  << (known ? "known" : "best guess")
diff --git a/src/osd/PGLog.cc b/src/osd/PGLog.cc
index 72520a7..1fabccd 100644
--- a/src/osd/PGLog.cc
+++ b/src/osd/PGLog.cc
@@ -524,6 +524,9 @@ void PGLog::rewind_divergent_log(ObjectStore::Transaction& t, eversion_t newhead
   if (info.last_complete > newhead)
     info.last_complete = newhead;
 
+  if (log.rollback_info_trimmed_to > newhead)
+    log.rollback_info_trimmed_to = newhead;
+
   log.index();
 
   map<eversion_t, hobject_t> new_priors;
@@ -739,7 +742,8 @@ void PGLog::check() {
 void PGLog::write_log(
   ObjectStore::Transaction& t,
   map<string,bufferlist> *km,
-  const coll_t& coll, const ghobject_t &log_oid)
+  const coll_t& coll, const ghobject_t &log_oid,
+  bool require_rollback)
 {
   if (is_dirty()) {
     dout(5) << "write_log with: "
@@ -759,6 +763,7 @@ void PGLog::write_log(
       trimmed,
       dirty_divergent_priors,
       !touched_log,
+      require_rollback,
       (pg_log_debug ? &log_keys_debug : 0));
     undirty();
   } else {
@@ -771,13 +776,14 @@ void PGLog::write_log(
     map<string,bufferlist> *km,
     pg_log_t &log,
     const coll_t& coll, const ghobject_t &log_oid,
-    map<eversion_t, hobject_t> &divergent_priors)
+    map<eversion_t, hobject_t> &divergent_priors,
+    bool require_rollback)
 {
   _write_log(
     t, km, log, coll, log_oid,
     divergent_priors, eversion_t::max(), eversion_t(), eversion_t(),
     set<eversion_t>(),
-    true, true, 0);
+    true, true, require_rollback, 0);
 }
 
 void PGLog::_write_log(
@@ -792,6 +798,7 @@ void PGLog::_write_log(
   const set<eversion_t> &trimmed,
   bool dirty_divergent_priors,
   bool touch_log,
+  bool require_rollback,
   set<string> *log_keys_debug
   )
 {
@@ -856,8 +863,10 @@ void PGLog::_write_log(
     //dout(10) << "write_log: writing divergent_priors" << dendl;
     ::encode(divergent_priors, (*km)["divergent_priors"]);
   }
-  ::encode(log.can_rollback_to, (*km)["can_rollback_to"]);
-  ::encode(log.rollback_info_trimmed_to, (*km)["rollback_info_trimmed_to"]);
+  if (require_rollback) {
+    ::encode(log.can_rollback_to, (*km)["can_rollback_to"]);
+    ::encode(log.rollback_info_trimmed_to, (*km)["rollback_info_trimmed_to"]);
+  }
 
   if (!to_remove.empty())
     t.omap_rmkeys(coll, log_oid, to_remove);
diff --git a/src/osd/PGLog.h b/src/osd/PGLog.h
index fd82a70..a493488 100644
--- a/src/osd/PGLog.h
+++ b/src/osd/PGLog.h
@@ -647,14 +647,16 @@ public:
   void write_log(ObjectStore::Transaction& t,
 		 map<string,bufferlist> *km,
 		 const coll_t& coll,
-		 const ghobject_t &log_oid);
+		 const ghobject_t &log_oid,
+		 bool require_rollback);
 
   static void write_log(
     ObjectStore::Transaction& t,
     map<string,bufferlist>* km,
     pg_log_t &log,
     const coll_t& coll,
-    const ghobject_t &log_oid, map<eversion_t, hobject_t> &divergent_priors);
+    const ghobject_t &log_oid, map<eversion_t, hobject_t> &divergent_priors,
+    bool require_rollback);
 
   static void _write_log(
     ObjectStore::Transaction& t,
@@ -668,6 +670,7 @@ public:
     const set<eversion_t> &trimmed,
     bool dirty_divergent_priors,
     bool touch_log,
+    bool require_rollback,
     set<string> *log_keys_debug
     );
 
diff --git a/src/osd/ReplicatedBackend.cc b/src/osd/ReplicatedBackend.cc
index 099fc64..6d33c01 100644
--- a/src/osd/ReplicatedBackend.cc
+++ b/src/osd/ReplicatedBackend.cc
@@ -674,6 +674,7 @@ template<typename T, int MSGTYPE>
 void ReplicatedBackend::sub_op_modify_reply(OpRequestRef op)
 {
   T *r = static_cast<T *>(op->get_req());
+  r->finish_decode();
   assert(r->get_header().type == MSGTYPE);
   assert(MSGTYPE == MSG_OSD_SUBOPREPLY || MSGTYPE == MSG_OSD_REPOPREPLY);
 
@@ -817,8 +818,9 @@ void ReplicatedBackend::be_deep_scrub(
     }
     ++keys_scanned;
 
-    dout(25) << "CRC key " << iter->key() << " value "
-	     << string(iter->value().c_str(), iter->value().length()) << dendl;
+    dout(25) << "CRC key " << iter->key() << " value:\n";
+    iter->value().hexdump(*_dout);
+    *_dout << dendl;
 
     ::encode(iter->key(), bl);
     ::encode(iter->value(), bl);
@@ -1127,6 +1129,7 @@ template<typename T, int MSGTYPE>
 void ReplicatedBackend::sub_op_modify_impl(OpRequestRef op)
 {
   T *m = static_cast<T *>(op->get_req());
+  m->finish_decode();
   int msg_type = m->get_type();
   assert(MSGTYPE == msg_type);
   assert(msg_type == MSG_OSD_SUBOP || msg_type == MSG_OSD_REPOP);
@@ -1201,8 +1204,6 @@ void ReplicatedBackend::sub_op_modify_impl(OpRequestRef op)
 
   rm->bytes_written = rm->opt.get_encoded_bytes();
 
-  op->mark_started();
-
   rm->opt.register_on_commit(
     parent->bless_context(
       new C_OSD_RepModifyCommit(this, rm)));
diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc
index 64c77a4..ba7cfa9 100644
--- a/src/osd/ReplicatedPG.cc
+++ b/src/osd/ReplicatedPG.cc
@@ -2015,6 +2015,16 @@ void ReplicatedPG::do_op(OpRequestRef& op)
   ctx->src_obc.swap(src_obc);
 
   execute_ctx(ctx);
+  utime_t prepare_latency = ceph_clock_now(cct);
+  prepare_latency -= op->get_dequeued_time();
+  osd->logger->tinc(l_osd_op_prepare_lat, prepare_latency);
+  if (op->may_read() && op->may_write()) {
+    osd->logger->tinc(l_osd_op_rw_prepare_lat, prepare_latency);
+  } else if (op->may_read()) {
+    osd->logger->tinc(l_osd_op_r_prepare_lat, prepare_latency);
+  } else if (op->may_write() || op->may_cache()) {
+    osd->logger->tinc(l_osd_op_w_prepare_lat, prepare_latency);
+  }
 }
 
 ReplicatedPG::cache_result_t ReplicatedPG::maybe_handle_cache_detail(
@@ -3117,7 +3127,7 @@ void ReplicatedPG::do_backfill(OpRequestRef op)
 	get_osdmap()->get_epoch(),
 	m->query_epoch,
 	spg_t(info.pgid.pgid, primary.shard));
-      reply->set_priority(cct->_conf->osd_recovery_op_priority);
+      reply->set_priority(get_recovery_op_priority());
       osd->send_message_osd_cluster(reply, m->get_connection());
       queue_peering_event(
 	CephPeeringEvtRef(
@@ -10189,7 +10199,7 @@ int ReplicatedPG::recover_primary(int max, ThreadPool::TPHandle &handle)
 	++skipped;
       } else {
 	int r = recover_missing(
-	  soid, need, cct->_conf->osd_recovery_op_priority, h);
+	  soid, need, get_recovery_op_priority(), h);
 	switch (r) {
 	case PULL_YES:
 	  ++started;
@@ -10212,7 +10222,7 @@ int ReplicatedPG::recover_primary(int max, ThreadPool::TPHandle &handle)
       pg_log.set_last_requested(v);
   }
  
-  pgbackend->run_recovery_op(h, cct->_conf->osd_recovery_op_priority);
+  pgbackend->run_recovery_op(h, get_recovery_op_priority());
   return started;
 }
 
@@ -10354,7 +10364,7 @@ int ReplicatedPG::recover_replicas(int max, ThreadPool::TPHandle &handle)
     }
   }
 
-  pgbackend->run_recovery_op(h, cct->_conf->osd_recovery_op_priority);
+  pgbackend->run_recovery_op(h, get_recovery_op_priority());
   return started;
 }
 
@@ -10699,7 +10709,7 @@ int ReplicatedPG::recover_backfill(
     prep_backfill_object_push(to_push[i].get<0>(), to_push[i].get<1>(),
 	    to_push[i].get<2>(), to_push[i].get<3>(), h);
   }
-  pgbackend->run_recovery_op(h, cct->_conf->osd_recovery_op_priority);
+  pgbackend->run_recovery_op(h, get_recovery_op_priority());
 
   dout(5) << "backfill_pos is " << backfill_pos << dendl;
   for (set<hobject_t, hobject_t::Comparator>::iterator i = backfills_in_flight.begin();
diff --git a/src/osd/ReplicatedPG.h b/src/osd/ReplicatedPG.h
index e31c9fd..9f328e5 100644
--- a/src/osd/ReplicatedPG.h
+++ b/src/osd/ReplicatedPG.h
@@ -1491,6 +1491,11 @@ private:
   hobject_t generate_temp_object();  ///< generate a new temp object name
   /// generate a new temp object name (for recovery)
   hobject_t get_temp_recovery_object(eversion_t version, snapid_t snap);
+  int get_recovery_op_priority() const {  // per-pool override, else the osd_recovery_op_priority config value
+    int pool_pri = 0;  // left at 0 when the pool option is not set
+    pool.info.opts.get(pool_opts_t::RECOVERY_OP_PRIORITY, &pool_pri);
+    return (pool_pri <= 0) ? cct->_conf->osd_recovery_op_priority : pool_pri;
+  }
   void log_missing(unsigned missing,
 			const boost::optional<hobject_t> &head,
 			LogChannelRef clog,
diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc
index 3cf41bd..de80857 100644
--- a/src/osd/osd_types.cc
+++ b/src/osd/osd_types.cc
@@ -15,6 +15,8 @@
  *
  */
 
+#include <boost/assign/list_of.hpp>
+
 #include "osd_types.h"
 #include "include/ceph_features.h"
 extern "C" {
@@ -889,6 +891,166 @@ void pool_snap_info_t::generate_test_instances(list<pool_snap_info_t*>& o)
   o.back()->name = "foo";
 }
 
+// -- pool_opts_t --
+
+typedef std::map<std::string, pool_opts_t::opt_desc_t> opt_mapping_t;  // option name -> (key, value type)
+static opt_mapping_t opt_mapping = boost::assign::map_list_of  // every pool option name recognised by this file
+	   ("scrub_min_interval", pool_opts_t::opt_desc_t(
+	     pool_opts_t::SCRUB_MIN_INTERVAL, pool_opts_t::DOUBLE))
+	   ("scrub_max_interval", pool_opts_t::opt_desc_t(
+	     pool_opts_t::SCRUB_MAX_INTERVAL, pool_opts_t::DOUBLE))
+	   ("deep_scrub_interval", pool_opts_t::opt_desc_t(
+	     pool_opts_t::DEEP_SCRUB_INTERVAL, pool_opts_t::DOUBLE))
+           ("recovery_priority", pool_opts_t::opt_desc_t(
+             pool_opts_t::RECOVERY_PRIORITY, pool_opts_t::INT))
+           ("recovery_op_priority", pool_opts_t::opt_desc_t(
+             pool_opts_t::RECOVERY_OP_PRIORITY, pool_opts_t::INT));
+
+bool pool_opts_t::is_opt_name(const std::string& name) {  // is `name` present in opt_mapping?
+  return opt_mapping.count(name) != 0;
+}
+
+pool_opts_t::opt_desc_t pool_opts_t::get_opt_desc(const std::string& name) {
+  opt_mapping_t::iterator pos = opt_mapping.find(name);  // name must be a known option (see is_opt_name)
+  assert(pos != opt_mapping.end());
+  return pos->second;
+}
+
+bool pool_opts_t::is_set(pool_opts_t::key_t key) const {  // has a value been stored for `key`?
+  return opts.count(key) != 0;
+}
+
+const pool_opts_t::value_t& pool_opts_t::get(pool_opts_t::key_t key) const {
+  const opts_t::const_iterator entry = opts.find(key);  // `key` must already be set
+  assert(opts.end() != entry);
+  return entry->second;
+}
+
+bool pool_opts_t::unset(pool_opts_t::key_t key) {  // true iff an entry for `key` existed and was erased
+  return 0 != opts.erase(key);
+}
+
+class pool_opts_dumper_t : public boost::static_visitor<>  // writes one option value to a Formatter
+{
+public:
+  pool_opts_dumper_t(const std::string& name_, Formatter* f_) :
+    name(name_.c_str()), f(f_) {}  // stores a pointer into name_, so name_ must outlive this visitor
+
+  void operator()(const std::string& s) const {  // by const reference: no string copy per visit
+    f->dump_string(name, s);
+  }
+  void operator()(int i) const {
+    f->dump_int(name, i);
+  }
+  void operator()(double d) const {
+    f->dump_float(name, d);
+  }
+
+private:
+  const char* name;  // field name passed to every dump_* call
+  Formatter* f;      // destination formatter; never deleted by this class
+};
+
+void pool_opts_t::dump(const std::string& name, Formatter* f) const
+{
+  // Emit the single option called `name`; nothing is written when it is unset.
+  const key_t wanted = get_opt_desc(name).key;
+  const opts_t::const_iterator hit = opts.find(wanted);
+  if (hit != opts.end()) {
+    boost::apply_visitor(pool_opts_dumper_t(name, f), hit->second);
+  }
+}
+
+void pool_opts_t::dump(Formatter* f) const
+{
+  // Walk the name table (not `opts`) so each set option is dumped under
+  // its textual name, in the table's name-sorted order.
+  opt_mapping_t::iterator known = opt_mapping.begin();
+  for (; known != opt_mapping.end(); ++known) {
+    const opts_t::const_iterator value = opts.find(known->second.key);
+    if (value != opts.end()) {
+      boost::apply_visitor(pool_opts_dumper_t(known->first, f),
+                           value->second);
+    }
+  }
+}
+
+class pool_opts_encoder_t : public boost::static_visitor<>  // appends <int32 type tag><value> to a bufferlist
+{
+public:
+  explicit pool_opts_encoder_t(bufferlist& bl_) : bl(bl_) {}  // explicit: no implicit bufferlist conversion
+
+  void operator()(const std::string& s) const {  // by const reference: no string copy per visit
+    ::encode(static_cast<int32_t>(pool_opts_t::STR), bl);
+    ::encode(s, bl);
+  }
+  void operator()(int i) const {
+    ::encode(static_cast<int32_t>(pool_opts_t::INT), bl);
+    ::encode(i, bl);
+  }
+  void operator()(double d) const {
+    ::encode(static_cast<int32_t>(pool_opts_t::DOUBLE), bl);
+    ::encode(d, bl);
+  }
+
+private:
+  bufferlist& bl;  // output buffer; referenced, not owned
+};
+
+void pool_opts_t::encode(bufferlist& bl) const {
+  ENCODE_START(1, 1, bl);
+  const uint32_t count = static_cast<uint32_t>(opts.size());
+  ::encode(count, bl);  // layout: entry count, then <key><type tag><value> per option
+  for (opts_t::const_iterator opt = opts.begin(); opts.end() != opt; ++opt) {
+    ::encode(static_cast<int32_t>(opt->first), bl);
+    boost::apply_visitor(pool_opts_encoder_t(bl), opt->second);
+  }
+  ENCODE_FINISH(bl);
+}
+
+void pool_opts_t::decode(bufferlist::iterator& bl) {
+  DECODE_START(1, bl);
+  __u32 remaining;
+  ::decode(remaining, bl);  // number of <key><type tag><value> entries that follow
+  opts.clear();
+  for (; remaining != 0; --remaining) {
+    int32_t raw_key, type_tag;
+    ::decode(raw_key, bl);
+    ::decode(type_tag, bl);
+    if (STR == type_tag) {
+      std::string text;
+      ::decode(text, bl);
+      opts[static_cast<key_t>(raw_key)] = text;
+    } else if (INT == type_tag) {
+      int number;
+      ::decode(number, bl);
+      opts[static_cast<key_t>(raw_key)] = number;
+    } else if (DOUBLE == type_tag) {
+      double real;
+      ::decode(real, bl);
+      opts[static_cast<key_t>(raw_key)] = real;
+    } else {
+      assert(!"invalid type");  // unrecognised type tag
+    }
+  }
+  DECODE_FINISH(bl);
+}
+
+ostream& operator<<(ostream& out, const pool_opts_t& opts)
+{
+  // Append " <name> <value>" for every option that is set, walking the
+  // name table so the output follows its name-sorted order.
+  typedef pool_opts_t::opts_t::const_iterator value_iter;
+  opt_mapping_t::iterator known = opt_mapping.begin();
+  for (; known != opt_mapping.end(); ++known) {
+    const value_iter value = opts.opts.find(known->second.key);
+    if (value != opts.opts.end()) {
+      out << " " << known->first << " " << value->second;
+    }
+  }
+  return out;
+}
+
 // -- pg_pool_t --
 
 void pg_pool_t::dump(Formatter *f) const
@@ -955,6 +1117,9 @@ void pg_pool_t::dump(Formatter *f) const
   f->dump_unsigned("stripe_width", get_stripe_width());
   f->dump_unsigned("expected_num_objects", expected_num_objects);
   f->dump_bool("fast_read", fast_read);
+  f->open_object_section("options");
+  opts.dump(f);
+  f->close_section(); // options
 }
 
 void pg_pool_t::convert_to_pg_shards(const vector<int> &from, set<pg_shard_t>* to) const {
@@ -1264,7 +1429,7 @@ void pg_pool_t::encode(bufferlist& bl, uint64_t features) const
     return;
   }
 
-  ENCODE_START(23, 5, bl);
+  ENCODE_START(24, 5, bl);
   ::encode(type, bl);
   ::encode(size, bl);
   ::encode(crush_ruleset, bl);
@@ -1312,12 +1477,13 @@ void pg_pool_t::encode(bufferlist& bl, uint64_t features) const
   ::encode(fast_read, bl);
   ::encode(hit_set_grade_decay_rate, bl);
   ::encode(hit_set_search_last_n, bl);
+  ::encode(opts, bl);
   ENCODE_FINISH(bl);
 }
 
 void pg_pool_t::decode(bufferlist::iterator& bl)
 {
-  DECODE_START_LEGACY_COMPAT_LEN(23, 5, 5, bl);
+  DECODE_START_LEGACY_COMPAT_LEN(24, 5, 5, bl);
   ::decode(type, bl);
   ::decode(size, bl);
   ::decode(crush_ruleset, bl);
@@ -1456,6 +1622,9 @@ void pg_pool_t::decode(bufferlist::iterator& bl)
     hit_set_grade_decay_rate = 0;
     hit_set_search_last_n = 1;
   }
+  if (struct_v >= 24) {
+    ::decode(opts, bl);
+  }
   DECODE_FINISH(bl);
   calc_pg_masks();
   calc_grade_table();
@@ -1573,6 +1742,7 @@ ostream& operator<<(ostream& out, const pg_pool_t& p)
     out << " expected_num_objects " << p.expected_num_objects;
   if (p.fast_read)
     out << " fast_read " << p.fast_read;
+  out << p.opts;
   return out;
 }
 
@@ -1586,6 +1756,7 @@ void object_stat_sum_t::dump(Formatter *f) const
   f->dump_int("num_object_clones", num_object_clones);
   f->dump_int("num_object_copies", num_object_copies);
   f->dump_int("num_objects_missing_on_primary", num_objects_missing_on_primary);
+  f->dump_int("num_objects_missing", num_objects_missing);
   f->dump_int("num_objects_degraded", num_objects_degraded);
   f->dump_int("num_objects_misplaced", num_objects_misplaced);
   f->dump_int("num_objects_unfound", num_objects_unfound);
@@ -1618,7 +1789,7 @@ void object_stat_sum_t::dump(Formatter *f) const
 
 void object_stat_sum_t::encode(bufferlist& bl) const
 {
-  ENCODE_START(14, 3, bl);
+  ENCODE_START(15, 3, bl);
   ::encode(num_bytes, bl);
   ::encode(num_objects, bl);
   ::encode(num_object_clones, bl);
@@ -1652,6 +1823,7 @@ void object_stat_sum_t::encode(bufferlist& bl) const
   ::encode(num_evict_mode_some, bl);
   ::encode(num_evict_mode_full, bl);
   ::encode(num_objects_pinned, bl);
+  ::encode(num_objects_missing, bl);
   ENCODE_FINISH(bl);
 }
 
@@ -1750,6 +1922,11 @@ void object_stat_sum_t::decode(bufferlist::iterator& bl)
   } else {
     num_objects_pinned = 0;
   }
+  if (struct_v >= 15) {
+    ::decode(num_objects_missing, bl);
+  } else {
+    num_objects_missing = 0;
+  }
   DECODE_FINISH(bl);
 }
 
@@ -1762,6 +1939,7 @@ void object_stat_sum_t::generate_test_instances(list<object_stat_sum_t*>& o)
   a.num_object_clones = 4;
   a.num_object_copies = 5;
   a.num_objects_missing_on_primary = 6;
+  a.num_objects_missing = 123;
   a.num_objects_degraded = 7;
   a.num_objects_unfound = 8;
   a.num_rd = 9; a.num_rd_kb = 10;
@@ -1797,6 +1975,7 @@ void object_stat_sum_t::add(const object_stat_sum_t& o)
   num_object_clones += o.num_object_clones;
   num_object_copies += o.num_object_copies;
   num_objects_missing_on_primary += o.num_objects_missing_on_primary;
+  num_objects_missing += o.num_objects_missing;
   num_objects_degraded += o.num_objects_degraded;
   num_objects_misplaced += o.num_objects_misplaced;
   num_rd += o.num_rd;
@@ -1834,6 +2013,7 @@ void object_stat_sum_t::sub(const object_stat_sum_t& o)
   num_object_clones -= o.num_object_clones;
   num_object_copies -= o.num_object_copies;
   num_objects_missing_on_primary -= o.num_objects_missing_on_primary;
+  num_objects_missing -= o.num_objects_missing;
   num_objects_degraded -= o.num_objects_degraded;
   num_objects_misplaced -= o.num_objects_misplaced;
   num_rd -= o.num_rd;
@@ -1872,6 +2052,7 @@ bool operator==(const object_stat_sum_t& l, const object_stat_sum_t& r)
     l.num_object_clones == r.num_object_clones &&
     l.num_object_copies == r.num_object_copies &&
     l.num_objects_missing_on_primary == r.num_objects_missing_on_primary &&
+    l.num_objects_missing == r.num_objects_missing &&
     l.num_objects_degraded == r.num_objects_degraded &&
     l.num_objects_misplaced == r.num_objects_misplaced &&
     l.num_objects_unfound == r.num_objects_unfound &&
@@ -2072,6 +2253,7 @@ void pg_stat_t::encode(bufferlist &bl) const
 
 void pg_stat_t::decode(bufferlist::iterator &bl)
 {
+  bool tmp;
   DECODE_START_LEGACY_COMPAT_LEN(22, 8, 8, bl);
   ::decode(version, bl);
   ::decode(reported_seq, bl);
@@ -2139,8 +2321,9 @@ void pg_stat_t::decode(bufferlist::iterator &bl)
   }
   if (struct_v < 11) {
     stats_invalid = false;
-  } else {
-    ::decode(stats_invalid, bl);
+  } else {    
+    ::decode(tmp, bl);
+    stats_invalid = tmp;
   }
   if (struct_v >= 12) {
     ::decode(last_clean_scrub_stamp, bl);
@@ -2153,7 +2336,8 @@ void pg_stat_t::decode(bufferlist::iterator &bl)
     last_became_active = last_active;
   }
   if (struct_v >= 14) {
-    ::decode(dirty_stats_invalid, bl);
+    ::decode(tmp, bl);
+    dirty_stats_invalid = tmp;
   } else {
     // if we are decoding an old encoding of this object, then the
     // encoder may not have supported num_objects_dirty accounting.
@@ -2167,14 +2351,16 @@ void pg_stat_t::decode(bufferlist::iterator &bl)
     acting_primary = acting.size() ? acting[0] : -1;
   }
   if (struct_v >= 16) {
-    ::decode(omap_stats_invalid, bl);
+    ::decode(tmp, bl);
+    omap_stats_invalid = tmp;
   } else {
     // if we are decoding an old encoding of this object, then the
     // encoder may not have supported num_objects_omap accounting.
     omap_stats_invalid = true;
   }
   if (struct_v >= 17) {
-    ::decode(hitset_stats_invalid, bl);
+    ::decode(tmp, bl);
+    hitset_stats_invalid = tmp;
   } else {
     // if we are decoding an old encoding of this object, then the
     // encoder may not have supported num_objects_hit_set_archive accounting.
@@ -2193,7 +2379,8 @@ void pg_stat_t::decode(bufferlist::iterator &bl)
     last_fullsized = utime_t();
   }
   if (struct_v >= 20) {
-    ::decode(hitset_bytes_stats_invalid, bl);
+    ::decode(tmp, bl);
+    hitset_bytes_stats_invalid = tmp;
   } else {
     // if we are decoding an old encoding of this object, then the
     // encoder may not have supported num_bytes_hit_set_archive accounting.
@@ -2207,7 +2394,8 @@ void pg_stat_t::decode(bufferlist::iterator &bl)
     last_became_peered = last_became_active;
   }
   if (struct_v >= 22) {
-    ::decode(pin_stats_invalid, bl);
+    ::decode(tmp, bl);
+    pin_stats_invalid = tmp;
   } else {
     // if we are decoding an old encoding of this object, then the
     // encoder may not have supported num_objects_pinned accounting.
@@ -5107,11 +5295,14 @@ void ScrubMap::object::decode(bufferlist::iterator& bl)
 {
   DECODE_START_LEGACY_COMPAT_LEN(5, 2, 2, bl);
   ::decode(size, bl);
-  ::decode(negative, bl);
+  bool tmp;
+  ::decode(tmp, bl);
+  negative = tmp;
   ::decode(attrs, bl);
   if (struct_v >= 3) {
     ::decode(digest, bl);
-    ::decode(digest_present, bl);
+    ::decode(tmp, bl);
+    digest_present = tmp;
   }
   if (struct_v >= 4) {
     ::decode(nlinks, bl);
@@ -5123,10 +5314,12 @@ void ScrubMap::object::decode(bufferlist::iterator& bl)
   }
   if (struct_v >= 5) {
     ::decode(omap_digest, bl);
-    ::decode(omap_digest_present, bl);
+    ::decode(tmp, bl);
+    omap_digest_present = tmp;
   }
   if (struct_v >= 6) {
-    ::decode(read_error, bl);
+    ::decode(tmp, bl);
+    read_error = tmp;
   }
   DECODE_FINISH(bl);
 }
diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h
index 0f127f0..cb71218 100644
--- a/src/osd/osd_types.h
+++ b/src/osd/osd_types.h
@@ -23,6 +23,7 @@
 #include <memory>
 #include <boost/scoped_ptr.hpp>
 #include <boost/optional/optional_io.hpp>
+#include <boost/variant.hpp>
 
 #include "include/rados/rados_types.hpp"
 
@@ -62,6 +63,14 @@
 /// max recovery priority for MBackfillReserve
 #define OSD_RECOVERY_PRIORITY_MAX 255u
 
+/// base recovery priority for MBackfillReserve
+#define OSD_RECOVERY_PRIORITY_BASE 230u
+
+/// base backfill priority for MBackfillReserve (degraded PG)
+#define OSD_BACKFILL_DEGRADED_PRIORITY_BASE 200u
+
+/// base backfill priority for MBackfillReserve
+#define OSD_BACKFILL_PRIORITY_BASE 1u
 
 typedef hobject_t collection_list_handle_t;
 
@@ -890,6 +899,82 @@ inline ostream& operator<<(ostream& out, const pool_snap_info_t& si) {
 
 
 /*
+ * pool_opts_t
+ *
+ * pool options.
+ */
+
+class pool_opts_t {  // sparse per-pool option store: key_t -> typed value
+public:
+  enum key_t {  // option identifier; encode() writes its numeric value, so reordering changes the encoding
+    SCRUB_MIN_INTERVAL,
+    SCRUB_MAX_INTERVAL,
+    DEEP_SCRUB_INTERVAL,
+    RECOVERY_PRIORITY,
+    RECOVERY_OP_PRIORITY
+  };
+
+  enum type_t {  // type tag written ahead of each encoded value
+    STR,
+    INT,
+    DOUBLE,
+  };
+
+  struct opt_desc_t {  // describes one named option: its key and the type its value has
+    key_t key;
+    type_t type;
+
+    opt_desc_t(key_t k, type_t t) : key(k), type(t) {}
+
+    bool operator==(const opt_desc_t& rhs) const {
+      return key == rhs.key && type == rhs.type;
+    }
+  };
+
+  typedef boost::variant<std::string,int,double> value_t;  // alternatives correspond to STR, INT, DOUBLE
+
+  static bool is_opt_name(const std::string& name);  // true when name is a known option name
+  static opt_desc_t get_opt_desc(const std::string& name);  // asserts that name is known
+
+  pool_opts_t() : opts() {}
+
+  bool is_set(key_t key) const;  // true when a value is stored for key
+
+  template<typename T>
+  void set(key_t key, const T &val) {  // store val for key, replacing any earlier value
+    value_t value = val;
+    opts[key] = value;
+  }
+
+  template<typename T>
+  bool get(key_t key, T *val) const {  // copy key's value into *val; false (and *val untouched) if unset
+    opts_t::const_iterator i = opts.find(key);
+    if (i == opts.end()) {
+      return false;
+    }
+    *val = boost::get<T>(i->second);  // reference form of boost::get throws boost::bad_get if the stored type is not T
+    return true;
+  }
+
+  const value_t& get(key_t key) const;  // asserts that key is set
+
+  bool unset(key_t key);  // remove key; true when something was removed
+
+  void dump(const std::string& name, Formatter *f) const;  // dump the one named option, if set
+
+  void dump(Formatter *f) const;  // dump every option that is set
+  void encode(bufferlist &bl) const;
+  void decode(bufferlist::iterator &bl);  // replaces the current contents with the decoded options
+
+private:
+  typedef std::map<key_t, value_t> opts_t;
+  opts_t opts;  // holds only the options that have been set
+
+  friend ostream& operator<<(ostream& out, const pool_opts_t& opts);
+};
+WRITE_CLASS_ENCODER(pool_opts_t)
+
+/*
  * pg_pool
  */
 struct pg_pool_t {
@@ -1136,6 +1221,8 @@ public:
                                  ///< user does not specify any expected value
   bool fast_read;            ///< whether turn on fast read on the pool or not
 
+  pool_opts_t opts; ///< options
+
 private:
   vector<uint32_t> grade_table;
 
@@ -1183,7 +1270,8 @@ public:
       hit_set_search_last_n(0),
       stripe_width(0),
       expected_num_objects(0),
-      fast_read(false)
+      fast_read(false),
+      opts()
   { }
 
   void dump(Formatter *f) const;
@@ -1358,6 +1446,7 @@ struct object_stat_sum_t {
   int64_t num_object_clones;
   int64_t num_object_copies;  // num_objects * num_replicas
   int64_t num_objects_missing_on_primary;
+  int64_t num_objects_missing;
   int64_t num_objects_degraded;
   int64_t num_objects_misplaced;
   int64_t num_objects_unfound;
@@ -1390,7 +1479,8 @@ struct object_stat_sum_t {
   object_stat_sum_t()
     : num_bytes(0),
       num_objects(0), num_object_clones(0), num_object_copies(0),
-      num_objects_missing_on_primary(0), num_objects_degraded(0),
+      num_objects_missing_on_primary(0), num_objects_missing(0),
+      num_objects_degraded(0),
       num_objects_misplaced(0),
       num_objects_unfound(0),
       num_rd(0), num_rd_kb(0), num_wr(0), num_wr_kb(0),
@@ -1421,6 +1511,7 @@ struct object_stat_sum_t {
     FLOOR(num_object_clones);
     FLOOR(num_object_copies);
     FLOOR(num_objects_missing_on_primary);
+    FLOOR(num_objects_missing);
     FLOOR(num_objects_degraded);
     FLOOR(num_objects_misplaced);
     FLOOR(num_objects_unfound);
@@ -1466,6 +1557,7 @@ struct object_stat_sum_t {
     SPLIT(num_object_clones);
     SPLIT(num_object_copies);
     SPLIT(num_objects_missing_on_primary);
+    SPLIT(num_objects_missing);
     SPLIT(num_objects_degraded);
     SPLIT(num_objects_misplaced);
     SPLIT(num_objects_unfound);
@@ -1608,7 +1700,6 @@ struct pg_stat_t {
   utime_t last_clean_scrub_stamp;
 
   object_stat_collection_t stats;
-  bool stats_invalid;
 
   int64_t log_size;
   int64_t ondisk_log_size;    // >= active_log_size
@@ -1621,34 +1712,35 @@ struct pg_stat_t {
   utime_t last_became_active;
   utime_t last_became_peered;
 
-  /// true if num_objects_dirty is not accurate (because it was not
-  /// maintained starting from pool creation)
-  bool dirty_stats_invalid;
-  bool omap_stats_invalid;
-  bool hitset_stats_invalid;
-  bool hitset_bytes_stats_invalid;
-  bool pin_stats_invalid;
-
   /// up, acting primaries
   int32_t up_primary;
   int32_t acting_primary;
 
+  bool stats_invalid:1;
+  /// true if num_objects_dirty is not accurate (because it was not
+  /// maintained starting from pool creation)
+  bool dirty_stats_invalid:1;
+  bool omap_stats_invalid:1;
+  bool hitset_stats_invalid:1;
+  bool hitset_bytes_stats_invalid:1;
+  bool pin_stats_invalid:1;
+
   pg_stat_t()
     : reported_seq(0),
       reported_epoch(0),
       state(0),
       created(0), last_epoch_clean(0),
       parent_split_bits(0),
-      stats_invalid(false),
       log_size(0), ondisk_log_size(0),
       mapping_epoch(0),
+      up_primary(-1),
+      acting_primary(-1),
+      stats_invalid(false),
       dirty_stats_invalid(false),
       omap_stats_invalid(false),
       hitset_stats_invalid(false),
       hitset_bytes_stats_invalid(false),
-      pin_stats_invalid(false),
-      up_primary(-1),
-      acting_primary(-1)
+      pin_stats_invalid(false)
   { }
 
   epoch_t get_effective_last_epoch_clean() const {
@@ -2341,34 +2433,30 @@ struct pg_log_entry_t {
     return get_op_name(op);
   }
 
-  __s32      op;
+  // describes state for a locally-rollbackable entry
+  ObjectModDesc mod_desc;
+  bufferlist snaps;   // only for clone entries
   hobject_t  soid;
+  osd_reqid_t reqid;  // caller+tid to uniquely identify request
+  vector<pair<osd_reqid_t, version_t> > extra_reqids;
   eversion_t version, prior_version, reverting_to;
   version_t user_version; // the user version for this entry
-  osd_reqid_t reqid;  // caller+tid to uniquely identify request
   utime_t     mtime;  // this is the _user_ mtime, mind you
-  bufferlist snaps;   // only for clone entries
+
+  __s32      op;
   bool invalid_hash; // only when decoding sobject_t based entries
   bool invalid_pool; // only when decoding pool-less hobject based entries
 
-  uint64_t offset;   // [soft state] my offset on disk
-
-  /// describes state for a locally-rollbackable entry
-  ObjectModDesc mod_desc;
-
-  vector<pair<osd_reqid_t, version_t> > extra_reqids;
-
   pg_log_entry_t()
-    : op(0), user_version(0),
-      invalid_hash(false), invalid_pool(false), offset(0) {}
-  pg_log_entry_t(int _op, const hobject_t& _soid, 
-		 const eversion_t& v, const eversion_t& pv,
-		 version_t uv,
-		 const osd_reqid_t& rid, const utime_t& mt)
-    : op(_op), soid(_soid), version(v),
-      prior_version(pv), user_version(uv),
-      reqid(rid), mtime(mt), invalid_hash(false), invalid_pool(false),
-      offset(0) {}
+   : user_version(0), op(0),
+     invalid_hash(false), invalid_pool(false) {}
+  pg_log_entry_t(int _op, const hobject_t& _soid,
+                const eversion_t& v, const eversion_t& pv,
+                version_t uv,
+                const osd_reqid_t& rid, const utime_t& mt)
+   : soid(_soid), reqid(rid), version(v), prior_version(pv), user_version(uv),
+     mtime(mt), op(_op), invalid_hash(false), invalid_pool(false)
+     {}
       
   bool is_clone() const { return op == CLONE; }
   bool is_modify() const { return op == MODIFY; }
@@ -2721,15 +2809,15 @@ WRITE_CLASS_ENCODER(pg_ls_response_t)
  * object_copy_cursor_t
  */
 struct object_copy_cursor_t {
-  bool attr_complete;
   uint64_t data_offset;
-  bool data_complete;
   string omap_offset;
+  bool attr_complete;
+  bool data_complete;
   bool omap_complete;
 
   object_copy_cursor_t()
-    : attr_complete(false),
-      data_offset(0),
+    : data_offset(0),
+      attr_complete(false),
       data_complete(false),
       omap_complete(false)
   {}
@@ -3187,19 +3275,17 @@ struct ObjectState {
     : oi(oi_), exists(exists_) {}
 };
 
-
 struct SnapSetContext {
   hobject_t oid;
-  int ref;
-  bool registered;
   SnapSet snapset;
-  bool exists;
+  int ref;
+  bool registered : 1;
+  bool exists : 1;
 
   SnapSetContext(const hobject_t& o) :
     oid(o), ref(0), registered(false), exists(true) { }
 };
 
-
 /*
   * keep tabs on object modifications that are in flight.
   * we need to know the projected existence, size, snapset,
@@ -3224,17 +3310,32 @@ public:
   Cond cond;
   int unstable_writes, readers, writers_waiting, readers_waiting;
 
-  /// in-progress copyfrom ops for this object
-  bool blocked;
 
   // set if writes for this object are blocked on another objects recovery
   ObjectContextRef blocked_by;      // object blocking our writes
   set<ObjectContextRef> blocking;   // objects whose writes we block
-  bool requeue_scrub_on_unblock;    // true if we need to requeue scrub on unblock
 
   // any entity in obs.oi.watchers MUST be in either watchers or unconnected_watchers.
   map<pair<uint64_t, entity_name_t>, WatchRef> watchers;
 
+  // attr cache
+  map<string, bufferlist> attr_cache;
+
+  void fill_in_setattrs(const set<string> &changing, ObjectModDesc *mod) {
+    map<string, boost::optional<bufferlist> > to_set;
+    for (set<string>::const_iterator i = changing.begin();
+	 i != changing.end();
+	 ++i) {
+      map<string, bufferlist>::iterator iter = attr_cache.find(*i);
+      if (iter != attr_cache.end()) {
+	to_set[*i] = iter->second;
+      } else {
+	to_set[*i];
+      }
+    }
+    mod->setattrs(to_set);
+  }
+  
   struct RWState {
     enum State {
       RWNONE,
@@ -3255,19 +3356,18 @@ public:
       return get_state_name(state);
     }
 
-    State state;                 ///< rw state
-    uint64_t count;              ///< number of readers or writers
     list<OpRequestRef> waiters;  ///< ops waiting on state change
+    int count;              ///< number of readers or writers
 
+    State state:4;               ///< rw state
     /// if set, restart backfill when we can get a read lock
-    bool recovery_read_marker;
-
+    bool recovery_read_marker:1;
     /// if set, requeue snaptrim on lock release
-    bool snaptrimmer_write_marker;
+    bool snaptrimmer_write_marker:1;
 
     RWState()
-      : state(RWNONE),
-	count(0),
+      : count(0),
+	state(RWNONE),
 	recovery_read_marker(false),
 	snaptrimmer_write_marker(false)
     {}
@@ -3532,23 +3632,10 @@ public:
     lock.Unlock();
   }
 
-  // attr cache
-  map<string, bufferlist> attr_cache;
+  /// in-progress copyfrom ops for this object
+  bool blocked:1;
+  bool requeue_scrub_on_unblock:1;    // true if we need to requeue scrub on unblock
 
-  void fill_in_setattrs(const set<string> &changing, ObjectModDesc *mod) {
-    map<string, boost::optional<bufferlist> > to_set;
-    for (set<string>::const_iterator i = changing.begin();
-	 i != changing.end();
-	 ++i) {
-      map<string, bufferlist>::iterator iter = attr_cache.find(*i);
-      if (iter != attr_cache.end()) {
-	to_set[*i] = iter->second;
-      } else {
-	to_set[*i];
-      }
-    }
-    mod->setattrs(to_set);
-  }
 };
 
 inline ostream& operator<<(ostream& out, const ObjectState& obs)
@@ -3599,15 +3686,15 @@ WRITE_CLASS_ENCODER(ObjectRecoveryInfo)
 ostream& operator<<(ostream& out, const ObjectRecoveryInfo &inf);
 
 struct ObjectRecoveryProgress {
-  bool first;
   uint64_t data_recovered_to;
-  bool data_complete;
   string omap_recovered_to;
+  bool first;
+  bool data_complete;
   bool omap_complete;
 
   ObjectRecoveryProgress()
-    : first(true),
-      data_recovered_to(0),
+    : data_recovered_to(0),
+      first(true),
       data_complete(false), omap_complete(false) { }
 
   bool is_complete(const ObjectRecoveryInfo& info) const {
@@ -3687,21 +3774,21 @@ ostream& operator<<(ostream& out, const PushOp &op);
  */
 struct ScrubMap {
   struct object {
-    uint64_t size;
-    bool negative;
     map<string,bufferptr> attrs;
-    __u32 digest;              ///< data crc32c
-    bool digest_present;
-    uint32_t nlinks;
     set<snapid_t> snapcolls;
+    uint64_t size;
     __u32 omap_digest;         ///< omap crc32c
-    bool omap_digest_present;
-    bool read_error;
+    __u32 digest;              ///< data crc32c
+    uint32_t nlinks;
+    bool negative:1;
+    bool digest_present:1;
+    bool omap_digest_present:1;
+    bool read_error:1;
 
     object() :
       // Init invalid size so it won't match if we get a stat EIO error
-      size(-1), negative(false), digest(0), digest_present(false),
-      nlinks(0), omap_digest(0), omap_digest_present(false),
+      size(-1), omap_digest(0), digest(0), nlinks(0), 
+      negative(false), digest_present(false), omap_digest_present(false), 
       read_error(false) {}
 
     void encode(bufferlist& bl) const;
diff --git a/src/osdc/Objecter.h b/src/osdc/Objecter.h
index ac09e70..e892ee0 100644
--- a/src/osdc/Objecter.h
+++ b/src/osdc/Objecter.h
@@ -128,14 +128,12 @@ struct ObjectOperation {
   }
   void add_data(int op, uint64_t off, uint64_t len, bufferlist& bl) {
     OSDOp& osd_op = add_op(op);
-    osd_op.op.op = op;
     osd_op.op.extent.offset = off;
     osd_op.op.extent.length = len;
     osd_op.indata.claim_append(bl);
   }
   void add_clone_range(int op, uint64_t off, uint64_t len, const object_t& srcoid, uint64_t srcoff, snapid_t srcsnapid) {
     OSDOp& osd_op = add_op(op);
-    osd_op.op.op = op;
     osd_op.op.clonerange.offset = off;
     osd_op.op.clonerange.length = len;
     osd_op.op.clonerange.src_offset = srcoff;
@@ -143,7 +141,6 @@ struct ObjectOperation {
   }
   void add_xattr(int op, const char *name, const bufferlist& data) {
     OSDOp& osd_op = add_op(op);
-    osd_op.op.op = op;
     osd_op.op.xattr.name_len = (name ? strlen(name) : 0);
     osd_op.op.xattr.value_len = data.length();
     if (name)
@@ -152,7 +149,6 @@ struct ObjectOperation {
   }
   void add_xattr_cmp(int op, const char *name, uint8_t cmp_op, uint8_t cmp_mode, const bufferlist& data) {
     OSDOp& osd_op = add_op(op);
-    osd_op.op.op = op;
     osd_op.op.xattr.name_len = (name ? strlen(name) : 0);
     osd_op.op.xattr.value_len = data.length();
     osd_op.op.xattr.cmp_op = cmp_op;
@@ -170,7 +166,6 @@ struct ObjectOperation {
     out_bl[p] = outbl;
     out_rval[p] = prval;
 
-    osd_op.op.op = op;
     osd_op.op.cls.class_len = strlen(cname);
     osd_op.op.cls.method_len = strlen(method);
     osd_op.op.cls.indata_len = indata.length();
@@ -180,14 +175,12 @@ struct ObjectOperation {
   }
   void add_pgls(int op, uint64_t count, collection_list_handle_t cookie, epoch_t start_epoch) {
     OSDOp& osd_op = add_op(op);
-    osd_op.op.op = op;
     osd_op.op.pgls.count = count;
     osd_op.op.pgls.start_epoch = start_epoch;
     ::encode(cookie, osd_op.indata);
   }
   void add_pgls_filter(int op, uint64_t count, bufferlist& filter, collection_list_handle_t cookie, epoch_t start_epoch) {
     OSDOp& osd_op = add_op(op);
-    osd_op.op.op = op;
     osd_op.op.pgls.count = count;
     osd_op.op.pgls.start_epoch = start_epoch;
     string cname = "pg";
@@ -200,7 +193,6 @@ struct ObjectOperation {
   void add_alloc_hint(int op, uint64_t expected_object_size,
                       uint64_t expected_write_size) {
     OSDOp& osd_op = add_op(op);
-    osd_op.op.op = op;
     osd_op.op.alloc_hint.expected_object_size = expected_object_size;
     osd_op.op.alloc_hint.expected_write_size = expected_write_size;
   }
@@ -535,7 +527,6 @@ struct ObjectOperation {
   }
   void tmap_to_omap(bool nullok=false) {
      OSDOp& osd_op = add_op(CEPH_OSD_OP_TMAP2OMAP);
-     osd_op.op.op = CEPH_OSD_OP_TMAP2OMAP;
      if (nullok)
        osd_op.op.tmap2omap.flags = CEPH_OSD_TMAP2OMAP_NULLOK;
   }
@@ -615,8 +606,6 @@ struct ObjectOperation {
     if (prval) {
       unsigned p = ops.size() - 1;
       out_rval[p] = prval;
-      out_bl[p] = NULL;
-      out_handler[p] = NULL;
     }
   }
 
diff --git a/src/pybind/Makefile.am b/src/pybind/Makefile.am
new file mode 100644
index 0000000..fd23fcb
--- /dev/null
+++ b/src/pybind/Makefile.am
@@ -0,0 +1,51 @@
+EXTRA_DIST += $(srcdir)/pybind/setup.py $(srcdir)/pybind/rbd.pyx
+
+if ENABLE_CLIENT
+if WITH_RADOS
+if WITH_RBD
+if WITH_CYTHON
+
+PY_DISTUTILS = \
+	CPPFLAGS="-iquote \${abs_srcdir}/include ${AM_CPPFLAGS} ${CPPFLAGS}" \
+	CFLAGS="-iquote \${abs_srcdir}/include ${AM_CFLAGS} ${CFLAGS}" \
+	LDFLAGS="-L\${abs_builddir}/.libs $(subst -pie,,${AM_LDFLAGS}) ${LDFLAGS}" \
+	CYTHON_BUILD_DIR="$(shell readlink -f $(builddir))/build" \
+	${PYTHON} ./setup.py
+
+pybind-all: librbd.la ${srcdir}/ceph_ver.h
+	cd $(srcdir)/pybind; $(PY_DISTUTILS) build \
+	--build-base $(shell readlink -f $(builddir))/build \
+	--verbose
+
+pybind-clean: ${srcdir}/ceph_ver.h
+	cd $(srcdir)/pybind; $(PY_DISTUTILS) clean \
+	--build-base $(shell readlink -f $(builddir))/build \
+	--verbose
+
+pybind-install-exec: ${srcdir}/ceph_ver.h
+	if test "$(DESTDIR)" ; then \
+		if lsb_release -si | grep --quiet 'Ubuntu\|Debian\|Devuan' ; then \
+			options=--install-layout=deb ; \
+		else \
+			options=--prefix=/usr ; \
+		fi ; \
+		root="--root=$(DESTDIR)" ; \
+	else \
+		options=--prefix=$(prefix) ; \
+	fi ; \
+	cd $(srcdir)/pybind; $(PY_DISTUTILS) build \
+	--build-base $(shell readlink -f $(builddir))/build \
+	install \
+	$$options $$root \
+	--single-version-externally-managed \
+	--record /dev/null \
+	--verbose
+
+LOCAL_ALL += pybind-all
+LOCAL_CLEAN += pybind-clean
+LOCAL_INSTALLEXEC += pybind-install-exec
+
+endif
+endif
+endif
+endif
diff --git a/src/pybind/cephfs.py b/src/pybind/cephfs.py
index df33da7..f11b6dc 100644
--- a/src/pybind/cephfs.py
+++ b/src/pybind/cephfs.py
@@ -509,6 +509,19 @@ class LibCephFS(object):
                           st_atime=statbuf.st_atime, st_mtime=statbuf.st_mtime,
                           st_ctime=statbuf.st_ctime)
 
+    def symlink(self, existing, newname):
+        if not isinstance(existing, str):
+            raise TypeError('existing must be a string')
+        if not isinstance(newname, str):
+            raise TypeError('newname must be a string')
+        self.require_state("mounted")
+        ret = self.libcephfs.ceph_symlink(
+            self.cluster,
+            c_char_p(existing),
+            c_char_p(newname))
+        if ret < 0:
+            raise make_ex(ret, "error in symlink")
+
     def unlink(self, path):
         self.require_state("mounted")
         ret = self.libcephfs.ceph_unlink(
diff --git a/src/pybind/rados.py b/src/pybind/rados.py
index 47a03a1..2d4022f 100644
--- a/src/pybind/rados.py
+++ b/src/pybind/rados.py
@@ -53,6 +53,9 @@ class PermissionError(Error):
     """ `PermissionError` class, derived from `Error` """
     pass
 
+class PermissionDeniedError(Error):
+    """ `PermissionDeniedError` class, derived from `Error`; raised for EACCES """
+    pass
 
 class ObjectNotFound(Error):
     """ `ObjectNotFound` class, derived from `Error` """
@@ -134,7 +137,8 @@ def make_ex(ret, msg):
         errno.EBUSY     : ObjectBusy,
         errno.ENODATA   : NoData,
         errno.EINTR     : InterruptedOrTimeoutError,
-        errno.ETIMEDOUT : TimedOut
+        errno.ETIMEDOUT : TimedOut,
+        errno.EACCES    : PermissionDeniedError
         }
     ret = abs(ret)
     if ret in errors:
@@ -1663,6 +1667,37 @@ returned %d, but should return zero on success." % (self.name, ret))
             raise make_ex(ret, "Ioctx.read(%s): failed to read %s" % (self.name, key))
         return ctypes.string_at(ret_buf, ret)
 
+    @requires(('key', str_type), ('cls', str_type), ('method', str_type), ('data', bytes))
+    def execute(self, key, cls, method, data, length=8192):
+        """
+        Execute an OSD class method on an object.
+
+        :param key: name of the object
+        :type key: str
+        :param cls: name of the object class
+        :type cls: str
+        :param method: name of the method
+        :type method: str
+        :param data: input data
+        :type data: bytes
+        :param length: size of output buffer in bytes (default=8192)
+        :type length: int
+
+        :raises: :class:`TypeError`
+        :raises: :class:`Error`
+        :returns: (ret, method output)
+        """
+        self.require_ioctx_open()
+        ret_buf = create_string_buffer(length)
+        ret = run_in_thread(self.librados.rados_exec,
+                (self.io, cstr(key), cstr(cls), cstr(method),
+                    c_char_p(data), c_size_t(len(data)), ret_buf,
+                    c_size_t(length)))
+        if ret < 0:
+            raise make_ex(ret, "Ioctx.exec(%s): failed to exec %s:%s on %s" %
+                    (self.name, cls, method, key))
+        return ret, ctypes.string_at(ret_buf, min(ret, length))
+
     def get_stats(self):
         """
         Get pool usage statistics
@@ -1949,6 +1984,25 @@ returned %d, but should return zero on success." % (self.name, ret))
             raise make_ex(ret, "Failed to lookup snap %s" % snap_name)
         return Snap(self, snap_name, snap_id)
 
+    @requires(('oid', str_type), ('snap_name', str_type))
+    def snap_rollback(self, oid, snap_name):
+        """
+        Rollback an object to a snapshot
+
+        :param oid: the name of the object
+        :type oid: str
+        :param snap_name: the name of the snapshot
+        :type snap_name: str
+
+        :raises: :class:`TypeError`
+        :raises: :class:`Error`
+        """
+        self.require_ioctx_open()
+        ret = run_in_thread(self.librados.rados_ioctx_snap_rollback,
+                            (self.io, cstr(oid), cstr(snap_name)))
+        if (ret != 0):
+            raise make_ex(ret, "Failed to rollback %s" % oid)
+
     def get_last_version(self):
         """
         Return the version of the last object read or written to.
diff --git a/src/pybind/rbd.py b/src/pybind/rbd.py
deleted file mode 100644
index 8f910de..0000000
--- a/src/pybind/rbd.py
+++ /dev/null
@@ -1,1262 +0,0 @@
-"""
-This module is a thin wrapper around librbd.
-
-It currently provides all the synchronous methods of librbd that do
-not use callbacks.
-
-Error codes from librbd are turned into exceptions that subclass
-:class:`Error`. Almost all methods may raise :class:`Error`
-(the base class of all rbd exceptions), :class:`PermissionError`
-and :class:`IOError`, in addition to those documented for the
-method.
-"""
-# Copyright 2011 Josh Durgin
-from collections import Iterable
-from ctypes import CDLL, c_char, c_char_p, c_size_t, c_void_p, c_int, \
-    create_string_buffer, byref, Structure, c_uint64, c_int64, c_uint8, \
-    CFUNCTYPE
-from ctypes.util import find_library
-import ctypes
-import errno
-import sys
-
-from rados import cstr, decode_cstr
-
-ANONYMOUS_AUID = 0xffffffffffffffff
-ADMIN_AUID = 0
-
-RBD_FEATURE_LAYERING = 1
-RBD_FEATURE_STRIPINGV2 = 2
-RBD_FEATURE_EXCLUSIVE_LOCK = 4
-RBD_FEATURE_OBJECT_MAP = 8
-RBD_FEATURE_FAST_DIFF = 16
-RBD_FEATURE_DEEP_FLATTEN = 32
-RBD_FEATURE_JOURNALING = 64
-
-RBD_FEATURES_ALL = (RBD_FEATURE_LAYERING       |
-                    RBD_FEATURE_STRIPINGV2     |
-                    RBD_FEATURE_EXCLUSIVE_LOCK |
-                    RBD_FEATURE_OBJECT_MAP     |
-                    RBD_FEATURE_FAST_DIFF      |
-                    RBD_FEATURE_DEEP_FLATTEN   |
-                    RBD_FEATURE_JOURNALING)
-
-# features that make an image inaccessible for read or write by
-# clients that don't understand them
-RBD_FEATURES_INCOMPATIBLE = (RBD_FEATURE_LAYERING |
-                             RBD_FEATURE_STRIPINGV2)
-
-# features that make an image unwritable by clients that don't
-# understand them
-RBD_FEATURES_RW_INCOMPATIBLE = (RBD_FEATURES_INCOMPATIBLE  |
-                                RBD_FEATURE_EXCLUSIVE_LOCK |
-                                RBD_FEATURE_OBJECT_MAP     |
-                                RBD_FEATURE_FAST_DIFF      |
-                                RBD_FEATURE_DEEP_FLATTEN   |
-                                RBD_FEATURE_JOURNALING)
-
-# features that may be dynamically enabled or disabled
-RBD_FEATURES_MUTABLE = (RBD_FEATURE_EXCLUSIVE_LOCK |
-                        RBD_FEATURE_OBJECT_MAP     |
-                        RBD_FEATURE_FAST_DIFF      |
-                        RBD_FEATURE_JOURNALING)
-
-# features that only work when used with a single client
-# using the image for writes
-RBD_FEATURES_SINGLE_CLIENT = (RBD_FEATURE_EXCLUSIVE_LOCK |
-                              RBD_FEATURE_OBJECT_MAP     |
-                              RBD_FEATURE_FAST_DIFF      |
-                              RBD_FEATURE_JOURNALING)
-
-RBD_FLAG_OBJECT_MAP_INVALID = 1
-
-RBD_IMAGE_OPTION_FORMAT = 0
-RBD_IMAGE_OPTION_FEATURES = 1
-RBD_IMAGE_OPTION_ORDER = 2
-RBD_IMAGE_OPTION_STRIPE_UNIT = 3
-RBD_IMAGE_OPTION_STRIPE_COUNT = 4
-
-
-# Are we running Python 2.x
-_python2 = sys.hexversion < 0x03000000
-
-
-if _python2:
-    str_type = basestring
-else:
-    str_type = str
-
-
-class Error(Exception):
-    pass
-
-
-class PermissionError(Error):
-    pass
-
-
-class ImageNotFound(Error):
-    pass
-
-
-class ImageExists(Error):
-    pass
-
-
-class IOError(Error):
-    pass
-
-
-class NoSpace(Error):
-    pass
-
-
-class IncompleteWriteError(Error):
-    pass
-
-
-class InvalidArgument(Error):
-    pass
-
-
-class LogicError(Error):
-    pass
-
-
-class ReadOnlyImage(Error):
-    pass
-
-
-class ImageBusy(Error):
-    pass
-
-
-class ImageHasSnapshots(Error):
-    pass
-
-
-class FunctionNotSupported(Error):
-    pass
-
-
-class ArgumentOutOfRange(Error):
-    pass
-
-
-class ConnectionShutdown(Error):
-    pass
-
-
-class Timeout(Error):
-    pass
-
-
-def make_ex(ret, msg):
-    """
-    Translate a librbd return code into an exception.
-
-    :param ret: the return code
-    :type ret: int
-    :param msg: the error message to use
-    :type msg: str
-    :returns: a subclass of :class:`Error`
-    """
-    errors = {
-        errno.EPERM     : PermissionError,
-        errno.ENOENT    : ImageNotFound,
-        errno.EIO       : IOError,
-        errno.ENOSPC    : NoSpace,
-        errno.EEXIST    : ImageExists,
-        errno.EINVAL    : InvalidArgument,
-        errno.EROFS     : ReadOnlyImage,
-        errno.EBUSY     : ImageBusy,
-        errno.ENOTEMPTY : ImageHasSnapshots,
-        errno.ENOSYS    : FunctionNotSupported,
-        errno.EDOM      : ArgumentOutOfRange,
-        errno.ESHUTDOWN : ConnectionShutdown,
-        errno.ETIMEDOUT : Timeout,
-        }
-    ret = abs(ret)
-    if ret in errors:
-        return errors[ret](msg)
-    else:
-        return Error(msg + (": error code %d" % ret))
-
-
-class rbd_image_info_t(Structure):
-    _fields_ = [("size", c_uint64),
-                ("obj_size", c_uint64),
-                ("num_objs", c_uint64),
-                ("order", c_int),
-                ("block_name_prefix", c_char * 24),
-                ("parent_pool", c_int64),
-                ("parent_name", c_char * 96)]
-
-
-class rbd_snap_info_t(Structure):
-    _fields_ = [("id", c_uint64),
-                ("size", c_uint64),
-                ("name", c_char_p)]
-
-
-def load_librbd():
-    """
-    Load the librbd shared library.
-    """
-    librbd_path = find_library('rbd')
-    if librbd_path:
-        return CDLL(librbd_path)
-
-    # try harder, find_library() doesn't search LD_LIBRARY_PATH
-    # in addition, it doesn't seem work on centos 6.4 (see e46d2ca067b5)
-    try:
-        return CDLL('librbd.so.1')
-    except OSError as e:
-        raise EnvironmentError("Unable to load librbd: %s" % e)
-
-
-class RBD(object):
-    """
-    This class wraps librbd CRUD functions.
-    """
-    def __init__(self):
-        self.librbd = load_librbd()
-
-    def version(self):
-        """
-        Get the version number of the ``librbd`` C library.
-
-        :returns: a tuple of ``(major, minor, extra)`` components of the
-                  librbd version
-        """
-        major = c_int(0)
-        minor = c_int(0)
-        extra = c_int(0)
-        self.librbd.rbd_version(byref(major), byref(minor), byref(extra))
-        return (major.value, minor.value, extra.value)
-
-    def create(self, ioctx, name, size, order=None, old_format=True,
-               features=0, stripe_unit=0, stripe_count=0):
-        """
-        Create an rbd image.
-
-        :param ioctx: the context in which to create the image
-        :type ioctx: :class:`rados.Ioctx`
-        :param name: what the image is called
-        :type name: str
-        :param size: how big the image is in bytes
-        :type size: int
-        :param order: the image is split into (2**order) byte objects
-        :type order: int
-        :param old_format: whether to create an old-style image that
-                           is accessible by old clients, but can't
-                           use more advanced features like layering.
-        :type old_format: bool
-        :param features: bitmask of features to enable
-        :type features: int
-        :param stripe_unit: stripe unit in bytes (default 0 for object size)
-        :type stripe_unit: int
-        :param stripe_count: objects to stripe over before looping
-        :type stripe_count: int
-        :raises: :class:`ImageExists`
-        :raises: :class:`TypeError`
-        :raises: :class:`InvalidArgument`
-        :raises: :class:`FunctionNotSupported`
-        """
-        if order is None:
-            order = 0
-        if not isinstance(name, str_type):
-            raise TypeError('name must be a string')
-        if old_format:
-            if features != 0 or stripe_unit != 0 or stripe_count != 0:
-                raise InvalidArgument('format 1 images do not support feature'
-                                      ' masks or non-default striping')
-            ret = self.librbd.rbd_create(ioctx.io, cstr(name),
-                                         c_uint64(size),
-                                         byref(c_int(order)))
-        else:
-            if not hasattr(self.librbd, 'rbd_create2'):
-                raise FunctionNotSupported('installed version of librbd does'
-                                           ' not support format 2 images')
-            has_create4 = hasattr(self.librbd, 'rbd_create4')
-            has_create3 = hasattr(self.librbd, 'rbd_create3')
-            if (stripe_unit != 0 or stripe_count != 0) and not has_create3:
-                raise FunctionNotSupported('installed version of librbd does'
-                                           ' not support stripe unit or count')
-            if has_create4:
-                format = old_format and 1 or 2
-                opts = c_void_p()
-                self.librbd.rbd_image_options_create(byref(opts))
-                self.librbd.rbd_image_options_set_uint64(opts,
-                                                         RBD_IMAGE_OPTION_FORMAT,
-                                                         c_uint64(format))
-                self.librbd.rbd_image_options_set_uint64(opts,
-                                                         RBD_IMAGE_OPTION_FEATURES,
-                                                         c_uint64(features))
-                self.librbd.rbd_image_options_set_uint64(opts,
-                                                         RBD_IMAGE_OPTION_ORDER,
-                                                         c_uint64(order))
-                self.librbd.rbd_image_options_set_uint64(opts,
-                                                         RBD_IMAGE_OPTION_STRIPE_UNIT,
-                                                         c_uint64(stripe_unit))
-                self.librbd.rbd_image_options_set_uint64(opts,
-                                                         RBD_IMAGE_OPTION_STRIPE_COUNT,
-                                                         c_uint64(stripe_count))
-                ret = self.librbd.rbd_create4(ioctx.io, cstr(name),
-                                              c_uint64(size), opts)
-                self.librbd.rbd_image_options_get_uint64(opts,
-                                                         RBD_IMAGE_OPTION_ORDER,
-                                                         byref(c_uint64(order)))
-                self.librbd.rbd_image_options_destroy(opts)
-            elif has_create3:
-                ret = self.librbd.rbd_create3(ioctx.io, cstr(name),
-                                              c_uint64(size),
-                                              c_uint64(features),
-                                              byref(c_int(order)),
-                                              c_uint64(stripe_unit),
-                                              c_uint64(stripe_count))
-            else:
-                ret = self.librbd.rbd_create2(ioctx.io, cstr(name),
-                                              c_uint64(size),
-                                              c_uint64(features),
-                                              byref(c_int(order)))
-        if ret < 0:
-            raise make_ex(ret, 'error creating image')
-
-    def clone(self, p_ioctx, p_name, p_snapname, c_ioctx, c_name,
-              features=0, order=None, stripe_unit=0, stripe_count=0):
-        """
-        Clone a parent rbd snapshot into a COW sparse child.
-
-        :param p_ioctx: the parent context that represents the parent snap
-        :type ioctx: :class:`rados.Ioctx`
-        :param p_name: the parent image name
-        :type name: str
-        :param p_snapname: the parent image snapshot name
-        :type name: str
-        :param c_ioctx: the child context that represents the new clone
-        :type ioctx: :class:`rados.Ioctx`
-        :param c_name: the clone (child) name
-        :type name: str
-        :param features: bitmask of features to enable; if set, must include layering
-        :type features: int
-        :param order: the image is split into (2**order) byte objects
-        :type order: int
-        :param stripe_unit: stripe unit in bytes (default 0 for object size)
-        :type stripe_unit: int
-        :param stripe_count: objects to stripe over before looping
-        :type stripe_count: int
-        :raises: :class:`TypeError`
-        :raises: :class:`InvalidArgument`
-        :raises: :class:`ImageExists`
-        :raises: :class:`FunctionNotSupported`
-        :raises: :class:`ArgumentOutOfRange`
-        """
-        if order is None:
-            order = 0
-        if not isinstance(p_snapname, str_type) or not isinstance(p_name, str_type):
-            raise TypeError('parent name and snapname must be strings')
-        if not isinstance(c_name, str_type):
-            raise TypeError('child name must be a string')
-
-        has_clone3 = hasattr(self.librbd, 'rbd_clone3')
-        if (stripe_unit != 0 or stripe_count != 0) and not has_clone3:
-            raise FunctionNotSupported('installed version of librbd does'
-                                       ' not support stripe unit or count')
-        if has_clone3:
-            opts = c_void_p()
-            self.librbd.rbd_image_options_create(byref(opts))
-            self.librbd.rbd_image_options_set_uint64(opts,
-                                                     RBD_IMAGE_OPTION_FEATURES,
-                                                     c_uint64(features))
-            self.librbd.rbd_image_options_set_uint64(opts,
-                                                     RBD_IMAGE_OPTION_ORDER,
-                                                     c_uint64(order))
-            self.librbd.rbd_image_options_set_uint64(opts,
-                                                     RBD_IMAGE_OPTION_STRIPE_UNIT,
-                                                     c_uint64(stripe_unit))
-            self.librbd.rbd_image_options_set_uint64(opts,
-                                                     RBD_IMAGE_OPTION_STRIPE_COUNT,
-                                                     c_uint64(stripe_count))
-            ret = self.librbd.rbd_clone3(p_ioctx.io, cstr(p_name),
-                                         cstr(p_snapname),
-                                         c_ioctx.io, cstr(c_name),
-                                         opts)
-            self.librbd.rbd_image_options_get_uint64(opts,
-                                                     RBD_IMAGE_OPTION_ORDER,
-                                                     byref(c_uint64(order)))
-            self.librbd.rbd_image_options_destroy(opts)
-        else:
-            ret = self.librbd.rbd_clone(p_ioctx.io, cstr(p_name),
-                                        cstr(p_snapname),
-                                        c_ioctx.io, cstr(c_name),
-                                        c_uint64(features),
-                                        byref(c_int(order)))
-        if ret < 0:
-            raise make_ex(ret, 'error creating clone')
-
-    def list(self, ioctx):
-        """
-        List image names.
-
-        :param ioctx: determines which RADOS pool is read
-        :type ioctx: :class:`rados.Ioctx`
-        :returns: list -- a list of image names
-        """
-        size = c_size_t(512)
-        while True:
-            c_names = create_string_buffer(size.value)
-            ret = self.librbd.rbd_list(ioctx.io, byref(c_names), byref(size))
-            if ret >= 0:
-                break
-            elif ret != -errno.ERANGE:
-                raise make_ex(ret, 'error listing images')
-
-        return [decode_cstr(name) for name in c_names.raw.split(b'\0') if len(name) > 0]
-
-    def remove(self, ioctx, name):
-        """
-        Delete an RBD image. This may take a long time, since it does
-        not return until every object that comprises the image has
-        been deleted. Note that all snapshots must be deleted before
-        the image can be removed. If there are snapshots left,
-        :class:`ImageHasSnapshots` is raised. If the image is still
-        open, or the watch from a crashed client has not expired,
-        :class:`ImageBusy` is raised.
-
-        :param ioctx: determines which RADOS pool the image is in
-        :type ioctx: :class:`rados.Ioctx`
-        :param name: the name of the image to remove
-        :type name: str
-        :raises: :class:`ImageNotFound`, :class:`ImageBusy`,
-                 :class:`ImageHasSnapshots`
-        """
-        if not isinstance(name, str_type):
-            raise TypeError('name must be a string')
-        ret = self.librbd.rbd_remove(ioctx.io, cstr(name))
-        if ret != 0:
-            raise make_ex(ret, 'error removing image')
-
-    def rename(self, ioctx, src, dest):
-        """
-        Rename an RBD image.
-
-        :param ioctx: determines which RADOS pool the image is in
-        :type ioctx: :class:`rados.Ioctx`
-        :param src: the current name of the image
-        :type src: str
-        :param dest: the new name of the image
-        :type dest: str
-        :raises: :class:`ImageNotFound`, :class:`ImageExists`
-        """
-        if not isinstance(src, str_type) or not isinstance(dest, str_type):
-            raise TypeError('src and dest must be strings')
-        ret = self.librbd.rbd_rename(ioctx.io, cstr(src), cstr(dest))
-        if ret != 0:
-            raise make_ex(ret, 'error renaming image')
-
-
-class Image(object):
-    """
-    This class represents an RBD image. It is used to perform I/O on
-    the image and interact with snapshots.
-
-    **Note**: Any method of this class may raise :class:`ImageNotFound`
-    if the image has been deleted.
-    """
-
-    def __init__(self, ioctx, name, snapshot=None, read_only=False):
-        """
-        Open the image at the given snapshot.
-        If a snapshot is specified, the image will be read-only, unless
-        :func:`Image.set_snap` is called later.
-
-        If read-only mode is used, metadata for the :class:`Image`
-        object (such as which snapshots exist) may become obsolete. See
-        the C api for more details.
-
-        To clean up from opening the image, :func:`Image.close` should
-        be called.  For ease of use, this is done automatically when
-        an :class:`Image` is used as a context manager (see :pep:`343`).
-
-        :param ioctx: determines which RADOS pool the image is in
-        :type ioctx: :class:`rados.Ioctx`
-        :param name: the name of the image
-        :type name: str
-        :param snapshot: which snapshot to read from
-        :type snaphshot: str
-        :param read_only: whether to open the image in read-only mode
-        :type read_only: bool
-        """
-        self.closed = True
-        self.librbd = load_librbd()
-        self.image = c_void_p()
-        self.name = name
-        if not isinstance(name, str_type):
-            raise TypeError('name must be a string')
-        if snapshot is not None and not isinstance(snapshot, str_type):
-            raise TypeError('snapshot must be a string or None')
-        if read_only:
-            if not hasattr(self.librbd, 'rbd_open_read_only'):
-                raise FunctionNotSupported('installed version of librbd does '
-                                           'not support open in read-only mode')
-            ret = self.librbd.rbd_open_read_only(ioctx.io, cstr(name),
-                                                 byref(self.image),
-                                                 cstr(snapshot))
-        else:
-            ret = self.librbd.rbd_open(ioctx.io, cstr(name),
-                                       byref(self.image), cstr(snapshot))
-        if ret != 0:
-            raise make_ex(ret, 'error opening image %s at snapshot %s' % (name, snapshot))
-        self.closed = False
-
-    def __enter__(self):
-        return self
-
-    def __exit__(self, type_, value, traceback):
-        """
-        Closes the image. See :func:`close`
-        """
-        self.close()
-        return False
-
-    def close(self):
-        """
-        Release the resources used by this image object.
-
-        After this is called, this object should not be used.
-        """
-        if not self.closed:
-            self.closed = True
-            ret = self.librbd.rbd_close(self.image)
-            if ret < 0:
-                raise make_ex(ret, 'error while closing image %s' % (
-                              self.name,))
-
-    def __del__(self):
-        self.close()
-
-    def __str__(self):
-        s = "rbd.Image(" + dict.__repr__(self.__dict__) + ")"
-        return s
-
-    def resize(self, size):
-        """
-        Change the size of the image.
-
-        :param size: the new size of the image
-        :type size: int
-        """
-        ret = self.librbd.rbd_resize(self.image, c_uint64(size))
-        if ret < 0:
-            raise make_ex(ret, 'error resizing image %s' % (self.name,))
-
-    def stat(self):
-        """
-        Get information about the image. Currently parent pool and
-        parent name are always -1 and ''.
-
-        :returns: dict - contains the following keys:
-
-            * ``size`` (int) - the size of the image in bytes
-
-            * ``obj_size`` (int) - the size of each object that comprises the
-              image
-
-            * ``num_objs`` (int) - the number of objects in the image
-
-            * ``order`` (int) - log_2(object_size)
-
-            * ``block_name_prefix`` (str) - the prefix of the RADOS objects used
-              to store the image
-
-            * ``parent_pool`` (int) - deprecated
-
-            * ``parent_name``  (str) - deprecated
-
-            See also :meth:`format` and :meth:`features`.
-
-        """
-        info = rbd_image_info_t()
-        ret = self.librbd.rbd_stat(self.image, byref(info), ctypes.sizeof(info))
-        if ret != 0:
-            raise make_ex(ret, 'error getting info for image %s' % (self.name,))
-        return {
-            'size'              : info.size,
-            'obj_size'          : info.obj_size,
-            'num_objs'          : info.num_objs,
-            'order'             : info.order,
-            'block_name_prefix' : decode_cstr(info.block_name_prefix),
-            'parent_pool'       : info.parent_pool,
-            'parent_name'       : info.parent_name
-            }
-
-    def parent_info(self):
-        """
-        Get information about a cloned image's parent (if any)
-
-        :returns: tuple - ``(pool name, image name, snapshot name)`` components
-                  of the parent image
-        :raises: :class:`ImageNotFound` if the image doesn't have a parent
-        """
-        ret = -errno.ERANGE
-        size = 8
-        while ret == -errno.ERANGE and size <= 4096:
-            pool = create_string_buffer(size)
-            name = create_string_buffer(size)
-            snapname = create_string_buffer(size)
-            ret = self.librbd.rbd_get_parent_info(self.image, byref(pool),
-                                                  c_size_t(size),
-                                                  byref(name),
-                                                  c_size_t(size),
-                                                  byref(snapname),
-                                                  c_size_t(size))
-            if ret == -errno.ERANGE:
-                size *= 2
-
-        if ret != 0:
-            raise make_ex(ret, 'error getting parent info for image %s' % (self.name,))
-        return (decode_cstr(pool.value),
-                decode_cstr(name.value),
-                decode_cstr(snapname.value))
-
-    def old_format(self):
-        """
-        Find out whether the image uses the old RBD format.
-
-        :returns: bool - whether the image uses the old RBD format
-        """
-        old = c_uint8()
-        ret = self.librbd.rbd_get_old_format(self.image, byref(old))
-        if ret != 0:
-            raise make_ex(ret, 'error getting old_format for image' % (self.name))
-        return old.value != 0
-
-    def size(self):
-        """
-        Get the size of the image. If open to a snapshot, returns the
-        size of that snapshot.
-
-        :returns: the size of the image in bytes
-        """
-        image_size = c_uint64()
-        ret = self.librbd.rbd_get_size(self.image, byref(image_size))
-        if ret != 0:
-            raise make_ex(ret, 'error getting size for image' % (self.name))
-        return image_size.value
-
-    def features(self):
-        """
-        Gets the features bitmask of the image.
-
-        :returns: int - the features bitmask of the image
-        """
-        features = c_uint64()
-        ret = self.librbd.rbd_get_features(self.image, byref(features))
-        if ret != 0:
-            raise make_ex(ret, 'error getting features for image' % (self.name))
-        return features.value
-
-    def update_features(self, features, enabled):
-        """
-        Updates the features bitmask of the image by enabling/disabling
-        a single feature.  The feature must support the ability to be
-        dynamically enabled/disabled.
-
-        :param features: feature bitmask to enable/disable
-        :type features: int
-        :param enabled: whether to enable/disable the feature
-        :type enabled: bool
-        :raises: :class:`InvalidArgument`
-        """
-        ret = self.librbd.rbd_update_features(self.image, c_uint64(features),
-                                              c_uint8(enabled));
-        if ret != 0:
-            raise make_ex(ret, 'error updating features for image %s' %
-                               (self.name))
-
-    def overlap(self):
-        """
-        Gets the number of overlapping bytes between the image and its parent
-        image. If open to a snapshot, returns the overlap between the snapshot
-        and the parent image.
-
-        :returns: int - the overlap in bytes
-        :raises: :class:`ImageNotFound` if the image doesn't have a parent
-        """
-        overlap = c_uint64()
-        ret = self.librbd.rbd_get_overlap(self.image, byref(overlap))
-        if ret != 0:
-            raise make_ex(ret, 'error getting overlap for image' % (self.name))
-        return overlap.value
-
-    def flags(self):
-        """
-        Gets the flags bitmask of the image.
-
-        :returns: int - the flags bitmask of the image
-        """
-        flags = c_uint64()
-        ret = self.librbd.rbd_get_flags(self.image, byref(flags))
-        if ret != 0:
-            raise make_ex(ret, 'error getting flags for image' % (self.name))
-        return flags.value
-
-    def is_exclusive_lock_owner(self):
-        """
-        Gets the status of the image exclusive lock.
-
-        :returns: bool - true if the image is exclusively locked
-        """
-        owner = c_int()
-        ret = self.librbd.rbd_is_exclusive_lock_owner(self.image, byref(owner))
-        if ret != 0:
-            raise make_ex(ret, 'error getting lock status for image' % (self.name))
-        return owner.value == 1
-
-    def copy(self, dest_ioctx, dest_name, features=0, order=None, stripe_unit=0,
-             stripe_count=0):
-        """
-        Copy the image to another location.
-
-        :param dest_ioctx: determines which pool to copy into
-        :type dest_ioctx: :class:`rados.Ioctx`
-        :param dest_name: the name of the copy
-        :type dest_name: str
-        :param features: bitmask of features to enable; if set, must include layering
-        :type features: int
-        :param order: the image is split into (2**order) byte objects
-        :type order: int
-        :param stripe_unit: stripe unit in bytes (default 0 for object size)
-        :type stripe_unit: int
-        :param stripe_count: objects to stripe over before looping
-        :type stripe_count: int
-        :raises: :class:`TypeError`
-        :raises: :class:`InvalidArgument`
-        :raises: :class:`ImageExists`
-        :raises: :class:`FunctionNotSupported`
-        :raises: :class:`ArgumentOutOfRange`
-        """
-        if order is None:
-            order = 0
-        if not isinstance(dest_name, str_type):
-            raise TypeError('dest_name must be a string')
-        has_copy3 = hasattr(self.librbd, 'rbd_copy3')
-        if (stripe_unit != 0 or stripe_count != 0) and not has_copy3:
-            raise FunctionNotSupported('installed version of librbd does'
-                                       ' not support stripe unit or count')
-        if has_copy3:
-            opts = c_void_p()
-            self.librbd.rbd_image_options_create(byref(opts))
-            self.librbd.rbd_image_options_set_uint64(opts,
-                                                     RBD_IMAGE_OPTION_FEATURES,
-                                                     c_uint64(features))
-            self.librbd.rbd_image_options_set_uint64(opts,
-                                                     RBD_IMAGE_OPTION_ORDER,
-                                                     c_uint64(order))
-            self.librbd.rbd_image_options_set_uint64(opts,
-                                                     RBD_IMAGE_OPTION_STRIPE_UNIT,
-                                                     c_uint64(stripe_unit))
-            self.librbd.rbd_image_options_set_uint64(opts,
-                                                     RBD_IMAGE_OPTION_STRIPE_COUNT,
-                                                     c_uint64(stripe_count))
-            ret = self.librbd.rbd_copy3(self.image, dest_ioctx.io,
-                                        cstr(dest_name), opts)
-            self.librbd.rbd_image_options_get_uint64(opts,
-                                                     RBD_IMAGE_OPTION_ORDER,
-                                                     byref(c_uint64(order)))
-            self.librbd.rbd_image_options_destroy(opts)
-        else:
-            ret = self.librbd.rbd_copy(self.image, dest_ioctx.io, cstr(dest_name))
-        if ret < 0:
-            raise make_ex(ret, 'error copying image %s to %s' % (self.name, dest_name))
-
-    def list_snaps(self):
-        """
-        Iterate over the snapshots of an image.
-
-        :returns: :class:`SnapIterator`
-        """
-        return SnapIterator(self)
-
-    def create_snap(self, name):
-        """
-        Create a snapshot of the image.
-
-        :param name: the name of the snapshot
-        :type name: str
-        :raises: :class:`ImageExists`
-        """
-        if not isinstance(name, str_type):
-            raise TypeError('name must be a string')
-        ret = self.librbd.rbd_snap_create(self.image, cstr(name))
-        if ret != 0:
-            raise make_ex(ret, 'error creating snapshot %s from %s' % (name, self.name))
-
-    def rename_snap(self, srcname, dstname):
-        """
-        rename a snapshot of the image.
-
-        :param srcname: the src name of the snapshot
-        :type srcname: str
-        :param dstname: the dst name of the snapshot
-        :type dstname: str
-        :raises: :class:`ImageExists`
-        """
-        if not isinstance(srcname, str_type):
-            raise TypeError('src name must be a string')
-        if not isinstance(dstname, str_type):
-            raise TypeError('dst name must be a string')
-        ret = self.librbd.rbd_snap_rename(self.image, cstr(srcname), cstr(dstname))
-        if ret != 0:
-            raise make_ex(ret, 'error renaming snapshot of %s from %s to %s' % (self.name, srcname, dstname))
-
-    def remove_snap(self, name):
-        """
-        Delete a snapshot of the image.
-
-        :param name: the name of the snapshot
-        :type name: str
-        :raises: :class:`IOError`, :class:`ImageBusy`
-        """
-        if not isinstance(name, str_type):
-            raise TypeError('name must be a string')
-        ret = self.librbd.rbd_snap_remove(self.image, cstr(name))
-        if ret != 0:
-            raise make_ex(ret, 'error removing snapshot %s from %s' % (name, self.name))
-
-    def rollback_to_snap(self, name):
-        """
-        Revert the image to its contents at a snapshot. This is a
-        potentially expensive operation, since it rolls back each
-        object individually.
-
-        :param name: the snapshot to rollback to
-        :type name: str
-        :raises: :class:`IOError`
-        """
-        if not isinstance(name, str_type):
-            raise TypeError('name must be a string')
-        ret = self.librbd.rbd_snap_rollback(self.image, cstr(name))
-        if ret != 0:
-            raise make_ex(ret, 'error rolling back image %s to snapshot %s' % (self.name, name))
-
-    def protect_snap(self, name):
-        """
-        Mark a snapshot as protected. This means it can't be deleted
-        until it is unprotected.
-
-        :param name: the snapshot to protect
-        :type name: str
-        :raises: :class:`IOError`, :class:`ImageNotFound`
-        """
-        if not isinstance(name, str_type):
-            raise TypeError('name must be a string')
-        ret = self.librbd.rbd_snap_protect(self.image, cstr(name))
-        if ret != 0:
-            raise make_ex(ret, 'error protecting snapshot %s@%s' % (self.name, name))
-
-    def unprotect_snap(self, name):
-        """
-        Mark a snapshot unprotected. This allows it to be deleted if
-        it was protected.
-
-        :param name: the snapshot to unprotect
-        :type name: str
-        :raises: :class:`IOError`, :class:`ImageNotFound`
-        """
-        if not isinstance(name, str_type):
-            raise TypeError('name must be a string')
-        ret = self.librbd.rbd_snap_unprotect(self.image, cstr(name))
-        if ret != 0:
-            raise make_ex(ret, 'error unprotecting snapshot %s@%s' % (self.name, name))
-
-    def is_protected_snap(self, name):
-        """
-        Find out whether a snapshot is protected from deletion.
-
-        :param name: the snapshot to check
-        :type name: str
-        :returns: bool - whether the snapshot is protected
-        :raises: :class:`IOError`, :class:`ImageNotFound`
-        """
-        if not isinstance(name, str_type):
-            raise TypeError('name must be a string')
-        is_protected = c_int()
-        ret = self.librbd.rbd_snap_is_protected(self.image, cstr(name),
-                                                byref(is_protected))
-        if ret != 0:
-            raise make_ex(ret, 'error checking if snapshot %s@%s is protected' % (self.name, name))
-        return is_protected.value == 1
-
-    def set_snap(self, name):
-        """
-        Set the snapshot to read from. Writes will raise ReadOnlyImage
-        while a snapshot is set. Pass None to unset the snapshot
-        (reads come from the current image) , and allow writing again.
-
-        :param name: the snapshot to read from, or None to unset the snapshot
-        :type name: str or None
-        """
-        if name is not None and not isinstance(name, str_type):
-            raise TypeError('name must be a string')
-        ret = self.librbd.rbd_snap_set(self.image, cstr(name))
-        if ret != 0:
-            raise make_ex(ret, 'error setting image %s to snapshot %s' % (self.name, name))
-
-    def read(self, offset, length, fadvise_flags=0):
-        """
-        Read data from the image. Raises :class:`InvalidArgument` if
-        part of the range specified is outside the image.
-
-        :param offset: the offset to start reading at
-        :type offset: int
-        :param length: how many bytes to read
-        :type length: int
-        :param fadvise_flags: fadvise flags for this read
-        :type fadvise_flags: int
-        :returns: str - the data read
-        :raises: :class:`InvalidArgument`, :class:`IOError`
-        """
-        ret_buf = create_string_buffer(length)
-        if fadvise_flags == 0:
-            ret = self.librbd.rbd_read(self.image, c_uint64(offset),
-                                       c_size_t(length), byref(ret_buf))
-        else:
-            ret = self.librbd.rbd_read2(self.image, c_uint64(offset),
-                                        c_size_t(length), byref(ret_buf),
-                                        c_int(fadvise_flags))
-        if ret < 0:
-            raise make_ex(ret, 'error reading %s %ld~%ld' % (self.image, offset, length))
-
-        return ctypes.string_at(ret_buf, ret)
-
-    def diff_iterate(self, offset, length, from_snapshot, iterate_cb,
-                     include_parent = True, whole_object = False):
-        """
-        Iterate over the changed extents of an image.
-
-        This will call iterate_cb with three arguments:
-
-        (offset, length, exists)
-
-        where the changed extent starts at offset bytes, continues for
-        length bytes, and is full of data (if exists is True) or zeroes
-        (if exists is False).
-
-        If from_snapshot is None, it is interpreted as the beginning
-        of time and this generates all allocated extents.
-
-        The end version is whatever is currently selected (via set_snap)
-        for the image.
-
-        Raises :class:`InvalidArgument` if from_snapshot is after
-        the currently set snapshot.
-
-        Raises :class:`ImageNotFound` if from_snapshot is not the name
-        of a snapshot of the image.
-
-        :param offset: start offset in bytes
-        :type offset: int
-        :param length: size of region to report on, in bytes
-        :type length: int
-        :param from_snapshot: starting snapshot name, or None
-        :type from_snapshot: str or None
-        :param iterate_cb: function to call for each extent
-        :type iterate_cb: function acception arguments for offset,
-                           length, and exists
-        :param include_parent: True if full history diff should include parent
-        :type include_parent: bool
-        :param whole_object: True if diff extents should cover whole object
-        :type whole_object: bool
-        :raises: :class:`InvalidArgument`, :class:`IOError`,
-                 :class:`ImageNotFound`
-        """
-        if from_snapshot is not None and not isinstance(from_snapshot, str_type):
-            raise TypeError('client must be a string')
-
-        RBD_DIFF_CB = CFUNCTYPE(c_int, c_uint64, c_size_t, c_int, c_void_p)
-        cb_holder = DiffIterateCB(iterate_cb)
-        cb = RBD_DIFF_CB(cb_holder.callback)
-        ret = self.librbd.rbd_diff_iterate2(self.image,
-                                            cstr(from_snapshot),
-                                            c_uint64(offset),
-                                            c_uint64(length),
-                                            c_uint8(include_parent),
-                                            c_uint8(whole_object),
-                                            cb,
-                                            c_void_p(None))
-        if ret < 0:
-            msg = 'error generating diff from snapshot %s' % from_snapshot
-            raise make_ex(ret, msg)
-
-    def write(self, data, offset, fadvise_flags=0):
-        """
-        Write data to the image. Raises :class:`InvalidArgument` if
-        part of the write would fall outside the image.
-
-        :param data: the data to be written
-        :type data: bytes
-        :param offset: where to start writing data
-        :type offset: int
-        :param fadvise_flags: fadvise flags for this write
-        :type fadvise_flags: int
-        :returns: int - the number of bytes written
-        :raises: :class:`IncompleteWriteError`, :class:`LogicError`,
-                 :class:`InvalidArgument`, :class:`IOError`
-        """
-        if not isinstance(data, bytes):
-            raise TypeError('data must be a byte string')
-        length = len(data)
-
-        if fadvise_flags == 0:
-            ret = self.librbd.rbd_write(self.image, c_uint64(offset),
-                                        c_size_t(length), c_char_p(data))
-        else:
-            ret = self.librbd.rbd_write2(self.image, c_uint64(offset),
-                                         c_size_t(length), c_char_p(data),
-                                         c_int(fadvise_flags))
-
-        if ret == length:
-            return ret
-        elif ret < 0:
-            raise make_ex(ret, "error writing to %s" % (self.name,))
-        elif ret < length:
-            raise IncompleteWriteError("Wrote only %ld out of %ld bytes" % (ret, length))
-        else:
-            raise LogicError("logic error: rbd_write(%s) \
-returned %d, but %d was the maximum number of bytes it could have \
-written." % (self.name, ret, length))
-
-    def discard(self, offset, length):
-        """
-        Trim the range from the image. It will be logically filled
-        with zeroes.
-        """
-        ret = self.librbd.rbd_discard(self.image,
-                                      c_uint64(offset),
-                                      c_uint64(length))
-        if ret < 0:
-            msg = 'error discarding region %d~%d' % (offset, length)
-            raise make_ex(ret, msg)
-
-    def flush(self):
-        """
-        Block until all writes are fully flushed if caching is enabled.
-        """
-        ret = self.librbd.rbd_flush(self.image)
-        if ret < 0:
-            raise make_ex(ret, 'error flushing image')
-
-    def invalidate_cache(self):
-        """
-        Drop any cached data for the image.
-        """
-        ret = self.librbd.rbd_invalidate_cache(self.image)
-        if ret < 0:
-            raise make_ex(ret, 'error invalidating cache')
-
-    def stripe_unit(self):
-        """
-        Returns the stripe unit used for the image.
-        """
-        stripe_unit = c_uint64()
-        ret = self.librbd.rbd_get_stripe_unit(self.image, byref(stripe_unit))
-        if ret != 0:
-            raise make_ex(ret, 'error getting stripe unit for image' % (self.name))
-        return stripe_unit.value
-
-    def stripe_count(self):
-        """
-        Returns the stripe count used for the image.
-        """
-        stripe_count = c_uint64()
-        ret = self.librbd.rbd_get_stripe_count(self.image, byref(stripe_count))
-        if ret != 0:
-            raise make_ex(ret, 'error getting stripe count for image' % (self.name))
-        return stripe_count.value
-
-    def flatten(self):
-        """
-        Flatten clone image (copy all blocks from parent to child)
-        """
-        ret = self.librbd.rbd_flatten(self.image)
-        if ret < 0:
-            raise make_ex(ret, "error flattening %s" % self.name)
-
-    def list_children(self):
-        """
-        List children of the currently set snapshot (set via set_snap()).
-
-        :returns: list - a list of (pool name, image name) tuples
-        """
-        pools_size = c_size_t(512)
-        images_size = c_size_t(512)
-        while True:
-            c_pools = create_string_buffer(pools_size.value)
-            c_images = create_string_buffer(images_size.value)
-            ret = self.librbd.rbd_list_children(self.image,
-                                                byref(c_pools),
-                                                byref(pools_size),
-                                                byref(c_images),
-                                                byref(images_size))
-            if ret >= 0:
-                break
-            elif ret != -errno.ERANGE:
-                raise make_ex(ret, 'error listing images')
-        if ret == 0:
-            return []
-        pools = map(decode_cstr, c_pools.raw[:pools_size.value - 1].split(b'\0'))
-        images = map(decode_cstr, c_images.raw[:images_size.value - 1].split(b'\0'))
-        return list(zip(pools, images))
-
-    def list_lockers(self):
-        """
-        List clients that have locked the image and information
-        about the lock.
-
-        :returns: dict - contains the following keys:
-
-                  * ``tag`` - the tag associated with the lock (every
-                    additional locker must use the same tag)
-                  * ``exclusive`` - boolean indicating whether the
-                     lock is exclusive or shared
-                  * ``lockers`` - a list of (client, cookie, address)
-                    tuples
-        """
-        clients_size = c_size_t(512)
-        cookies_size = c_size_t(512)
-        addrs_size = c_size_t(512)
-        tag_size = c_size_t(512)
-        exclusive = c_int(0)
-
-        while True:
-            c_clients = create_string_buffer(clients_size.value)
-            c_cookies = create_string_buffer(cookies_size.value)
-            c_addrs = create_string_buffer(addrs_size.value)
-            c_tag = create_string_buffer(tag_size.value)
-            ret = self.librbd.rbd_list_lockers(self.image,
-                                               byref(exclusive),
-                                               byref(c_tag),
-                                               byref(tag_size),
-                                               byref(c_clients),
-                                               byref(clients_size),
-                                               byref(c_cookies),
-                                               byref(cookies_size),
-                                               byref(c_addrs),
-                                               byref(addrs_size))
-            if ret >= 0:
-                break
-            elif ret != -errno.ERANGE:
-                raise make_ex(ret, 'error listing images')
-        if ret == 0:
-            return []
-        clients = [client.decode("utf-8") for client in c_clients.raw[:clients_size.value - 1].split(b'\0')]
-        cookies = [cookie.decode("utf-8") for cookie in c_cookies.raw[:cookies_size.value - 1].split(b'\0')]
-        addrs = [addr.decode("utf-8") for addr in c_addrs.raw[:addrs_size.value - 1].split(b'\0')]
-        return {
-            'tag'       : decode_cstr(c_tag),
-            'exclusive' : exclusive.value == 1,
-            'lockers'   : list(zip(clients, cookies, addrs)),
-            }
-
-    def lock_exclusive(self, cookie):
-        """
-        Take an exclusive lock on the image.
-
-        :raises: :class:`ImageBusy` if a different client or cookie locked it
-                 :class:`ImageExists` if the same client and cookie locked it
-        """
-        if not isinstance(cookie, str_type):
-            raise TypeError('cookie must be a string')
-        ret = self.librbd.rbd_lock_exclusive(self.image, cstr(cookie))
-        if ret < 0:
-            raise make_ex(ret, 'error acquiring exclusive lock on image')
-
-    def lock_shared(self, cookie, tag):
-        """
-        Take a shared lock on the image. The tag must match
-        that of the existing lockers, if any.
-
-        :raises: :class:`ImageBusy` if a different client or cookie locked it
-                 :class:`ImageExists` if the same client and cookie locked it
-        """
-        if not isinstance(cookie, str_type):
-            raise TypeError('cookie must be a string')
-        if not isinstance(tag, str_type):
-            raise TypeError('tag must be a string')
-        ret = self.librbd.rbd_lock_shared(self.image, cstr(cookie),
-                                          cstr(tag))
-        if ret < 0:
-            raise make_ex(ret, 'error acquiring shared lock on image')
-
-    def unlock(self, cookie):
-        """
-        Release a lock on the image that was locked by this rados client.
-        """
-        if not isinstance(cookie, str_type):
-            raise TypeError('cookie must be a string')
-        ret = self.librbd.rbd_unlock(self.image, cstr(cookie))
-        if ret < 0:
-            raise make_ex(ret, 'error unlocking image')
-
-    def break_lock(self, client, cookie):
-        """
-        Release a lock held by another rados client.
-        """
-        if not isinstance(client, str_type):
-            raise TypeError('client must be a string')
-        if not isinstance(cookie, str_type):
-            raise TypeError('cookie must be a string')
-        ret = self.librbd.rbd_break_lock(self.image, cstr(client),
-                                         cstr(cookie))
-        if ret < 0:
-            raise make_ex(ret, 'error unlocking image')
-
-
-class DiffIterateCB(object):
-    def __init__(self, cb):
-        self.cb = cb
-
-    def callback(self, offset, length, exists, unused):
-        self.cb(offset, length, exists == 1)
-        return 0
-
-
-class SnapIterator(Iterable):
-    """
-    Iterator over snapshot info for an image.
-
-    Yields a dictionary containing information about a snapshot.
-
-    Keys are:
-
-    * ``id`` (int) - numeric identifier of the snapshot
-
-    * ``size`` (int) - size of the image at the time of snapshot (in bytes)
-
-    * ``name`` (str) - name of the snapshot
-    """
-    def __init__(self, image):
-        self.librbd = image.librbd
-        num_snaps = c_int(10)
-        while True:
-            self.snaps = (rbd_snap_info_t * num_snaps.value)()
-            ret = self.librbd.rbd_snap_list(image.image, byref(self.snaps),
-                                            byref(num_snaps))
-            if ret >= 0:
-                self.num_snaps = ret
-                break
-            elif ret != -errno.ERANGE:
-                raise make_ex(ret, 'error listing snapshots for image %s' % (image.name,))
-
-    def __iter__(self):
-        for i in range(self.num_snaps):
-            yield {
-                'id'   : self.snaps[i].id,
-                'size' : self.snaps[i].size,
-                'name' : decode_cstr(self.snaps[i].name),
-                }
-
-    def __del__(self):
-        self.librbd.rbd_snap_list_end(self.snaps)
diff --git a/src/pybind/rbd.pyx b/src/pybind/rbd.pyx
new file mode 100644
index 0000000..9318997
--- /dev/null
+++ b/src/pybind/rbd.pyx
@@ -0,0 +1,1426 @@
+# cython: embedsignature=True
+"""
+This module is a thin wrapper around librbd.
+
+It currently provides all the synchronous methods of librbd that do
+not use callbacks.
+
+Error codes from librbd are turned into exceptions that subclass
+:class:`Error`. Almost all methods may raise :class:`Error`
+(the base class of all rbd exceptions), :class:`PermissionError`
+and :class:`IOError`, in addition to those documented for the
+method.
+"""
+# Copyright 2011 Josh Durgin
+# Copyright 2015 Hector Martin <marcan at marcan.st>
+
+from cpython cimport PyObject, ref, exc
+from libc cimport errno
+from libc.stdint cimport *
+from libc.stdlib cimport realloc, free
+
+from collections import Iterable
+
+cdef extern from "Python.h":
+    # These are in cpython/string.pxd, but use "object" types instead of
+    # PyObject*, which invokes assumptions in cpython that we need to
+    # legitimately break to implement zero-copy string buffers in Image.read().
+    # This is valid use of the Python API and documented as a special case.
+    PyObject *PyBytes_FromStringAndSize(char *v, Py_ssize_t len) except NULL
+    char* PyBytes_AsString(PyObject *string) except NULL
+    int _PyBytes_Resize(PyObject **string, Py_ssize_t newsize) except -1
+
+cdef extern from "rbd/librbd.h" nogil:
+    enum:
+        _RBD_FEATURE_LAYERING "RBD_FEATURE_LAYERING"
+        _RBD_FEATURE_STRIPINGV2 "RBD_FEATURE_STRIPINGV2"
+        _RBD_FEATURE_EXCLUSIVE_LOCK "RBD_FEATURE_EXCLUSIVE_LOCK"
+        _RBD_FEATURE_OBJECT_MAP "RBD_FEATURE_OBJECT_MAP"
+        _RBD_FEATURE_FAST_DIFF "RBD_FEATURE_FAST_DIFF"
+        _RBD_FEATURE_DEEP_FLATTEN "RBD_FEATURE_DEEP_FLATTEN"
+        _RBD_FEATURE_JOURNALING "RBD_FEATURE_JOURNALING"
+
+        _RBD_FEATURES_INCOMPATIBLE "RBD_FEATURES_INCOMPATIBLE"
+        _RBD_FEATURES_RW_INCOMPATIBLE "RBD_FEATURES_RW_INCOMPATIBLE"
+        _RBD_FEATURES_MUTABLE "RBD_FEATURES_MUTABLE"
+        _RBD_FEATURES_SINGLE_CLIENT "RBD_FEATURES_SINGLE_CLIENT"
+        _RBD_FEATURES_ALL "RBD_FEATURES_ALL"
+
+        _RBD_FLAG_OBJECT_MAP_INVALID "RBD_FLAG_OBJECT_MAP_INVALID"
+        _RBD_FLAG_FAST_DIFF_INVALID "RBD_FLAG_FAST_DIFF_INVALID"
+
+        _RBD_IMAGE_OPTION_FORMAT "RBD_IMAGE_OPTION_FORMAT"
+        _RBD_IMAGE_OPTION_FEATURES "RBD_IMAGE_OPTION_FEATURES"
+        _RBD_IMAGE_OPTION_ORDER "RBD_IMAGE_OPTION_ORDER"
+        _RBD_IMAGE_OPTION_STRIPE_UNIT "RBD_IMAGE_OPTION_STRIPE_UNIT"
+        _RBD_IMAGE_OPTION_STRIPE_COUNT "RBD_IMAGE_OPTION_STRIPE_COUNT"
+
+        RBD_MAX_BLOCK_NAME_SIZE
+        RBD_MAX_IMAGE_NAME_SIZE
+
+    ctypedef void* rados_ioctx_t
+    ctypedef void* rbd_image_t
+    ctypedef void* rbd_image_options_t
+
+    ctypedef struct rbd_image_info_t:
+        uint64_t size
+        uint64_t obj_size
+        uint64_t num_objs
+        int order
+        char block_name_prefix[RBD_MAX_BLOCK_NAME_SIZE]
+        uint64_t parent_pool
+        char parent_name[RBD_MAX_IMAGE_NAME_SIZE]
+
+    ctypedef struct rbd_snap_info_t:
+        uint64_t id
+        uint64_t size
+        char *name
+
+
+    void rbd_version(int *major, int *minor, int *extra)
+
+    void rbd_image_options_create(rbd_image_options_t* opts)
+    void rbd_image_options_destroy(rbd_image_options_t opts)
+    int rbd_image_options_set_string(rbd_image_options_t opts, int optname,
+                                     const char* optval)
+    int rbd_image_options_set_uint64(rbd_image_options_t opts, int optname,
+                                     uint64_t optval)
+    int rbd_image_options_get_string(rbd_image_options_t opts, int optname,
+                                     char* optval, size_t maxlen)
+    int rbd_image_options_get_uint64(rbd_image_options_t opts, int optname,
+                                     uint64_t* optval)
+    int rbd_image_options_unset(rbd_image_options_t opts, int optname)
+    void rbd_image_options_clear(rbd_image_options_t opts)
+    int rbd_image_options_is_empty(rbd_image_options_t opts)
+
+    int rbd_list(rados_ioctx_t io, char *names, size_t *size)
+    int rbd_create(rados_ioctx_t io, const char *name, uint64_t size,
+                   int *order)
+    int rbd_create4(rados_ioctx_t io, const char *name, uint64_t size,
+                    rbd_image_options_t opts)
+    int rbd_clone3(rados_ioctx_t p_ioctx, const char *p_name,
+                   const char *p_snapname, rados_ioctx_t c_ioctx,
+                   const char *c_name, rbd_image_options_t c_opts)
+    int rbd_remove(rados_ioctx_t io, const char *name)
+    int rbd_rename(rados_ioctx_t src_io_ctx, const char *srcname,
+                   const char *destname)
+    int rbd_open(rados_ioctx_t io, const char *name,
+                 rbd_image_t *image, const char *snap_name)
+    int rbd_open_read_only(rados_ioctx_t io, const char *name,
+                           rbd_image_t *image, const char *snap_name)
+    int rbd_close(rbd_image_t image)
+    int rbd_resize(rbd_image_t image, uint64_t size)
+    int rbd_stat(rbd_image_t image, rbd_image_info_t *info, size_t infosize)
+    int rbd_get_old_format(rbd_image_t image, uint8_t *old)
+    int rbd_get_size(rbd_image_t image, uint64_t *size)
+    int rbd_get_features(rbd_image_t image, uint64_t *features)
+    int rbd_update_features(rbd_image_t image, uint64_t features,
+                            uint8_t enabled)
+    int rbd_get_stripe_unit(rbd_image_t image, uint64_t *stripe_unit)
+    int rbd_get_stripe_count(rbd_image_t image, uint64_t *stripe_count)
+    int rbd_get_overlap(rbd_image_t image, uint64_t *overlap)
+    int rbd_get_parent_info(rbd_image_t image,
+                            char *parent_poolname, size_t ppoolnamelen,
+                            char *parent_name, size_t pnamelen,
+                            char *parent_snapname, size_t psnapnamelen)
+    int rbd_get_flags(rbd_image_t image, uint64_t *flags)
+    int rbd_is_exclusive_lock_owner(rbd_image_t image, int *is_owner)
+    ssize_t rbd_read2(rbd_image_t image, uint64_t ofs, size_t len,
+                      char *buf, int op_flags)
+    ssize_t rbd_write2(rbd_image_t image, uint64_t ofs, size_t len,
+                       const char *buf, int op_flags)
+    int rbd_discard(rbd_image_t image, uint64_t ofs, uint64_t len)
+    int rbd_copy3(rbd_image_t src, rados_ioctx_t dest_io_ctx,
+                  const char *destname, rbd_image_options_t dest_opts)
+    int rbd_snap_list(rbd_image_t image, rbd_snap_info_t *snaps,
+                      int *max_snaps)
+    void rbd_snap_list_end(rbd_snap_info_t *snaps)
+    int rbd_snap_create(rbd_image_t image, const char *snapname)
+    int rbd_snap_remove(rbd_image_t image, const char *snapname)
+    int rbd_snap_rollback(rbd_image_t image, const char *snapname)
+    int rbd_snap_rename(rbd_image_t image, const char *snapname,
+                        const char* dstsnapsname)
+    int rbd_snap_protect(rbd_image_t image, const char *snap_name)
+    int rbd_snap_unprotect(rbd_image_t image, const char *snap_name)
+    int rbd_snap_is_protected(rbd_image_t image, const char *snap_name,
+                              int *is_protected)
+    int rbd_snap_set(rbd_image_t image, const char *snapname)
+    int rbd_flatten(rbd_image_t image)
+    ssize_t rbd_list_children(rbd_image_t image, char *pools, size_t *pools_len,
+                              char *images, size_t *images_len)
+    ssize_t rbd_list_lockers(rbd_image_t image, int *exclusive,
+                             char *tag, size_t *tag_len,
+                             char *clients, size_t *clients_len,
+                             char *cookies, size_t *cookies_len,
+                             char *addrs, size_t *addrs_len)
+    int rbd_lock_exclusive(rbd_image_t image, const char *cookie)
+    int rbd_lock_shared(rbd_image_t image, const char *cookie,
+                        const char *tag)
+    int rbd_unlock(rbd_image_t image, const char *cookie)
+    int rbd_break_lock(rbd_image_t image, const char *client,
+                       const char *cookie)
+
+    # We use -9000 to propagate Python exceptions. We use except? to make sure
+    # things still work as intended if -9000 happens to be a valid errno value
+    # somewhere.
+    int rbd_diff_iterate2(rbd_image_t image, const char *fromsnapname,
+                         uint64_t ofs, uint64_t len,
+                         uint8_t include_parent, uint8_t whole_object,
+                         int (*cb)(uint64_t, size_t, int, void *)
+                             nogil except? -9000,
+                         void *arg) except? -9000
+
+    int rbd_flush(rbd_image_t image)
+    int rbd_invalidate_cache(rbd_image_t image)
+
+
+RBD_FEATURE_LAYERING = _RBD_FEATURE_LAYERING
+RBD_FEATURE_STRIPINGV2 = _RBD_FEATURE_STRIPINGV2
+RBD_FEATURE_EXCLUSIVE_LOCK = _RBD_FEATURE_EXCLUSIVE_LOCK
+RBD_FEATURE_OBJECT_MAP = _RBD_FEATURE_OBJECT_MAP
+RBD_FEATURE_FAST_DIFF = _RBD_FEATURE_FAST_DIFF
+RBD_FEATURE_DEEP_FLATTEN = _RBD_FEATURE_DEEP_FLATTEN
+RBD_FEATURE_JOURNALING = _RBD_FEATURE_JOURNALING
+
+RBD_FEATURES_INCOMPATIBLE = _RBD_FEATURES_INCOMPATIBLE
+RBD_FEATURES_RW_INCOMPATIBLE = _RBD_FEATURES_RW_INCOMPATIBLE
+RBD_FEATURES_MUTABLE = _RBD_FEATURES_MUTABLE
+RBD_FEATURES_SINGLE_CLIENT = _RBD_FEATURES_SINGLE_CLIENT
+RBD_FEATURES_ALL = _RBD_FEATURES_ALL
+
+RBD_FLAG_OBJECT_MAP_INVALID = _RBD_FLAG_OBJECT_MAP_INVALID
+RBD_FLAG_FAST_DIFF_INVALID = _RBD_FLAG_FAST_DIFF_INVALID
+
+RBD_IMAGE_OPTION_FORMAT = _RBD_IMAGE_OPTION_FORMAT
+RBD_IMAGE_OPTION_FEATURES = _RBD_IMAGE_OPTION_FEATURES
+RBD_IMAGE_OPTION_ORDER = _RBD_IMAGE_OPTION_ORDER
+RBD_IMAGE_OPTION_STRIPE_UNIT = _RBD_IMAGE_OPTION_STRIPE_UNIT
+RBD_IMAGE_OPTION_STRIPE_COUNT = _RBD_IMAGE_OPTION_STRIPE_COUNT
+
+class Error(Exception):
+    pass
+
+
+class PermissionError(Error):
+    pass
+
+
+class ImageNotFound(Error):
+    pass
+
+
+class ImageExists(Error):
+    pass
+
+
+class IOError(Error):
+    pass
+
+
+class NoSpace(Error):
+    pass
+
+
+class IncompleteWriteError(Error):
+    pass
+
+
+class InvalidArgument(Error):
+    pass
+
+
+class LogicError(Error):
+    pass
+
+
+class ReadOnlyImage(Error):
+    pass
+
+
+class ImageBusy(Error):
+    pass
+
+
+class ImageHasSnapshots(Error):
+    pass
+
+
+class FunctionNotSupported(Error):
+    pass
+
+
+class ArgumentOutOfRange(Error):
+    pass
+
+
+class ConnectionShutdown(Error):
+    pass
+
+
+class Timeout(Error):
+    pass
+
+
+cdef errno_to_exception = {
+    errno.EPERM     : PermissionError,
+    errno.ENOENT    : ImageNotFound,
+    errno.EIO       : IOError,
+    errno.ENOSPC    : NoSpace,
+    errno.EEXIST    : ImageExists,
+    errno.EINVAL    : InvalidArgument,
+    errno.EROFS     : ReadOnlyImage,
+    errno.EBUSY     : ImageBusy,
+    errno.ENOTEMPTY : ImageHasSnapshots,
+    errno.ENOSYS    : FunctionNotSupported,
+    errno.EDOM      : ArgumentOutOfRange,
+    errno.ESHUTDOWN : ConnectionShutdown,
+    errno.ETIMEDOUT : Timeout,
+}
+
+cdef make_ex(ret, msg):
+    """
+    Translate a librbd return code into an exception.
+
+    :param ret: the return code
+    :type ret: int
+    :param msg: the error message to use
+    :type msg: str
+    :returns: a subclass of :class:`Error`
+    """
+    ret = abs(ret)
+    if ret in errno_to_exception:
+        return errno_to_exception[ret](msg)
+    else:
+        return Error(msg + (": error code %d" % ret))
+
+cdef rados_ioctx_t convert_ioctx(ioctx) except? NULL:
+    return <rados_ioctx_t><uintptr_t>ioctx.io.value
+
+def cstr(val, name, encoding="utf-8", opt=False):
+    """
+    Create a byte string from a Python string
+
+    :param basestring val: Python string
+    :param str name: Name of the string parameter, for exceptions
+    :param str encoding: Encoding to use
+    :param bool opt: If True, None is allowed
+    :rtype: bytes
+    :raises: :class:`InvalidArgument`
+    """
+    if opt and val is None:
+        return None
+    if isinstance(val, bytes):
+        return val
+    elif isinstance(val, unicode):
+        return val.encode(encoding)
+    else:
+        raise InvalidArgument('%s must be a string' % name)
+
+def decode_cstr(val, encoding="utf-8"):
+    """
+    Decode a byte string into a Python string.
+
+    :param bytes val: byte string
+    :rtype: unicode or None
+    """
+    if val is None:
+        return None
+
+    return val.decode(encoding)
+
+
+cdef char* opt_str(s) except? NULL:
+    if s is None:
+        return NULL
+    return s
+
+cdef void* realloc_chk(void* ptr, size_t size) except NULL:
+    cdef void *ret = realloc(ptr, size)
+    if ret == NULL:
+        raise MemoryError("realloc failed")
+    return ret
+
+class RBD(object):
+    """
+    This class wraps librbd CRUD functions.
+    """
+    def version(self):
+        """
+        Get the version number of the ``librbd`` C library.
+
+        :returns: a tuple of ``(major, minor, extra)`` components of the
+                  librbd version
+        """
+        cdef int major = 0
+        cdef int minor = 0
+        cdef int extra = 0
+        rbd_version(&major, &minor, &extra)
+        return (major, minor, extra)
+
+    def create(self, ioctx, name, size, order=None, old_format=True,
+               features=0, stripe_unit=0, stripe_count=0):
+        """
+        Create an rbd image.
+
+        :param ioctx: the context in which to create the image
+        :type ioctx: :class:`rados.Ioctx`
+        :param name: what the image is called
+        :type name: str
+        :param size: how big the image is in bytes
+        :type size: int
+        :param order: the image is split into (2**order) byte objects
+        :type order: int
+        :param old_format: whether to create an old-style image that
+                           is accessible by old clients, but can't
+                           use more advanced features like layering.
+        :type old_format: bool
+        :param features: bitmask of features to enable
+        :type features: int
+        :param stripe_unit: stripe unit in bytes (default 0 for object size)
+        :type stripe_unit: int
+        :param stripe_count: objects to stripe over before looping
+        :type stripe_count: int
+        :raises: :class:`ImageExists`
+        :raises: :class:`TypeError`
+        :raises: :class:`InvalidArgument`
+        :raises: :class:`FunctionNotSupported`
+        """
+        name = cstr(name, 'name')
+        cdef:
+            rados_ioctx_t _ioctx = convert_ioctx(ioctx)
+            char *_name = name
+            uint64_t _size = size
+            int _order = 0
+            rbd_image_options_t opts
+        if order is not None:
+            _order = order
+        if old_format:
+            if features != 0 or stripe_unit != 0 or stripe_count != 0:
+                raise InvalidArgument('format 1 images do not support feature'
+                                      ' masks or non-default striping')
+            with nogil:
+                ret = rbd_create(_ioctx, _name, _size, &_order)
+        else:
+            rbd_image_options_create(&opts)
+            try:
+                rbd_image_options_set_uint64(opts, RBD_IMAGE_OPTION_FORMAT,
+                                             1 if old_format else 2)
+                rbd_image_options_set_uint64(opts, RBD_IMAGE_OPTION_FEATURES,
+                                             features)
+                rbd_image_options_set_uint64(opts, RBD_IMAGE_OPTION_ORDER,
+                                             _order)
+                rbd_image_options_set_uint64(opts, RBD_IMAGE_OPTION_STRIPE_UNIT,
+                                             stripe_unit)
+                rbd_image_options_set_uint64(opts,
+                                             RBD_IMAGE_OPTION_STRIPE_COUNT,
+                                             stripe_count)
+                with nogil:
+                    ret = rbd_create4(_ioctx, _name, _size, opts)
+            finally:
+                rbd_image_options_destroy(opts)
+        if ret < 0:
+            raise make_ex(ret, 'error creating image')
+
+    def clone(self, p_ioctx, p_name, p_snapname, c_ioctx, c_name,
+              features=0, order=None, stripe_unit=0, stripe_count=0):
+        """
+        Clone a parent rbd snapshot into a COW sparse child.
+
+        :param p_ioctx: the parent context that represents the parent snap
+        :type p_ioctx: :class:`rados.Ioctx`
+        :param p_name: the parent image name
+        :type p_name: str
+        :param p_snapname: the parent image snapshot name
+        :type p_snapname: str
+        :param c_ioctx: the child context that represents the new clone
+        :type c_ioctx: :class:`rados.Ioctx`
+        :param c_name: the clone (child) name
+        :type c_name: str
+        :param features: bitmask of features to enable; if set, must include layering
+        :type features: int
+        :param order: the image is split into (2**order) byte objects
+        :type order: int
+        :param stripe_unit: stripe unit in bytes (default 0 for object size)
+        :type stripe_unit: int
+        :param stripe_count: objects to stripe over before looping
+        :type stripe_count: int
+        :raises: :class:`TypeError`
+        :raises: :class:`InvalidArgument`
+        :raises: :class:`ImageExists`
+        :raises: :class:`FunctionNotSupported`
+        :raises: :class:`ArgumentOutOfRange`
+        """
+        p_snapname = cstr(p_snapname, 'p_snapname')
+        p_name = cstr(p_name, 'p_name')
+        c_name = cstr(c_name, 'c_name')
+        cdef:
+            rados_ioctx_t _p_ioctx = convert_ioctx(p_ioctx)
+            rados_ioctx_t _c_ioctx = convert_ioctx(c_ioctx)
+            char *_p_name = p_name
+            char *_p_snapname = p_snapname
+            char *_c_name = c_name
+            rbd_image_options_t opts
+        if order is None:
+            order = 0
+
+        rbd_image_options_create(&opts)
+        try:
+            rbd_image_options_set_uint64(opts, RBD_IMAGE_OPTION_FEATURES,
+                                         features)
+            rbd_image_options_set_uint64(opts, RBD_IMAGE_OPTION_ORDER,
+                                         order)
+            rbd_image_options_set_uint64(opts, RBD_IMAGE_OPTION_STRIPE_UNIT,
+                                         stripe_unit)
+            rbd_image_options_set_uint64(opts, RBD_IMAGE_OPTION_STRIPE_COUNT,
+                                         stripe_count)
+            with nogil:
+                ret = rbd_clone3(_p_ioctx, _p_name, _p_snapname,
+                                 _c_ioctx, _c_name, opts)
+        finally:
+            rbd_image_options_destroy(opts)
+        if ret < 0:
+            raise make_ex(ret, 'error creating clone')
+
+    def list(self, ioctx):
+        """
+        List image names.
+
+        :param ioctx: determines which RADOS pool is read
+        :type ioctx: :class:`rados.Ioctx`
+        :returns: list -- a list of image names
+        """
+        cdef:
+            rados_ioctx_t _ioctx = convert_ioctx(ioctx)
+            size_t size = 512
+            char *c_names = NULL
+        try:
+            while True:
+                c_names = <char *>realloc_chk(c_names, size)
+                with nogil:
+                    ret = rbd_list(_ioctx, c_names, &size)
+                if ret >= 0:
+                    break
+                elif ret != -errno.ERANGE:
+                    raise make_ex(ret, 'error listing images')
+            return [decode_cstr(name) for name in c_names[:ret].split(b'\0')
+                    if name]
+        finally:
+            free(c_names)
+
+    def remove(self, ioctx, name):
+        """
+        Delete an RBD image. This may take a long time, since it does
+        not return until every object that comprises the image has
+        been deleted. Note that all snapshots must be deleted before
+        the image can be removed. If there are snapshots left,
+        :class:`ImageHasSnapshots` is raised. If the image is still
+        open, or the watch from a crashed client has not expired,
+        :class:`ImageBusy` is raised.
+
+        :param ioctx: determines which RADOS pool the image is in
+        :type ioctx: :class:`rados.Ioctx`
+        :param name: the name of the image to remove
+        :type name: str
+        :raises: :class:`ImageNotFound`, :class:`ImageBusy`,
+                 :class:`ImageHasSnapshots`
+        """
+        name = cstr(name, 'name')
+        cdef:
+            rados_ioctx_t _ioctx = convert_ioctx(ioctx)
+            char *_name = name
+        with nogil:
+            ret = rbd_remove(_ioctx, _name)
+        if ret != 0:
+            raise make_ex(ret, 'error removing image')
+
+    def rename(self, ioctx, src, dest):
+        """
+        Rename an RBD image.
+
+        :param ioctx: determines which RADOS pool the image is in
+        :type ioctx: :class:`rados.Ioctx`
+        :param src: the current name of the image
+        :type src: str
+        :param dest: the new name of the image
+        :type dest: str
+        :raises: :class:`ImageNotFound`, :class:`ImageExists`
+        """
+        src = cstr(src, 'src')
+        dest = cstr(dest, 'dest')
+        cdef:
+            rados_ioctx_t _ioctx = convert_ioctx(ioctx)
+            char *_src = src
+            char *_dest = dest
+        with nogil:
+            ret = rbd_rename(_ioctx, _src, _dest)
+        if ret != 0:
+            raise make_ex(ret, 'error renaming image')
+
+
+cdef int diff_iterate_cb(uint64_t offset, size_t length, int write, void *cb) \
+    except? -9000 with gil:
+    # Make sure that if we wound up with an exception from a previous callback,
+    # we stop calling back (just in case librbd ever fails to bail out on the
+    # first negative return, as older versions did)
+    if exc.PyErr_Occurred():
+        return -9000
+    ret = (<object>cb)(offset, length, bool(write))
+    if ret is None:
+        return 0
+    return ret
+
+
+cdef class Image(object):
+    """
+    This class represents an RBD image. It is used to perform I/O on
+    the image and interact with snapshots.
+
+    **Note**: Any method of this class may raise :class:`ImageNotFound`
+    if the image has been deleted.
+    """
+    cdef rbd_image_t image
+    cdef bint closed
+    cdef object name
+    cdef object ioctx
+    cdef rados_ioctx_t _ioctx
+
+    def __init__(self, ioctx, name, snapshot=None, read_only=False):
+        """
+        Open the image at the given snapshot.
+        If a snapshot is specified, the image will be read-only, unless
+        :func:`Image.set_snap` is called later.
+
+        If read-only mode is used, metadata for the :class:`Image`
+        object (such as which snapshots exist) may become obsolete. See
+        the C api for more details.
+
+        To clean up from opening the image, :func:`Image.close` should
+        be called.  For ease of use, this is done automatically when
+        an :class:`Image` is used as a context manager (see :pep:`343`).
+
+        :param ioctx: determines which RADOS pool the image is in
+        :type ioctx: :class:`rados.Ioctx`
+        :param name: the name of the image
+        :type name: str
+        :param snapshot: which snapshot to read from
+        :type snapshot: str
+        :param read_only: whether to open the image in read-only mode
+        :type read_only: bool
+        """
+        name = cstr(name, 'name')
+        snapshot = cstr(snapshot, 'snapshot', opt=True)
+        self.closed = True
+        self.name = name
+        # Keep around a reference to the ioctx, so it won't get deleted
+        self.ioctx = ioctx
+        cdef:
+            rados_ioctx_t _ioctx = convert_ioctx(ioctx)
+            char *_name = name
+            char *_snapshot = opt_str(snapshot)
+        if read_only:
+            with nogil:
+                ret = rbd_open_read_only(_ioctx, _name, &self.image, _snapshot)
+        else:
+            with nogil:
+                ret = rbd_open(_ioctx, _name, &self.image, _snapshot)
+        if ret != 0:
+            raise make_ex(ret, 'error opening image %s at snapshot %s' % (name, snapshot))
+        self.closed = False
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, type_, value, traceback):
+        """
+        Closes the image. See :func:`close`
+        """
+        self.close()
+        return False
+
+    def close(self):
+        """
+        Release the resources used by this image object.
+
+        After this is called, this object should not be used.
+        """
+        if not self.closed:
+            with nogil:
+                ret = rbd_close(self.image)
+            if ret < 0:
+                raise make_ex(ret, 'error while closing image %s' % (
+                              self.name,))
+            self.closed = True
+
+    def __dealloc__(self):  # Cython < 3 never invokes __del__ on a cdef class
+        if self.image != NULL: self.close()  # NULL when __init__ failed early
+
+    def __repr__(self):
+        return "rbd.Image(ioctx, %r)" % self.name
+
+    def resize(self, size):
+        """
+        Change the size of the image.
+
+        :param size: the new size of the image
+        :type size: int
+        """
+        cdef uint64_t _size = size
+        with nogil:
+            ret = rbd_resize(self.image, _size)
+        if ret < 0:
+            raise make_ex(ret, 'error resizing image %s' % (self.name,))
+
+    def stat(self):
+        """
+        Get information about the image. Currently parent pool and
+        parent name are always -1 and ''.
+
+        :returns: dict - contains the following keys:
+
+            * ``size`` (int) - the size of the image in bytes
+
+            * ``obj_size`` (int) - the size of each object that comprises the
+              image
+
+            * ``num_objs`` (int) - the number of objects in the image
+
+            * ``order`` (int) - log_2(object_size)
+
+            * ``block_name_prefix`` (str) - the prefix of the RADOS objects used
+              to store the image
+
+            * ``parent_pool`` (int) - deprecated
+
+            * ``parent_name``  (str) - deprecated
+
+            See also :meth:`format` and :meth:`features`.
+
+        """
+        cdef rbd_image_info_t info
+        with nogil:
+            ret = rbd_stat(self.image, &info, sizeof(info))
+        if ret != 0:
+            raise make_ex(ret, 'error getting info for image %s' % (self.name,))
+        return {
+            'size'              : info.size,
+            'obj_size'          : info.obj_size,
+            'num_objs'          : info.num_objs,
+            'order'             : info.order,
+            'block_name_prefix' : decode_cstr(info.block_name_prefix),
+            'parent_pool'       : info.parent_pool,
+            'parent_name'       : info.parent_name
+            }
+
+    def parent_info(self):
+        """
+        Get information about a cloned image's parent (if any)
+
+        :returns: tuple - ``(pool name, image name, snapshot name)`` components
+                  of the parent image
+        :raises: :class:`ImageNotFound` if the image doesn't have a parent
+        """
+        cdef:
+            int ret = -errno.ERANGE
+            size_t size = 8
+            char *pool = NULL
+            char *name = NULL
+            char *snapname = NULL
+        try:
+            while ret == -errno.ERANGE and size <= 4096:
+                pool = <char *>realloc_chk(pool, size)
+                name = <char *>realloc_chk(name, size)
+                snapname = <char *>realloc_chk(snapname, size)
+                with nogil:
+                    ret = rbd_get_parent_info(self.image, pool, size, name,
+                                              size, snapname, size)
+                if ret == -errno.ERANGE:
+                    size *= 2
+
+            if ret != 0:
+                raise make_ex(ret, 'error getting parent info for image %s' % (self.name,))
+            return (decode_cstr(pool), decode_cstr(name), decode_cstr(snapname))
+        finally:
+            free(pool)
+            free(name)
+            free(snapname)
+
+    def old_format(self):
+        """
+        Find out whether the image uses the old RBD format.
+
+        :returns: bool - whether the image uses the old RBD format
+        """
+        cdef uint8_t old
+        with nogil:
+            ret = rbd_get_old_format(self.image, &old)
+        if ret != 0:
+            raise make_ex(ret, 'error getting old_format for image %s' % (self.name,))
+        return old != 0
+
+    def size(self):
+        """
+        Get the size of the image. If open to a snapshot, returns the
+        size of that snapshot.
+
+        :returns: the size of the image in bytes
+        """
+        cdef uint64_t image_size
+        with nogil:
+            ret = rbd_get_size(self.image, &image_size)
+        if ret != 0:
+            raise make_ex(ret, 'error getting size for image %s' % (self.name,))
+        return image_size
+
+    def features(self):
+        """
+        Gets the features bitmask of the image.
+
+        :returns: int - the features bitmask of the image
+        """
+        cdef uint64_t features
+        with nogil:
+            ret = rbd_get_features(self.image, &features)
+        if ret != 0:
+            raise make_ex(ret, 'error getting features for image %s' % (self.name,))
+        return features
+
+    def update_features(self, features, enabled):
+        """
+        Updates the features bitmask of the image by enabling/disabling
+        a single feature.  The feature must support the ability to be
+        dynamically enabled/disabled.
+
+        :param features: feature bitmask to enable/disable
+        :type features: int
+        :param enabled: whether to enable/disable the feature
+        :type enabled: bool
+        :raises: :class:`InvalidArgument`
+        """
+        cdef:
+            uint64_t _features = features
+            uint8_t _enabled = bool(enabled)
+        with nogil:
+            ret = rbd_update_features(self.image, _features, _enabled)
+        if ret != 0:
+            raise make_ex(ret, 'error updating features for image %s' %
+                               (self.name))
+
+    def overlap(self):
+        """
+        Gets the number of overlapping bytes between the image and its parent
+        image. If open to a snapshot, returns the overlap between the snapshot
+        and the parent image.
+
+        :returns: int - the overlap in bytes
+        :raises: :class:`ImageNotFound` if the image doesn't have a parent
+        """
+        cdef uint64_t overlap
+        with nogil:
+            ret = rbd_get_overlap(self.image, &overlap)
+        if ret != 0:
+            raise make_ex(ret, 'error getting overlap for image %s' % (self.name,))
+        return overlap
+
+    def flags(self):
+        """
+        Gets the flags bitmask of the image.
+
+        :returns: int - the flags bitmask of the image
+        """
+        cdef uint64_t flags
+        with nogil:
+            ret = rbd_get_flags(self.image, &flags)
+        if ret != 0:
+            raise make_ex(ret, 'error getting flags for image %s' % (self.name,))
+        return flags
+
+    def is_exclusive_lock_owner(self):
+        """
+        Gets the status of the image exclusive lock.
+
+        :returns: bool - true if the image is exclusively locked
+        """
+        cdef int owner
+        with nogil:
+            ret = rbd_is_exclusive_lock_owner(self.image, &owner)
+        if ret != 0:
+            raise make_ex(ret, 'error getting lock status for image %s' % (self.name,))
+        return owner == 1
+
+    def copy(self, dest_ioctx, dest_name, features=0, order=None, stripe_unit=0,
+             stripe_count=0):
+        """
+        Copy this image into a new image, possibly in another pool.
+
+        :param dest_ioctx: I/O context selecting the destination pool
+        :type dest_ioctx: :class:`rados.Ioctx`
+        :param dest_name: name to give the copy
+        :type dest_name: str
+        :param features: feature bitmask to enable; if set, must include layering
+        :type features: int
+        :param order: objects of the copy are (2**order) bytes; None is sent as 0
+        :type order: int
+        :param stripe_unit: stripe unit in bytes (default 0 for object size)
+        :type stripe_unit: int
+        :param stripe_count: objects to stripe over before looping
+        :type stripe_count: int
+        :raises: :class:`TypeError`
+        :raises: :class:`InvalidArgument`
+        :raises: :class:`ImageExists`
+        :raises: :class:`FunctionNotSupported`
+        :raises: :class:`ArgumentOutOfRange`
+        """
+        if order is None:
+            order = 0
+        dest_name = cstr(dest_name, 'dest_name')
+        cdef:
+            rados_ioctx_t c_dest_ioctx = convert_ioctx(dest_ioctx)
+            char *c_dest_name = dest_name
+            rbd_image_options_t copy_opts
+
+        rbd_image_options_create(&copy_opts)
+        try:
+            rbd_image_options_set_uint64(copy_opts, RBD_IMAGE_OPTION_FEATURES,
+                                         features)
+            rbd_image_options_set_uint64(copy_opts, RBD_IMAGE_OPTION_ORDER,
+                                         order)
+            rbd_image_options_set_uint64(copy_opts, RBD_IMAGE_OPTION_STRIPE_UNIT,
+                                         stripe_unit)
+            rbd_image_options_set_uint64(copy_opts, RBD_IMAGE_OPTION_STRIPE_COUNT,
+                                         stripe_count)
+            with nogil:
+                rc = rbd_copy3(self.image, c_dest_ioctx, c_dest_name, copy_opts)
+        finally:
+            rbd_image_options_destroy(copy_opts)
+        if rc < 0:
+            raise make_ex(rc, 'error copying image %s to %s' % (self.name, dest_name))
+
+    def list_snaps(self):
+        """Build an iterator over the snapshots of this image.
+
+        :returns: :class:`SnapIterator`
+        """
+        snapshots = SnapIterator(self)
+        return snapshots
+
+    def create_snap(self, name):
+        """
+        Take a new snapshot of this image.
+
+        :param name: name to give the snapshot
+        :type name: str
+        :raises: :class:`ImageExists`
+        """
+        name = cstr(name, 'name')
+        cdef char *snap_name = name
+        with nogil:
+            rc = rbd_snap_create(self.image, snap_name)
+        if rc != 0:
+            raise make_ex(rc, 'error creating snapshot %s from %s' % (name, self.name))
+
+    def rename_snap(self, srcname, dstname):
+        """
+        Give an existing snapshot of the image a new name.
+
+        :param srcname: current name of the snapshot
+        :type srcname: str
+        :param dstname: new name for the snapshot
+        :type dstname: str
+        :raises: :class:`ImageExists`
+        """
+        srcname = cstr(srcname, 'srcname')
+        dstname = cstr(dstname, 'dstname')
+        cdef:
+            char *c_src = srcname
+            char *c_dst = dstname
+        with nogil:
+            rc = rbd_snap_rename(self.image, c_src, c_dst)
+        if rc != 0:
+            raise make_ex(rc, 'error renaming snapshot of %s from %s to %s' % (self.name, srcname, dstname))
+
+    def remove_snap(self, name):
+        """
+        Remove one snapshot from the image.
+
+        :param name: name of the snapshot to delete
+        :type name: str
+        :raises: :class:`IOError`, :class:`ImageBusy`
+        """
+        name = cstr(name, 'name')
+        cdef char *snap_name = name
+        with nogil:
+            rc = rbd_snap_remove(self.image, snap_name)
+        if rc != 0:
+            raise make_ex(rc, 'error removing snapshot %s from %s' % (name, self.name))
+
+    def rollback_to_snap(self, name):
+        """
+        Restore the image contents to what they were at a snapshot.
+        Every object is rolled back individually, so this can be
+        an expensive operation.
+
+        :param name: name of the snapshot to roll back to
+        :type name: str
+        :raises: :class:`IOError`
+        """
+        name = cstr(name, 'name')
+        cdef char *snap_name = name
+        with nogil:
+            rc = rbd_snap_rollback(self.image, snap_name)
+        if rc != 0:
+            raise make_ex(rc, 'error rolling back image %s to snapshot %s' % (self.name, name))
+
+    def protect_snap(self, name):
+        """
+        Flag a snapshot as protected, so that it cannot be deleted
+        until it has been unprotected again.
+
+        :param name: name of the snapshot to protect
+        :type name: str
+        :raises: :class:`IOError`, :class:`ImageNotFound`
+        """
+        name = cstr(name, 'name')
+        cdef char *snap_name = name
+        with nogil:
+            rc = rbd_snap_protect(self.image, snap_name)
+        if rc != 0:
+            raise make_ex(rc, 'error protecting snapshot %s@%s' % (self.name, name))
+
+    def unprotect_snap(self, name):
+        """
+        Clear the protected flag of a snapshot, which makes a
+        previously protected snapshot deletable again.
+
+        :param name: name of the snapshot to unprotect
+        :type name: str
+        :raises: :class:`IOError`, :class:`ImageNotFound`
+        """
+        name = cstr(name, 'name')
+        cdef char *snap_name = name
+        with nogil:
+            rc = rbd_snap_unprotect(self.image, snap_name)
+        if rc != 0:
+            raise make_ex(rc, 'error unprotecting snapshot %s@%s' % (self.name, name))
+
+    def is_protected_snap(self, name):
+        """
+        Report whether a snapshot is currently protected from deletion.
+
+        :param name: name of the snapshot to check
+        :type name: str
+        :returns: bool - True if the snapshot is protected
+        :raises: :class:`IOError`, :class:`ImageNotFound`
+        """
+        name = cstr(name, 'name')
+        cdef:
+            char *snap_name = name
+            int protected_flag
+        with nogil:
+            rc = rbd_snap_is_protected(self.image, snap_name, &protected_flag)
+        if rc != 0:
+            raise make_ex(rc, 'error checking if snapshot %s@%s is protected' % (self.name, name))
+        return protected_flag == 1
+
+    def set_snap(self, name):
+        """
+        Select the snapshot that reads are served from. While a snapshot
+        is set, writes raise ReadOnlyImage. Passing None unsets the snapshot,
+        so reads come from the current image and writing is allowed again.
+
+        :param name: snapshot to read from, or None to unset the snapshot
+        :type name: str or None
+        """
+        name = cstr(name, 'name', opt=True)
+        cdef char *snap_name = opt_str(name)
+        with nogil:
+            rc = rbd_snap_set(self.image, snap_name)
+        if rc != 0:
+            raise make_ex(rc, 'error setting image %s to snapshot %s' % (self.name, name))
+
+    def read(self, offset, length, fadvise_flags=0):
+        """
+        Read data from the image. Raises :class:`InvalidArgument` if
+        part of the range specified is outside the image.
+
+        :param offset: the offset to start reading at
+        :type offset: int
+        :param length: how many bytes to read
+        :type length: int
+        :param fadvise_flags: fadvise flags for this read
+        :type fadvise_flags: int
+        :returns: bytes - the data read (truncated if fewer bytes came back)
+        :raises: :class:`InvalidArgument`, :class:`IOError`
+        """
+
+        # This usage of the Python API allows us to construct a bytes object
+        # that librbd directly reads into, avoiding an extra copy. Although
+        # bytes are normally immutable, this usage is explicitly supported
+        # for freshly created bytes objects.
+        cdef:
+            char *ret_buf
+            uint64_t _offset = offset
+            size_t _length = length
+            int _fadvise_flags = fadvise_flags
+            PyObject* ret_s = NULL
+        ret_s = PyBytes_FromStringAndSize(NULL, length)
+        try:
+            ret_buf = PyBytes_AsString(ret_s)
+            with nogil:
+                ret = rbd_read2(self.image, _offset, _length, ret_buf,
+                                _fadvise_flags)
+            if ret < 0:
+                raise make_ex(ret, 'error reading %s %ld~%ld' % (self.name, offset, length))
+            # Short read: shrink the buffer to the byte count rbd_read2 returned.
+            if ret != length:
+                _PyBytes_Resize(&ret_s, ret)
+
+            return <object>ret_s
+        finally:
+            # We DECREF unconditionally: the cast to object above will have
+            # INCREFed if necessary. This also takes care of exceptions,
+            # including if _PyBytes_Resize fails (that will free the bytes
+            # object itself and set ret_s to NULL, hence XDECREF).
+            ref.Py_XDECREF(ret_s)
+
+    def diff_iterate(self, offset, length, from_snapshot, iterate_cb,
+                     include_parent = True, whole_object = False):
+        """
+        Walk the extents of an image that have changed.
+
+        For every changed extent, iterate_cb is invoked with three arguments:
+
+        (offset, length, exists)
+
+        meaning the extent begins at offset bytes, spans length bytes,
+        and either holds data (exists is True) or is all zeroes
+        (exists is False).
+
+        A from_snapshot of None means the beginning of time, in which
+        case every allocated extent is reported.
+
+        The end version is whatever is currently selected (via set_snap)
+        for the image.
+
+        An exception raised by iterate_cb aborts the diff and is
+        propagated to the caller.
+
+        Raises :class:`InvalidArgument` if from_snapshot is after
+        the currently set snapshot.
+
+        Raises :class:`ImageNotFound` if from_snapshot is not the name
+        of a snapshot of the image.
+
+        :param offset: start offset in bytes
+        :type offset: int
+        :param length: size of region to report on, in bytes
+        :type length: int
+        :param from_snapshot: starting snapshot name, or None
+        :type from_snapshot: str or None
+        :param iterate_cb: function to call for each extent
+        :type iterate_cb: function accepting arguments for offset,
+                           length, and exists
+        :param include_parent: True if full history diff should include parent
+        :type include_parent: bool
+        :param whole_object: True if diff extents should cover whole object
+        :type whole_object: bool
+        :raises: :class:`InvalidArgument`, :class:`IOError`,
+                 :class:`ImageNotFound`
+        """
+        from_snapshot = cstr(from_snapshot, 'from_snapshot', opt=True)
+        cdef:
+            char *c_from_snap = opt_str(from_snapshot)
+            uint64_t c_offset = offset, c_length = length
+            uint8_t c_include_parent = include_parent
+            uint8_t c_whole_object = whole_object
+        with nogil:
+            rc = rbd_diff_iterate2(self.image, c_from_snap, c_offset,
+                                   c_length, c_include_parent, c_whole_object,
+                                   &diff_iterate_cb, <void *>iterate_cb)
+        if rc < 0:
+            raise make_ex(rc,
+                          'error generating diff from snapshot %s' % from_snapshot)
+
+    def write(self, data, offset, fadvise_flags=0):
+        """
+        Write data to the image. Raises :class:`InvalidArgument` if
+        part of the write would fall outside the image.
+
+        :param data: the data to be written
+        :type data: bytes
+        :param offset: where to start writing data
+        :type offset: int
+        :param fadvise_flags: fadvise flags for this write
+        :type fadvise_flags: int
+        :returns: int - the number of bytes written (always len(data))
+        :raises: :class:`TypeError`, :class:`IncompleteWriteError`,
+                 :class:`LogicError`, :class:`InvalidArgument`, :class:`IOError`
+        """
+        if not isinstance(data, bytes):
+            raise TypeError('data must be a byte string')
+        cdef:
+            uint64_t _offset = offset, length = len(data)
+            char *_data = data
+            int _fadvise_flags = fadvise_flags
+        with nogil:
+            ret = rbd_write2(self.image, _offset, length, _data, _fadvise_flags)
+        # Only a complete write returns; negative, short and oversized results raise.
+        if ret == length:
+            return ret
+        elif ret < 0:
+            raise make_ex(ret, "error writing to %s" % (self.name,))
+        elif ret < length:
+            raise IncompleteWriteError("Wrote only %ld out of %ld bytes" % (ret, length))
+        else:
+            raise LogicError("logic error: rbd_write(%s) \
+returned %d, but %d was the maximum number of bytes it could have \
+written." % (self.name, ret, length))
+
+    def discard(self, offset, length):
+        """
+        Discard ``length`` bytes of the image starting at ``offset``; the
+        trimmed range is then logically filled with zeroes.
+        """
+        cdef uint64_t c_offset = offset, c_length = length
+        with nogil:
+            rc = rbd_discard(self.image, c_offset, c_length)
+        if rc < 0:
+            raise make_ex(rc,
+                          'error discarding region %d~%d' % (offset, length))
+
+    def flush(self):
+        """
+        Wait for every pending write to be fully flushed (when caching is on).
+        """
+        with nogil:
+            rc = rbd_flush(self.image)
+        if rc < 0:
+            raise make_ex(rc, 'error flushing image')
+
+    def invalidate_cache(self):
+        """
+        Throw away whatever data is currently cached for the image.
+        """
+        with nogil:
+            rc = rbd_invalidate_cache(self.image)
+        if rc < 0:
+            raise make_ex(rc, 'error invalidating cache')
+
+    def stripe_unit(self):
+        """
+        Returns the stripe unit used for the image.
+        """
+        cdef uint64_t stripe_unit
+        with nogil:
+            ret = rbd_get_stripe_unit(self.image, &stripe_unit)
+        if ret != 0:
+            raise make_ex(ret, 'error getting stripe unit for image %s' % (self.name,))
+        return stripe_unit
+
+    def stripe_count(self):
+        """
+        Returns the stripe count used for the image.
+        """
+        cdef uint64_t stripe_count
+        with nogil:
+            ret = rbd_get_stripe_count(self.image, &stripe_count)
+        if ret != 0:
+            raise make_ex(ret, 'error getting stripe count for image %s' % (self.name,))
+        return stripe_count
+
+    def flatten(self):
+        """
+        Flatten a cloned image by copying all blocks from parent to child.
+        """
+        with nogil:
+            rc = rbd_flatten(self.image)
+        if rc < 0:
+            raise make_ex(rc, "error flattening %s" % self.name)
+
+    def list_children(self):
+        """List children of the currently set snapshot (set via set_snap()).
+
+        :returns: list - a list of (pool name, image name) tuples
+        """
+        cdef:
+            size_t pools_size = 512, images_size = 512
+            char *c_pools = NULL
+            char *c_images = NULL
+        try:
+            while True:
+                c_pools = <char *>realloc_chk(c_pools, pools_size)
+                c_images = <char *>realloc_chk(c_images, images_size)
+                with nogil:
+                    ret = rbd_list_children(self.image, c_pools, &pools_size,
+                                            c_images, &images_size)
+                if ret >= 0:
+                    break
+                elif ret != -errno.ERANGE:
+                    raise make_ex(ret, 'error listing images')
+            if ret == 0:
+                return []
+            # char* slices are bytes, so the NUL separator must be bytes as well
+            pools = map(decode_cstr, c_pools[:pools_size - 1].split(b'\0'))
+            images = map(decode_cstr, c_images[:images_size - 1].split(b'\0'))
+            return list(zip(pools, images))
+        finally:
+            free(c_pools)
+            free(c_images)
+
+    def list_lockers(self):
+        """
+        List clients that have locked the image and information about the lock.
+
+        :returns: dict (or an empty list when rbd_list_lockers() returns 0)
+                  with the following keys:
+
+                  * ``tag`` - the tag associated with the lock (every
+                    additional locker must use the same tag)
+                  * ``exclusive`` - boolean indicating whether the
+                     lock is exclusive or shared
+                  * ``lockers`` - a list of (client, cookie, address)
+                    tuples
+        """
+        cdef:
+            size_t clients_size = 512, cookies_size = 512
+            size_t addrs_size = 512, tag_size = 512
+            int exclusive = 0
+            char *c_clients = NULL
+            char *c_cookies = NULL
+            char *c_addrs = NULL
+            char *c_tag = NULL
+
+        try:
+            while True:
+                c_clients = <char *>realloc_chk(c_clients, clients_size)
+                c_cookies = <char *>realloc_chk(c_cookies, cookies_size)
+                c_addrs = <char *>realloc_chk(c_addrs, addrs_size)
+                c_tag = <char *>realloc_chk(c_tag, tag_size)
+                with nogil:
+                    ret = rbd_list_lockers(self.image, &exclusive,
+                                           c_tag, &tag_size,
+                                           c_clients, &clients_size,
+                                           c_cookies, &cookies_size,
+                                           c_addrs, &addrs_size)
+                if ret >= 0:
+                    break
+                elif ret != -errno.ERANGE:
+                    raise make_ex(ret, 'error listing lockers')
+            if ret == 0:
+                return []
+            clients = map(decode_cstr, c_clients[:clients_size - 1].split(b'\0'))
+            cookies = map(decode_cstr, c_cookies[:cookies_size - 1].split(b'\0'))
+            addrs = map(decode_cstr, c_addrs[:addrs_size - 1].split(b'\0'))
+            return {
+                'tag'       : decode_cstr(c_tag),
+                'exclusive' : exclusive == 1,
+                'lockers'   : list(zip(clients, cookies, addrs)),
+                }
+        finally:
+            free(c_clients)
+            free(c_cookies)
+            free(c_addrs)
+            free(c_tag)
+
+    def lock_exclusive(self, cookie):
+        """
+        Acquire an exclusive lock on the image.
+
+        :raises: :class:`ImageBusy` if a different client or cookie locked it
+                 :class:`ImageExists` if the same client and cookie locked it
+        """
+        cookie = cstr(cookie, 'cookie')
+        cdef char *c_cookie = cookie
+        with nogil:
+            rc = rbd_lock_exclusive(self.image, c_cookie)
+        if rc < 0:
+            raise make_ex(rc, 'error acquiring exclusive lock on image')
+
+    def lock_shared(self, cookie, tag):
+        """
+        Acquire a shared lock on the image. If there are existing
+        lockers, the tag has to match theirs.
+
+        :raises: :class:`ImageBusy` if a different client or cookie locked it
+                 :class:`ImageExists` if the same client and cookie locked it
+        """
+        cookie = cstr(cookie, 'cookie')
+        tag = cstr(tag, 'tag')
+        cdef:
+            char *c_cookie = cookie
+            char *c_tag = tag
+        with nogil:
+            rc = rbd_lock_shared(self.image, c_cookie, c_tag)
+        if rc < 0:
+            raise make_ex(rc, 'error acquiring shared lock on image')
+
+    def unlock(self, cookie):
+        """
+        Drop a lock on the image that this rados client took earlier.
+        """
+        cookie = cstr(cookie, 'cookie')
+        cdef char *c_cookie = cookie
+        with nogil:
+            rc = rbd_unlock(self.image, c_cookie)
+        if rc < 0:
+            raise make_ex(rc, 'error unlocking image')
+
+    def break_lock(self, client, cookie):
+        """
+        Forcibly release a lock that another rados client holds.
+        """
+        client = cstr(client, 'client')
+        cookie = cstr(cookie, 'cookie')
+        cdef:
+            char *c_client = client
+            char *c_cookie = cookie
+        with nogil:
+            rc = rbd_break_lock(self.image, c_client, c_cookie)
+        if rc < 0:
+            raise make_ex(rc, 'error unlocking image')
+
+
+cdef class SnapIterator(object):
+    """
+    Iterator yielding one dictionary of snapshot info per snapshot of an image:
+
+    * ``id`` (int) - numeric identifier of the snapshot
+    * ``size`` (int) - size of the image at the time of snapshot (in bytes)
+    * ``name`` (str) - name of the snapshot
+    """
+
+    cdef rbd_snap_info_t *snaps
+    cdef int num_snaps
+
+    def __init__(self, Image image):
+        self.snaps = NULL
+        self.num_snaps = 10
+        try:
+            while True:
+                self.snaps = <rbd_snap_info_t*>realloc_chk(
+                    self.snaps, self.num_snaps * sizeof(rbd_snap_info_t))
+                with nogil:
+                    ret = rbd_snap_list(image.image, self.snaps, &self.num_snaps)
+                if ret >= 0:
+                    self.num_snaps = ret
+                    break
+                elif ret != -errno.ERANGE:
+                    raise make_ex(ret, 'error listing snapshots for image %s' % (image.name,))
+        except:
+            # Nothing valid was listed: free here so __dealloc__ skips this buffer.
+            free(self.snaps)
+            self.snaps = NULL
+            raise
+
+    def __iter__(self):
+        for i in range(self.num_snaps):
+            yield {
+                'id'   : self.snaps[i].id,
+                'size' : self.snaps[i].size,
+                'name' : decode_cstr(self.snaps[i].name),
+                }
+
+    def __dealloc__(self):
+        # Cython runs __dealloc__, not __del__, when a cdef class is destroyed.
+        if self.snaps:
+            rbd_snap_list_end(self.snaps)
+            free(self.snaps)
diff --git a/src/pybind/setup.py b/src/pybind/setup.py
new file mode 100755
index 0000000..1eda454
--- /dev/null
+++ b/src/pybind/setup.py
@@ -0,0 +1,51 @@
+# Largely taken from
+# https://blog.kevin-brown.com/programming/2014/09/24/combining-autotools-and-setuptools.html
+import os, sys, os.path
+
+from setuptools.command.egg_info import egg_info
+from distutils.core import setup
+from distutils.extension import Extension
+from Cython.Build import cythonize
+
+def get_version():
+    """Return CEPH_GIT_NICE_VER from ../ceph_ver.h, or "0" if it is not available."""
+    try:
+        # 'with' closes the header as soon as the version has been found
+        with open(os.path.join(os.path.dirname(__file__), "..", "ceph_ver.h")) as ver_h:
+            for line in ver_h:
+                if "CEPH_GIT_NICE_VER" in line:
+                    return line.split()[2][1:-1]
+        return "0"
+    except IOError:
+        return "0"
+
+class EggInfoCommand(egg_info):
+    """egg_info command that places its output under the build base directory."""
+    def finalize_options(self):
+        egg_info.finalize_options(self)
+        if "build" in self.distribution.command_obj:
+            build_command = self.distribution.command_obj["build"]
+            self.egg_base = build_command.build_base
+            self.egg_info = os.path.join(self.egg_base, os.path.basename(self.egg_info))
+
+# Disable cythonification if we're not really building anything
+if (len(sys.argv) >= 2 and
+    any(i in sys.argv[1:] for i in ('--help', 'clean', 'egg_info', '--version')
+    )):
+    def cythonize(x, **kwargs):
+        return x
+
+setup(
+    name = 'rbd',
+    version = get_version(),
+    description = "Python libraries for the Ceph librbd library",
+    long_description = (
+        "This package contains Python libraries for interacting with Ceph's "
+        "RBD block device library."),
+    ext_modules = cythonize([
+        Extension("rbd", ["rbd.pyx"], libraries=["rbd"])
+    ], build_dir=os.environ.get("CYTHON_BUILD_DIR", None)),
+    cmdclass={
+        "egg_info": EggInfoCommand,
+    },
+)
diff --git a/src/rbd_replay/ActionTypes.h b/src/rbd_replay/ActionTypes.h
index 63ef34e..fcceca8 100644
--- a/src/rbd_replay/ActionTypes.h
+++ b/src/rbd_replay/ActionTypes.h
@@ -5,7 +5,7 @@
 #define CEPH_RBD_REPLAY_ACTION_TYPES_H
 
 #include "include/int_types.h"
-#include "include/buffer.h"
+#include "include/buffer_fwd.h"
 #include "include/encoding.h"
 #include <iosfwd>
 #include <list>
diff --git a/src/rbd_replay/ios.hpp b/src/rbd_replay/ios.hpp
index 1755933..218717b 100644
--- a/src/rbd_replay/ios.hpp
+++ b/src/rbd_replay/ios.hpp
@@ -18,7 +18,7 @@
 // This code assumes that IO IDs and timestamps are related monotonically.
 // In other words, (a.id < b.id) == (a.timestamp < b.timestamp) for all IOs a and b.
 
-#include "include/buffer.h"
+#include "include/buffer_fwd.h"
 #include <boost/enable_shared_from_this.hpp>
 #include <boost/shared_ptr.hpp>
 #include <iostream>
diff --git a/src/rgw/Makefile.am b/src/rgw/Makefile.am
index 3a30156..98cd4b0 100644
--- a/src/rgw/Makefile.am
+++ b/src/rgw/Makefile.am
@@ -4,6 +4,7 @@ if ENABLE_CLIENT
 DENCODER_SOURCES += \
 	rgw/rgw_dencoder.cc \
 	rgw/rgw_acl.cc \
+	rgw/rgw_basic_types.cc \
 	rgw/rgw_common.cc \
 	rgw/rgw_env.cc \
 	rgw/rgw_json_enc.cc
@@ -29,6 +30,7 @@ librgw_la_SOURCES =  \
 	rgw/rgw_rest_client.cc \
 	rgw/rgw_rest_conn.cc \
 	rgw/rgw_op.cc \
+	rgw/rgw_basic_types.cc \
 	rgw/rgw_common.cc \
 	rgw/rgw_cache.cc \
 	rgw/rgw_formats.cc \
@@ -122,7 +124,6 @@ ceph_rgw_jsonparser_SOURCES = \
 ceph_rgw_jsonparser_LDADD = $(LIBRGW) $(LIBRGW_DEPS) $(CEPH_GLOBAL)
 bin_DEBUGPROGRAMS += ceph_rgw_jsonparser
 
-
 noinst_HEADERS += \
 	rgw/rgw_acl.h \
 	rgw/rgw_acl_s3.h \
@@ -130,6 +131,7 @@ noinst_HEADERS += \
 	rgw/rgw_client_io.h \
 	rgw/rgw_fcgi.h \
 	rgw/rgw_xml.h \
+	rgw/rgw_basic_types.h \
 	rgw/rgw_cache.h \
 	rgw/rgw_common.h \
 	rgw/rgw_cors.h \
diff --git a/src/rgw/rgw_acl.cc b/src/rgw/rgw_acl.cc
index 669a83d..d117caa 100644
--- a/src/rgw/rgw_acl.cc
+++ b/src/rgw/rgw_acl.cc
@@ -29,26 +29,26 @@ void RGWAccessControlList::_add_grant(ACLGrant *grant)
     break;
   default:
     {
-      string id;
+      rgw_user id;
       if (!grant->get_id(id)) {
         ldout(cct, 0) << "ERROR: grant->get_id() failed" << dendl;
       }
-      acl_user_map[id] |= perm.get_permissions();
+      acl_user_map[id.to_str()] |= perm.get_permissions();
     }
   }
 }
 
 void RGWAccessControlList::add_grant(ACLGrant *grant)
 {
-  string id;
+  rgw_user id;
   grant->get_id(id); // not that this will return false for groups, but that's ok, we won't search groups
-  grant_map.insert(pair<string, ACLGrant>(id, *grant));
+  grant_map.insert(pair<string, ACLGrant>(id.to_str(), *grant));
   _add_grant(grant);
 }
 
-int RGWAccessControlList::get_perm(string& id, int perm_mask) {
+int RGWAccessControlList::get_perm(rgw_user& id, int perm_mask) {
   ldout(cct, 5) << "Searching permissions for uid=" << id << " mask=" << perm_mask << dendl;
-  map<string, int>::iterator iter = acl_user_map.find(id);
+  map<string, int>::iterator iter = acl_user_map.find(id.to_str());
   if (iter != acl_user_map.end()) {
     ldout(cct, 5) << "Found permission: " << iter->second << dendl;
     return iter->second & perm_mask;
@@ -68,7 +68,7 @@ int RGWAccessControlList::get_group_perm(ACLGroupTypeEnum group, int perm_mask)
   return 0;
 }
 
-int RGWAccessControlPolicy::get_perm(string& id, int perm_mask) {
+int RGWAccessControlPolicy::get_perm(rgw_user& id, int perm_mask) {
   int perm = acl.get_perm(id, perm_mask);
 
   if (id.compare(owner.get_id()) == 0) {
@@ -82,7 +82,7 @@ int RGWAccessControlPolicy::get_perm(string& id, int perm_mask) {
   if ((perm & perm_mask) != perm_mask) {
     perm |= acl.get_group_perm(ACL_GROUP_ALL_USERS, perm_mask);
 
-    if (!compare_group_name(id, ACL_GROUP_ALL_USERS)) {
+    if (!compare_group_name(id.id, ACL_GROUP_ALL_USERS)) {
       /* this is not the anonymous user */
       perm |= acl.get_group_perm(ACL_GROUP_AUTHENTICATED_USERS, perm_mask);
     }
@@ -93,7 +93,7 @@ int RGWAccessControlPolicy::get_perm(string& id, int perm_mask) {
   return perm;
 }
 
-bool RGWAccessControlPolicy::verify_permission(string& uid, int user_perm_mask, int perm)
+bool RGWAccessControlPolicy::verify_permission(rgw_user& uid, int user_perm_mask, int perm)
 {
   int test_perm = perm | RGW_PERM_READ_OBJS | RGW_PERM_WRITE_OBJS;
 
diff --git a/src/rgw/rgw_acl.h b/src/rgw/rgw_acl.h
index d4a4643..fc2a7ef 100644
--- a/src/rgw/rgw_acl.h
+++ b/src/rgw/rgw_acl.h
@@ -10,6 +10,8 @@
 
 #include "common/debug.h"
 
+#include "rgw_basic_types.h"
+
 using namespace std;
 
 #define RGW_PERM_NONE            0x00
@@ -102,7 +104,7 @@ class ACLGrant
 {
 protected:
   ACLGranteeType type;
-  string id;
+  rgw_user id;
   string email;
   ACLPermission permission;
   string name;
@@ -114,10 +116,10 @@ public:
 
   /* there's an assumption here that email/uri/id encodings are
      different and there can't be any overlap */
-  bool get_id(string& _id) {
+  bool get_id(rgw_user& _id) {
     switch(type.get_type()) {
     case ACL_TYPE_EMAIL_USER:
-      _id = email;
+      _id = email; // implies from_str() that parses the 't:u' syntax
       return true;
     case ACL_TYPE_GROUP:
       return false;
@@ -131,9 +133,11 @@ public:
   ACLGroupTypeEnum get_group() { return group; }
 
   void encode(bufferlist& bl) const {
-    ENCODE_START(3, 3, bl);
+    ENCODE_START(4, 3, bl);
     ::encode(type, bl);
-    ::encode(id, bl);
+    string s;
+    id.to_str(s);
+    ::encode(s, bl);
     string uri;
     ::encode(uri, bl);
     ::encode(email, bl);
@@ -144,9 +148,11 @@ public:
     ENCODE_FINISH(bl);
   }
   void decode(bufferlist::iterator& bl) {
-    DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl);
+    DECODE_START_LEGACY_COMPAT_LEN(4, 3, 3, bl);
     ::decode(type, bl);
-    ::decode(id, bl);
+    string s;
+    ::decode(s, bl);
+    id.from_str(s);
     string uri;
     ::decode(uri, bl);
     ::decode(email, bl);
@@ -166,7 +172,7 @@ public:
 
   ACLGroupTypeEnum uri_to_group(string& uri);
   
-  void set_canon(string& _id, string& _name, int perm) {
+  void set_canon(const rgw_user& _id, string& _name, int perm) {
     type.set(ACL_TYPE_CANON_USER);
     id = _id;
     name = _name;
@@ -198,7 +204,7 @@ public:
 
   virtual ~RGWAccessControlList() {}
 
-  int get_perm(string& id, int perm_mask);
+  int get_perm(rgw_user& id, int perm_mask);
   int get_group_perm(ACLGroupTypeEnum group, int perm_mask);
   void encode(bufferlist& bl) const {
     ENCODE_START(3, 3, bl);
@@ -233,7 +239,7 @@ public:
 
   multimap<string, ACLGrant>& get_grant_map() { return grant_map; }
 
-  void create_default(string id, string name) {
+  void create_default(const rgw_user& id, string name) {
     acl_user_map.clear();
     acl_group_map.clear();
 
@@ -247,30 +253,34 @@ WRITE_CLASS_ENCODER(RGWAccessControlList)
 class ACLOwner
 {
 protected:
-  string id;
+  rgw_user id;
   string display_name;
 public:
   ACLOwner() {}
   ~ACLOwner() {}
 
   void encode(bufferlist& bl) const {
-    ENCODE_START(2, 2, bl);
-    ::encode(id, bl);
+    ENCODE_START(3, 2, bl);
+    string s;
+    id.to_str(s);
+    ::encode(s, bl);
     ::encode(display_name, bl);
     ENCODE_FINISH(bl);
   }
   void decode(bufferlist::iterator& bl) {
-    DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
-    ::decode(id, bl);
+    DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl);
+    string s;
+    ::decode(s, bl);
+    id.from_str(s);
     ::decode(display_name, bl);
     DECODE_FINISH(bl);
   }
   void dump(Formatter *f) const;
   static void generate_test_instances(list<ACLOwner*>& o);
-  void set_id(const string& _id) { id = _id; }
+  void set_id(const rgw_user& _id) { id = _id; }
   void set_name(string& name) { display_name = name; }
 
-  string& get_id() { return id; }
+  rgw_user& get_id() { return id; }
   string& get_display_name() { return display_name; }
 };
 WRITE_CLASS_ENCODER(ACLOwner)
@@ -292,9 +302,9 @@ public:
     acl.set_ctx(ctx);
   }
 
-  int get_perm(string& id, int perm_mask);
+  int get_perm(rgw_user& id, int perm_mask);
   int get_group_perm(ACLGroupTypeEnum group, int perm_mask);
-  bool verify_permission(string& uid, int user_perm_mask, int perm);
+  bool verify_permission(rgw_user& uid, int user_perm_mask, int perm);
 
   void encode(bufferlist& bl) const {
     ENCODE_START(2, 2, bl);
@@ -321,7 +331,7 @@ public:
     return owner;
   }
 
-  void create_default(string& id, string& name) {
+  void create_default(const rgw_user& id, string& name) {
     acl.create_default(id, name);
     owner.set_id(id);
     owner.set_name(name);
diff --git a/src/rgw/rgw_acl_s3.cc b/src/rgw/rgw_acl_s3.cc
index 490bf98..aecb7a3 100644
--- a/src/rgw/rgw_acl_s3.cc
+++ b/src/rgw/rgw_acl_s3.cc
@@ -141,9 +141,11 @@ bool ACLOwner_S3::xml_end(const char *el) {
 }
 
 void  ACLOwner_S3::to_xml(ostream& out) {
-  if (id.empty())
+  string s;
+  id.to_str(s);
+  if (s.empty())
     return;
-  out << "<Owner>" << "<ID>" << id << "</ID>";
+  out << "<Owner>" << "<ID>" << s << "</ID>";
   if (!display_name.empty())
     out << "<DisplayName>" << display_name << "</DisplayName>";
   out << "</Owner>";
@@ -311,7 +313,8 @@ static int parse_grantee_str(RGWRados *store, string& grantee_str,
 
     grant.set_canon(info.user_id, info.display_name, rgw_perm);
   } else if (strcasecmp(id_type.c_str(), "id") == 0) {
-    ret = rgw_get_user_info_by_uid(store, id_val, info);
+    rgw_user user(id_val);
+    ret = rgw_get_user_info_by_uid(store, user, info);
     if (ret < 0)
       return ret;
 
@@ -361,7 +364,7 @@ int RGWAccessControlList_S3::create_canned(ACLOwner& owner, ACLOwner& bucket_own
 
   ACLGrant owner_grant;
 
-  string bid = bucket_owner.get_id();
+  rgw_user bid = bucket_owner.get_id();
   string bname = bucket_owner.get_display_name();
 
   /* owner gets full control */
@@ -476,7 +479,7 @@ int RGWAccessControlPolicy_S3::rebuild(RGWRados *store, ACLOwner *owner, RGWAcce
 
   ACLOwner *requested_owner = static_cast<ACLOwner_S3 *>(find_first("Owner"));
   if (requested_owner) {
-    const string& requested_id = requested_owner->get_id();
+    rgw_user& requested_id = requested_owner->get_id();
     if (!requested_id.empty() && requested_id.compare(owner->get_id()) != 0)
       return -EPERM;
   }
@@ -502,16 +505,18 @@ int RGWAccessControlPolicy_S3::rebuild(RGWRados *store, ACLOwner *owner, RGWAcce
     ACLGranteeType& type = src_grant.get_type();
     ACLGrant new_grant;
     bool grant_ok = false;
-    string uid;
+    rgw_user uid;
     RGWUserInfo grant_user;
     switch (type.get_type()) {
     case ACL_TYPE_EMAIL_USER:
       {
         string email;
-        if (!src_grant.get_id(email)) {
+        rgw_user u;
+        if (!src_grant.get_id(u)) {
           ldout(cct, 0) << "ERROR: src_grant.get_id() failed" << dendl;
           return -EINVAL;
         }
+        email = u.id;
         ldout(cct, 10) << "grant user email=" << email << dendl;
         if (rgw_get_user_info_by_email(store, email, grant_user) < 0) {
           ldout(cct, 10) << "grant user email not found or other error" << dendl;
@@ -535,7 +540,7 @@ int RGWAccessControlPolicy_S3::rebuild(RGWRados *store, ACLOwner *owner, RGWAcce
           ACLPermission& perm = src_grant.get_permission();
           new_grant.set_canon(uid, grant_user.display_name, perm.get_permissions());
           grant_ok = true;
-          string new_id;
+          rgw_user new_id;
           new_grant.get_id(new_id);
           ldout(cct, 10) << "new grant: " << new_id << ":" << grant_user.display_name << dendl;
         }
diff --git a/src/rgw/rgw_acl_swift.cc b/src/rgw/rgw_acl_swift.cc
index bdfa8f7..1f8f1ab 100644
--- a/src/rgw/rgw_acl_swift.cc
+++ b/src/rgw/rgw_acl_swift.cc
@@ -3,7 +3,7 @@
 
 #include <string.h>
 
-#include <vector>
+#include <list>
 
 #include "rgw_common.h"
 #include "rgw_user.h"
@@ -18,7 +18,7 @@ using namespace std;
 
 #define SWIFT_GROUP_ALL_USERS ".r:*"
 
-static int parse_list(string& uid_list, vector<string>& uids)
+static int parse_list(string& uid_list, list<string>& uids)
 {
   char *s = strdup(uid_list.c_str());
   if (!s)
@@ -56,9 +56,9 @@ static bool uid_is_public(string& uid)
          sub.compare(".referrer") == 0;
 }
 
-void RGWAccessControlPolicy_SWIFT::add_grants(RGWRados *store, vector<string>& uids, int perm)
+void RGWAccessControlPolicy_SWIFT::add_grants(RGWRados *store, list<string>& uids, int perm)
 {
-  vector<string>::iterator iter;
+  list<string>::iterator iter;
   for (iter = uids.begin(); iter != uids.end(); ++iter ) {
     ACLGrant grant;
     RGWUserInfo grant_user;
@@ -66,24 +66,27 @@ void RGWAccessControlPolicy_SWIFT::add_grants(RGWRados *store, vector<string>& u
     if (uid_is_public(uid)) {
       grant.set_group(ACL_GROUP_ALL_USERS, perm);
       acl.add_grant(&grant);
-    } else if (rgw_get_user_info_by_uid(store, uid, grant_user) < 0) {
-      ldout(cct, 10) << "grant user does not exist:" << uid << dendl;
-      /* skipping silently */
-    } else {
-      grant.set_canon(uid, grant_user.display_name, perm);
-      acl.add_grant(&grant);
+    } else  {
+      rgw_user user(uid);
+      if (rgw_get_user_info_by_uid(store, user, grant_user) < 0) {
+        ldout(cct, 10) << "grant user does not exist:" << uid << dendl;
+        /* skipping silently */
+      } else {
+        grant.set_canon(user, grant_user.display_name, perm);
+        acl.add_grant(&grant);
+      }
     }
   }
 }
 
-bool RGWAccessControlPolicy_SWIFT::create(RGWRados *store, string& id, string& name, string& read_list, string& write_list)
+bool RGWAccessControlPolicy_SWIFT::create(RGWRados *store, rgw_user& id, string& name, string& read_list, string& write_list)
 {
   acl.create_default(id, name);
   owner.set_id(id);
   owner.set_name(name);
 
   if (read_list.size()) {
-    vector<string> uids;
+    list<string> uids;
     int r = parse_list(read_list, uids);
     if (r < 0) {
       ldout(cct, 0) << "ERROR: parse_list returned r=" << r << dendl;
@@ -93,7 +96,7 @@ bool RGWAccessControlPolicy_SWIFT::create(RGWRados *store, string& id, string& n
     add_grants(store, uids, SWIFT_PERM_READ);
   }
   if (write_list.size()) {
-    vector<string> uids;
+    list<string> uids;
     int r = parse_list(write_list, uids);
     if (r < 0) {
       ldout(cct, 0) << "ERROR: parse_list returned r=" << r << dendl;
@@ -113,7 +116,7 @@ void RGWAccessControlPolicy_SWIFT::to_str(string& read, string& write)
   for (iter = m.begin(); iter != m.end(); ++iter) {
     ACLGrant& grant = iter->second;
     int perm = grant.get_permission().get_permissions();
-    string id;
+    rgw_user id;
     if (!grant.get_id(id)) {
       if (grant.get_group() != ACL_GROUP_ALL_USERS)
         continue;
@@ -122,11 +125,11 @@ void RGWAccessControlPolicy_SWIFT::to_str(string& read, string& write)
     if (perm & SWIFT_PERM_READ) {
       if (!read.empty())
         read.append(", ");
-      read.append(id);
+      read.append(id.to_str());
     } else if (perm & SWIFT_PERM_WRITE) {
       if (!write.empty())
         write.append(", ");
-      write.append(id);
+      write.append(id.to_str());
     }
   }
 }
diff --git a/src/rgw/rgw_acl_swift.h b/src/rgw/rgw_acl_swift.h
index 9a5fbf7..cbadfaa 100644
--- a/src/rgw/rgw_acl_swift.h
+++ b/src/rgw/rgw_acl_swift.h
@@ -4,9 +4,9 @@
 #ifndef CEPH_RGW_ACL_SWIFT_H
 #define CEPH_RGW_ACL_SWIFT_H
 
+#include <list>
 #include <map>
 #include <string>
-#include <vector>
 #include <include/types.h>
 
 #include "rgw_acl.h"
@@ -19,8 +19,8 @@ public:
   RGWAccessControlPolicy_SWIFT(CephContext *_cct) : RGWAccessControlPolicy(_cct) {}
   ~RGWAccessControlPolicy_SWIFT() {}
 
-  void add_grants(RGWRados *store, vector<string>& uids, int perm);
-  bool create(RGWRados *store, string& id, string& name, string& read_list, string& write_list);
+  void add_grants(RGWRados *store, list<string>& uids, int perm);
+  bool create(RGWRados *store, rgw_user& id, string& name, string& read_list, string& write_list);
   void to_str(string& read, string& write);
 };
 
diff --git a/src/rgw/rgw_admin.cc b/src/rgw/rgw_admin.cc
index ca439d0..5f34bc7 100644
--- a/src/rgw/rgw_admin.cc
+++ b/src/rgw/rgw_admin.cc
@@ -590,7 +590,7 @@ int bucket_stats(rgw_bucket& bucket, Formatter *formatter)
   RGWBucketInfo bucket_info;
   time_t mtime;
   RGWObjectCtx obj_ctx(store);
-  int r = store->get_bucket_info(obj_ctx, bucket.name, bucket_info, &mtime);
+  int r = store->get_bucket_info(obj_ctx, bucket.tenant, bucket.name, bucket_info, &mtime);
   if (r < 0)
     return r;
 
@@ -609,7 +609,7 @@ int bucket_stats(rgw_bucket& bucket, Formatter *formatter)
   
   formatter->dump_string("id", bucket.bucket_id);
   formatter->dump_string("marker", bucket.marker);
-  formatter->dump_string("owner", bucket_info.owner);
+  ::encode_json("owner", bucket_info.owner, formatter);
   formatter->dump_int("mtime", mtime);
   formatter->dump_string("ver", bucket_ver);
   formatter->dump_string("master_ver", master_ver);
@@ -629,14 +629,14 @@ public:
   }
 };
 
-static int init_bucket(const string& bucket_name, const string& bucket_id,
+static int init_bucket(const string& tenant_name, const string& bucket_name, const string& bucket_id,
                        RGWBucketInfo& bucket_info, rgw_bucket& bucket)
 {
   if (!bucket_name.empty()) {
     RGWObjectCtx obj_ctx(store);
     int r;
     if (bucket_id.empty()) {
-      r = store->get_bucket_info(obj_ctx, bucket_name, bucket_info, NULL);
+      r = store->get_bucket_info(obj_ctx, tenant_name, bucket_name, bucket_info, NULL);
     } else {
       string bucket_instance_id = bucket_name + ":" + bucket_id;
       r = store->get_bucket_instance_info(obj_ctx, bucket_instance_id, bucket_info, NULL, NULL);
@@ -810,13 +810,15 @@ void set_quota_info(RGWQuotaInfo& quota, int opt_cmd, int64_t max_size, int64_t
   }
 }
 
-int set_bucket_quota(RGWRados *store, int opt_cmd, string& bucket_name, int64_t max_size, int64_t max_objects,
+int set_bucket_quota(RGWRados *store, int opt_cmd,
+                     const string& tenant_name, const string& bucket_name,
+                     int64_t max_size, int64_t max_objects,
                      bool have_max_size, bool have_max_objects)
 {
   RGWBucketInfo bucket_info;
   map<string, bufferlist> attrs;
   RGWObjectCtx obj_ctx(store);
-  int r = store->get_bucket_info(obj_ctx, bucket_name, bucket_info, NULL, &attrs);
+  int r = store->get_bucket_info(obj_ctx, tenant_name, bucket_name, bucket_info, NULL, &attrs);
   if (r < 0) {
     cerr << "could not get bucket info for bucket=" << bucket_name << ": " << cpp_strerror(-r) << std::endl;
     return -r;
@@ -1004,7 +1006,8 @@ int check_obj_tail_locator_underscore(RGWBucketInfo& bucket_info, rgw_obj& obj,
   return 0;
 }
 
-int do_check_object_locator(const string& bucket_name, bool fix, bool remove_bad, Formatter *f)
+int do_check_object_locator(const string& tenant_name, const string& bucket_name,
+                            bool fix, bool remove_bad, Formatter *f)
 {
   if (remove_bad && !fix) {
     cerr << "ERROR: can't have remove_bad specified without fix" << std::endl;
@@ -1017,7 +1020,7 @@ int do_check_object_locator(const string& bucket_name, bool fix, bool remove_bad
 
   f->open_object_section("bucket");
   f->dump_string("bucket", bucket_name);
-  int ret = init_bucket(bucket_name, bucket_id, bucket_info, bucket);
+  int ret = init_bucket(tenant_name, bucket_name, bucket_id, bucket_info, bucket);
   if (ret < 0) {
     cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret) << std::endl;
     return ret;
@@ -1078,7 +1081,6 @@ int do_check_object_locator(const string& bucket_name, bool fix, bool remove_bad
   return 0;
 }
 
-
 int main(int argc, char **argv) 
 {
   vector<const char*> args;
@@ -1088,7 +1090,9 @@ int main(int argc, char **argv)
   global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, CODE_ENVIRONMENT_UTILITY, 0);
   common_init_finish(g_ceph_context);
 
-  std::string user_id, access_key, secret_key, user_email, display_name;
+  rgw_user user_id;
+  string tenant;
+  std::string access_key, secret_key, user_email, display_name;
   std::string bucket_name, pool_name, object;
   std::string date, subuser, access, format;
   std::string start_date, end_date;
@@ -1178,7 +1182,9 @@ int main(int argc, char **argv)
       usage();
       return 0;
     } else if (ceph_argparse_witharg(args, i, &val, "-i", "--uid", (char*)NULL)) {
-      user_id = val;
+      user_id.from_str(val);
+    } else if (ceph_argparse_witharg(args, i, &val, "--tenant", (char*)NULL)) {
+      tenant = val;
     } else if (ceph_argparse_witharg(args, i, &val, "--access-key", (char*)NULL)) {
       access_key = val;
     } else if (ceph_argparse_witharg(args, i, &val, "--subuser", (char*)NULL)) {
@@ -1387,6 +1393,15 @@ int main(int argc, char **argv)
       ++i;
     }
   }
+  if (tenant.empty()) {
+    tenant = user_id.tenant;  // fall back to the tenant parsed out of --uid
+  } else {
+    if (user_id.empty()) {
+      cerr << "ERROR: --tenant is set, but there's no user ID" << std::endl;
+      return EINVAL;
+    }
+    user_id.tenant = tenant;  // an explicit --tenant overrides the one in --uid
+  }
 
   if (args.empty()) {
     return usage();
@@ -1892,7 +1907,7 @@ int main(int argc, char **argv)
       RGWBucketAdminOp::info(store, bucket_op, f);
     } else {
       RGWBucketInfo bucket_info;
-      int ret = init_bucket(bucket_name, bucket_id, bucket_info, bucket);
+      int ret = init_bucket(tenant, bucket_name, bucket_id, bucket_info, bucket);
       if (ret < 0) {
         cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret) << std::endl;
         return -ret;
@@ -2037,7 +2052,7 @@ int main(int argc, char **argv)
 	return -r;
       }
       formatter->dump_string("bucket_id", entry.bucket_id);
-      formatter->dump_string("bucket_owner", entry.bucket_owner);
+      formatter->dump_string("bucket_owner", entry.bucket_owner.to_str());
       formatter->dump_string("bucket", entry.bucket);
 
       uint64_t agg_time = 0;
@@ -2214,7 +2229,7 @@ next:
       return EINVAL;
     }
     RGWBucketInfo bucket_info;
-    int ret = init_bucket(bucket_name, bucket_id, bucket_info, bucket);
+    int ret = init_bucket(tenant, bucket_name, bucket_id, bucket_info, bucket);
     if (ret < 0) {
       cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret) << std::endl;
       return -ret;
@@ -2261,7 +2276,7 @@ next:
 
   if (opt_cmd == OPT_BI_GET) {
     RGWBucketInfo bucket_info;
-    int ret = init_bucket(bucket_name, bucket_id, bucket_info, bucket);
+    int ret = init_bucket(tenant, bucket_name, bucket_id, bucket_info, bucket);
     if (ret < 0) {
       cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret) << std::endl;
       return -ret;
@@ -2285,7 +2300,7 @@ next:
 
   if (opt_cmd == OPT_BI_PUT) {
     RGWBucketInfo bucket_info;
-    int ret = init_bucket(bucket_name, bucket_id, bucket_info, bucket);
+    int ret = init_bucket(tenant, bucket_name, bucket_id, bucket_info, bucket);
     if (ret < 0) {
       cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret) << std::endl;
       return -ret;
@@ -2310,7 +2325,7 @@ next:
 
   if (opt_cmd == OPT_BI_LIST) {
     RGWBucketInfo bucket_info;
-    int ret = init_bucket(bucket_name, bucket_id, bucket_info, bucket);
+    int ret = init_bucket(tenant, bucket_name, bucket_id, bucket_info, bucket);
     if (ret < 0) {
       cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret) << std::endl;
       return -ret;
@@ -2347,7 +2362,7 @@ next:
 
   if (opt_cmd == OPT_OBJECT_RM) {
     RGWBucketInfo bucket_info;
-    int ret = init_bucket(bucket_name, bucket_id, bucket_info, bucket);
+    int ret = init_bucket(tenant, bucket_name, bucket_id, bucket_info, bucket);
     if (ret < 0) {
       cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret) << std::endl;
       return -ret;
@@ -2372,7 +2387,7 @@ next:
     }
 
     RGWBucketInfo bucket_info;
-    int ret = init_bucket(bucket_name, bucket_id, bucket_info, bucket);
+    int ret = init_bucket(tenant, bucket_name, bucket_id, bucket_info, bucket);
     if (ret < 0) {
       cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret) << std::endl;
       return -ret;
@@ -2413,7 +2428,7 @@ next:
     }
 
     RGWBucketInfo bucket_info;
-    int ret = init_bucket(bucket_name, bucket_id, bucket_info, bucket);
+    int ret = init_bucket(tenant, bucket_name, bucket_id, bucket_info, bucket);
     if (ret < 0) {
       cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret) << std::endl;
       return -ret;
@@ -2510,7 +2525,7 @@ next:
 
   if (opt_cmd == OPT_OBJECT_UNLINK) {
     RGWBucketInfo bucket_info;
-    int ret = init_bucket(bucket_name, bucket_id, bucket_info, bucket);
+    int ret = init_bucket(tenant, bucket_name, bucket_id, bucket_info, bucket);
     if (ret < 0) {
       cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret) << std::endl;
       return -ret;
@@ -2527,7 +2542,7 @@ next:
 
   if (opt_cmd == OPT_OBJECT_STAT) {
     RGWBucketInfo bucket_info;
-    int ret = init_bucket(bucket_name, bucket_id, bucket_info, bucket);
+    int ret = init_bucket(tenant, bucket_name, bucket_id, bucket_info, bucket);
     if (ret < 0) {
       cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret) << std::endl;
       return -ret;
@@ -2587,7 +2602,7 @@ next:
         cerr << "ERROR: need to specify bucket name" << std::endl;
         return EINVAL;
       }
-      do_check_object_locator(bucket_name, fix, remove_bad, formatter);
+      do_check_object_locator(tenant, bucket_name, fix, remove_bad, formatter);
     } else {
       RGWBucketAdminOp::check_index(store, bucket_op, f);
     }
@@ -2697,7 +2712,7 @@ next:
   if (opt_cmd == OPT_USER_STATS) {
     if (sync_stats) {
       if (!bucket_name.empty()) {
-        int ret = rgw_bucket_sync_user_stats(store, bucket_name);
+        int ret = rgw_bucket_sync_user_stats(store, tenant, bucket_name);
         if (ret < 0) {
           cerr << "ERROR: could not sync bucket stats: " << cpp_strerror(-ret) << std::endl;
           return -ret;
@@ -2716,7 +2731,8 @@ next:
       return EINVAL;
     }
     cls_user_header header;
-    int ret = store->cls_user_get_header(user_id, &header);
+    string user_str = user_id.to_str();
+    int ret = store->cls_user_get_header(user_str, &header);
     if (ret < 0) {
       cerr << "ERROR: can't read user header: " << cpp_strerror(-ret) << std::endl;
       return -ret;
@@ -2870,7 +2886,7 @@ next:
       return -EINVAL;
     }
     RGWBucketInfo bucket_info;
-    int ret = init_bucket(bucket_name, bucket_id, bucket_info, bucket);
+    int ret = init_bucket(tenant, bucket_name, bucket_id, bucket_info, bucket);
     if (ret < 0) {
       cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret) << std::endl;
       return -ret;
@@ -2910,7 +2926,7 @@ next:
       return -EINVAL;
     }
     RGWBucketInfo bucket_info;
-    int ret = init_bucket(bucket_name, bucket_id, bucket_info, bucket);
+    int ret = init_bucket(tenant, bucket_name, bucket_id, bucket_info, bucket);
     if (ret < 0) {
       cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret) << std::endl;
       return -ret;
@@ -3092,7 +3108,7 @@ next:
         return -EINVAL;
       }
       RGWBucketInfo bucket_info;
-      int ret = init_bucket(bucket_name, bucket_id, bucket_info, bucket);
+      int ret = init_bucket(tenant, bucket_name, bucket_id, bucket_info, bucket);
       if (ret < 0) {
         cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret) << std::endl;
         return -ret;
@@ -3143,7 +3159,7 @@ next:
         return -EINVAL;
       }
       RGWBucketInfo bucket_info;
-      int ret = init_bucket(bucket_name, bucket_id, bucket_info, bucket);
+      int ret = init_bucket(tenant, bucket_name, bucket_id, bucket_info, bucket);
       if (ret < 0) {
         cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret) << std::endl;
         return -ret;
@@ -3205,7 +3221,7 @@ next:
         return -EINVAL;
       }
       RGWBucketInfo bucket_info;
-      int ret = init_bucket(bucket_name, bucket_id, bucket_info, bucket);
+      int ret = init_bucket(tenant, bucket_name, bucket_id, bucket_info, bucket);
       if (ret < 0) {
         cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret) << std::endl;
         return -ret;
@@ -3233,7 +3249,8 @@ next:
         cerr << "ERROR: invalid quota scope specification." << std::endl;
         return EINVAL;
       }
-      set_bucket_quota(store, opt_cmd, bucket_name, max_size, max_objects, have_max_size, have_max_objects);
+      set_bucket_quota(store, opt_cmd, tenant, bucket_name,
+                       max_size, max_objects, have_max_size, have_max_objects);
     } else if (!user_id.empty()) {
       if (quota_scope == "bucket") {
         set_user_bucket_quota(opt_cmd, user, user_op, max_size, max_objects, have_max_size, have_max_objects);
diff --git a/src/rgw/rgw_basic_types.cc b/src/rgw/rgw_basic_types.cc
new file mode 100644
index 0000000..5ebf1cf
--- /dev/null
+++ b/src/rgw/rgw_basic_types.cc
@@ -0,0 +1,14 @@
+#include "rgw_basic_types.h"
+#include "common/ceph_json.h"
+
+// Parse the JSON node's data string into val via rgw_user::from_str().
+void decode_json_obj(rgw_user& val, JSONObj *obj)
+{
+  val.from_str(obj->get_data());
+}
+
+// Dump val under `name` as one string, in its to_str() form.
+void encode_json(const char *name, const rgw_user& val, Formatter *f)
+{
+  f->dump_string(name, val.to_str());
+}
diff --git a/src/rgw/rgw_basic_types.h b/src/rgw/rgw_basic_types.h
new file mode 100644
index 0000000..4a9c42c
--- /dev/null
+++ b/src/rgw/rgw_basic_types.h
@@ -0,0 +1,111 @@
+#ifndef CEPH_RGW_BASIC_TYPES_H
+#define CEPH_RGW_BASIC_TYPES_H
+
+#include <string>
+
+#include "include/types.h"
+
+// A user identity split into an optional tenant and a user id.
+// Its string form is "tenant$id", or just "id" when tenant is empty.
+struct rgw_user {
+  std::string tenant;
+  std::string id;
+
+  rgw_user() {}
+  // Converting constructor (not explicit): a std::string converts
+  // implicitly to rgw_user by way of from_str().
+  rgw_user(const std::string& s) {
+    from_str(s);
+  }
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(1, 1, bl);
+    ::encode(tenant, bl);
+    ::encode(id, bl);
+    ENCODE_FINISH(bl);
+  }
+  void decode(bufferlist::iterator& bl) {
+    DECODE_START(1, bl);
+    ::decode(tenant, bl);
+    ::decode(id, bl);
+    DECODE_FINISH(bl);
+  }
+
+  // Write the string form into str, replacing its previous contents.
+  void to_str(std::string& str) const {
+    if (tenant.empty()) {
+      str = id;
+    } else {
+      str = tenant + '$' + id;
+    }
+  }
+
+  void clear() {
+    tenant.clear();
+    id.clear();
+  }
+
+  // Emptiness is decided by the id alone; the tenant is not consulted.
+  bool empty() const {
+    return id.empty();
+  }
+
+  std::string to_str() const {
+    std::string s;
+    to_str(s);
+    return s;
+  }
+
+  // Split str at the first '$': the text before it becomes the tenant,
+  // the rest the id. Without a '$' the tenant is cleared.
+  void from_str(const std::string& str) {
+    const std::string::size_type pos = str.find('$');
+    if (pos != std::string::npos) {
+      tenant = str.substr(0, pos);
+      id = str.substr(pos + 1);
+    } else {
+      tenant.clear();
+      id = str;
+    }
+  }
+
+  rgw_user& operator=(const std::string& str) {
+    from_str(str);
+    return *this;
+  }
+
+  // Three-way comparison: tenant first, then id.
+  int compare(const rgw_user& u) const {
+    const int r = tenant.compare(u.tenant);
+    return (r != 0) ? r : id.compare(u.id);
+  }
+  int compare(const std::string& str) const {
+    return compare(rgw_user(str));
+  }
+
+  bool operator!=(const rgw_user& rhs) const {
+    return (compare(rhs) != 0);
+  }
+  bool operator==(const rgw_user& rhs) const {
+    return (compare(rhs) == 0);
+  }
+  bool operator<(const rgw_user& rhs) const {
+    return (compare(rhs) < 0);
+  }
+};
+WRITE_CLASS_ENCODER(rgw_user)
+
+
+class JSONObj;
+
+void decode_json_obj(rgw_user& val, JSONObj *obj);
+void encode_json(const char *name, const rgw_user& val, Formatter *f);
+
+// Stream an rgw_user in its to_str() form.
+inline ostream& operator<<(ostream& out, const rgw_user &u) {
+  out << u.to_str();
+  return out;
+}
+
+
+#endif
diff --git a/src/rgw/rgw_bucket.cc b/src/rgw/rgw_bucket.cc
index b857a5f..f4a245f 100644
--- a/src/rgw/rgw_bucket.cc
+++ b/src/rgw/rgw_bucket.cc
@@ -32,18 +32,60 @@ static RGWMetadataHandler *bucket_meta_handler = NULL;
 static RGWMetadataHandler *bucket_instance_meta_handler = NULL;
 
 // define as static when RGWBucket implementation compete
-void rgw_get_buckets_obj(const string& user_id, string& buckets_obj_id)
+void rgw_get_buckets_obj(const rgw_user& user_id, string& buckets_obj_id)
 {
-  buckets_obj_id = user_id;
+  buckets_obj_id = user_id.to_str();
   buckets_obj_id += RGW_BUCKETS_OBJ_SUFFIX;
 }
 
+/*
+ * Note that this is not a reversal of parse_bucket(). That one deals
+ * with the syntax we need in metadata and such. This one deals with
+ * the representation in RADOS pools. We chose '/' because it's not
+ * acceptable in bucket names and thus qualified buckets cannot conflict
+ * with the legacy or S3 buckets.
+ */
+void rgw_make_bucket_entry_name(const string& tenant_name, const string& bucket_name, string& bucket_entry) {
+  // A bucket without a tenant keeps its bare name.
+  if (tenant_name.empty())
+    bucket_entry = bucket_name;
+  else
+    bucket_entry = tenant_name + "/" + bucket_name;
+}
+
+/*
+ * Value-returning form of the three-argument overload: yields
+ * "tenant/bucket", or the bare bucket name when the tenant is empty.
+ */
+string rgw_make_bucket_entry_name(const string& tenant_name, const string& bucket_name) {
+  string bucket_entry;
+
+  rgw_make_bucket_entry_name(tenant_name, bucket_name, bucket_entry);
+
+  return bucket_entry;
+}
+
+// Tenants are separated from buckets in URLs by a colon in S3.
+// This function is not to be used on Swift URLs, not even for COPY arguments.
+// When there is no colon, the tenant is cleared and all of it is the bucket.
+void rgw_parse_url_bucket(const string &bucket,
+                          string &tenant_name, string &bucket_name) {
+  const string::size_type pos = bucket.find(':');
+  if (pos == string::npos) {
+    tenant_name.clear();
+    bucket_name = bucket;
+    return;
+  }
+  tenant_name = bucket.substr(0, pos);
+  bucket_name = bucket.substr(pos + 1);
+}
+
 /**
  * Get all the buckets owned by a user and fill up an RGWUserBuckets with them.
  * Returns: 0 on success, -ERR# on failure.
  */
 int rgw_read_user_buckets(RGWRados * store,
-                          string user_id,
+                          const rgw_user& user_id,
                           RGWUserBuckets& buckets,
                           const string& marker,
                           uint64_t max,
@@ -95,7 +137,7 @@ int rgw_read_user_buckets(RGWRados * store,
   return 0;
 }
 
-int rgw_bucket_sync_user_stats(RGWRados *store, const string& user_id, rgw_bucket& bucket)
+int rgw_bucket_sync_user_stats(RGWRados *store, const rgw_user& user_id, rgw_bucket& bucket)
 {
   string buckets_obj_id;
   rgw_get_buckets_obj(user_id, buckets_obj_id);
@@ -104,11 +146,11 @@ int rgw_bucket_sync_user_stats(RGWRados *store, const string& user_id, rgw_bucke
   return store->cls_user_sync_bucket_stats(obj, bucket);
 }
 
-int rgw_bucket_sync_user_stats(RGWRados *store, const string& bucket_name)
+int rgw_bucket_sync_user_stats(RGWRados *store, const string& tenant_name, const string& bucket_name)
 {
   RGWBucketInfo bucket_info;
   RGWObjectCtx obj_ctx(store);
-  int ret = store->get_bucket_info(obj_ctx, bucket_name, bucket_info, NULL);
+  int ret = store->get_bucket_info(obj_ctx, tenant_name, bucket_name, bucket_info, NULL);
   if (ret < 0) {
     ldout(store->ctx(), 0) << "ERROR: could not fetch bucket info: ret=" << ret << dendl;
     return ret;
@@ -123,9 +165,10 @@ int rgw_bucket_sync_user_stats(RGWRados *store, const string& bucket_name)
   return 0;
 }
 
-int rgw_link_bucket(RGWRados *store, string user_id, rgw_bucket& bucket, time_t creation_time, bool update_entrypoint)
+int rgw_link_bucket(RGWRados *store, const rgw_user& user_id, rgw_bucket& bucket, time_t creation_time, bool update_entrypoint)
 {
   int ret;
+  string& tenant_name = bucket.tenant;
   string& bucket_name = bucket.name;
 
   cls_user_bucket_entry new_bucket;
@@ -144,7 +187,7 @@ int rgw_link_bucket(RGWRados *store, string user_id, rgw_bucket& bucket, time_t
   RGWObjectCtx obj_ctx(store);
 
   if (update_entrypoint) {
-    ret = store->get_bucket_entrypoint_info(obj_ctx, bucket_name, ep, &ot, NULL, &attrs);
+    ret = store->get_bucket_entrypoint_info(obj_ctx, tenant_name, bucket_name, ep, &ot, NULL, &attrs);
     if (ret < 0 && ret != -ENOENT) {
       ldout(store->ctx(), 0) << "ERROR: store->get_bucket_entrypoint_info() returned " << ret << dendl;
     } else if (ret >= 0 && ep.linked && ep.owner != user_id) {
@@ -169,20 +212,20 @@ int rgw_link_bucket(RGWRados *store, string user_id, rgw_bucket& bucket, time_t
 
   ep.linked = true;
   ep.owner = user_id;
-  ret = store->put_bucket_entrypoint_info(bucket_name, ep, false, ot, 0, &attrs);
+  ret = store->put_bucket_entrypoint_info(tenant_name, bucket_name, ep, false, ot, 0, &attrs);
   if (ret < 0)
     goto done_err;
 
   return 0;
 done_err:
-  int r = rgw_unlink_bucket(store, user_id, bucket.name);
+  int r = rgw_unlink_bucket(store, user_id, bucket.tenant, bucket.name);
   if (r < 0) {
     ldout(store->ctx(), 0) << "ERROR: failed unlinking bucket on error cleanup: " << cpp_strerror(-r) << dendl;
   }
   return ret;
 }
 
-int rgw_unlink_bucket(RGWRados *store, string user_id, const string& bucket_name, bool update_entrypoint)
+int rgw_unlink_bucket(RGWRados *store, const rgw_user& user_id, const string& tenant_name, const string& bucket_name, bool update_entrypoint)
 {
   int ret;
 
@@ -207,7 +250,7 @@ int rgw_unlink_bucket(RGWRados *store, string user_id, const string& bucket_name
   RGWObjVersionTracker ot;
   map<string, bufferlist> attrs;
   RGWObjectCtx obj_ctx(store);
-  ret = store->get_bucket_entrypoint_info(obj_ctx, bucket_name, ep, &ot, NULL, &attrs);
+  ret = store->get_bucket_entrypoint_info(obj_ctx, tenant_name, bucket_name, ep, &ot, NULL, &attrs);
   if (ret == -ENOENT)
     return 0;
   if (ret < 0)
@@ -222,7 +265,7 @@ int rgw_unlink_bucket(RGWRados *store, string user_id, const string& bucket_name
   }
 
   ep.linked = false;
-  ret = store->put_bucket_entrypoint_info(bucket_name, ep, false, ot, 0, &attrs);
+  ret = store->put_bucket_entrypoint_info(tenant_name, bucket_name, ep, false, ot, 0, &attrs);
   if (ret < 0)
     return ret;
 
@@ -281,7 +324,7 @@ int rgw_bucket_set_attrs(RGWRados *store, RGWBucketInfo& bucket_info,
   if (!bucket_info.has_instance_obj) {
     /* an old bucket object, need to convert it */
     RGWObjectCtx obj_ctx(store);
-    int ret = store->convert_old_bucket_info(obj_ctx, bucket.name);
+    int ret = store->convert_old_bucket_info(obj_ctx, bucket.tenant, bucket.name);
     if (ret < 0) {
       ldout(store->ctx(), 0) << "ERROR: failed converting old bucket info: " << ret << dendl;
       return ret;
@@ -313,7 +356,7 @@ static void dump_mulipart_index_results(list<rgw_obj_key>& objs_to_unlink,
   f->close_section();
 }
 
-void check_bad_user_bucket_mapping(RGWRados *store, const string& user_id, bool fix)
+void check_bad_user_bucket_mapping(RGWRados *store, const rgw_user& user_id, bool fix)
 {
   RGWUserBuckets user_buckets;
   bool done;
@@ -342,7 +385,7 @@ void check_bad_user_bucket_mapping(RGWRados *store, const string& user_id, bool
       RGWBucketInfo bucket_info;
       time_t mtime;
       RGWObjectCtx obj_ctx(store);
-      int r = store->get_bucket_info(obj_ctx, bucket.name, bucket_info, &mtime);
+      int r = store->get_bucket_info(obj_ctx, user_id.tenant, bucket.name, bucket_info, &mtime);
       if (r < 0) {
         ldout(store->ctx(), 0) << "could not get bucket info for bucket=" << bucket << dendl;
         continue;
@@ -351,6 +394,7 @@ void check_bad_user_bucket_mapping(RGWRados *store, const string& user_id, bool
       rgw_bucket& actual_bucket = bucket_info.bucket;
 
       if (actual_bucket.name.compare(bucket.name) != 0 ||
+          actual_bucket.tenant.compare(bucket.tenant) != 0 ||
           actual_bucket.data_pool.compare(bucket.data_pool) != 0 ||
           actual_bucket.index_pool.compare(bucket.index_pool) != 0 ||
           actual_bucket.marker.compare(bucket.marker) != 0 ||
@@ -392,7 +436,7 @@ int rgw_remove_object(RGWRados *store, RGWBucketInfo& bucket_info, rgw_bucket& b
   return ret;
 }
 
-int rgw_remove_bucket(RGWRados *store, const string& bucket_owner, rgw_bucket& bucket, bool delete_children)
+int rgw_remove_bucket(RGWRados *store, rgw_bucket& bucket, bool delete_children)
 {
   int ret;
   map<RGWObjCategory, RGWStorageStats> stats;
@@ -411,7 +455,7 @@ int rgw_remove_bucket(RGWRados *store, const string& bucket_owner, rgw_bucket& b
 
   obj.bucket = bucket;
 
-  ret = store->get_bucket_info(obj_ctx, bucket.name, info, NULL);
+  ret = store->get_bucket_info(obj_ctx, bucket.tenant, bucket.name, info, NULL);
   if (ret < 0)
     return ret;
 
@@ -450,7 +494,7 @@ int rgw_remove_bucket(RGWRados *store, const string& bucket_owner, rgw_bucket& b
     return ret;
   }
 
-  ret = rgw_unlink_bucket(store, info.owner, bucket.name);
+  ret = rgw_unlink_bucket(store, info.owner, bucket.tenant, bucket.name, false);
   if (ret < 0) {
     lderr(store->ctx()) << "ERROR: unable to remove user bucket information" << dendl;
   }
@@ -458,9 +502,15 @@ int rgw_remove_bucket(RGWRados *store, const string& bucket_owner, rgw_bucket& b
   return ret;
 }
 
-int rgw_bucket_delete_bucket_obj(RGWRados *store, string& bucket_name, RGWObjVersionTracker& objv_tracker)
+int rgw_bucket_delete_bucket_obj(RGWRados *store,
+                                 const string& tenant_name,
+                                 const string& bucket_name,
+                                 RGWObjVersionTracker& objv_tracker)
 {
-  return store->meta_mgr->remove_entry(bucket_meta_handler, bucket_name, &objv_tracker);
+  string key;
+
+  rgw_make_bucket_entry_name(tenant_name, bucket_name, key);
+  return store->meta_mgr->remove_entry(bucket_meta_handler, key, &objv_tracker);
 }
 
 static void set_err_msg(std::string *sink, std::string msg)
@@ -476,7 +526,8 @@ int RGWBucket::init(RGWRados *storage, RGWBucketAdminOpState& op_state)
 
   store = storage;
 
-  string user_id = op_state.get_user_id();
+  rgw_user user_id = op_state.get_user_id();
+  tenant = user_id.tenant;
   bucket_name = op_state.get_bucket_name();
   RGWUserBuckets user_buckets;
   RGWObjectCtx obj_ctx(store);
@@ -485,7 +536,7 @@ int RGWBucket::init(RGWRados *storage, RGWBucketAdminOpState& op_state)
     return -EINVAL;
 
   if (!bucket_name.empty()) {
-    int r = store->get_bucket_info(obj_ctx, bucket_name, bucket_info, NULL);
+    int r = store->get_bucket_info(obj_ctx, tenant, bucket_name, bucket_info, NULL);
     if (r < 0) {
       ldout(store->ctx(), 0) << "could not get bucket info for bucket=" << bucket_name << dendl;
       return r;
@@ -551,9 +602,9 @@ int RGWBucket::link(RGWBucketAdminOpState& op_state, std::string *err_msg)
       return -EIO;
     }
 
-    r = rgw_unlink_bucket(store, owner.get_id(), bucket.name);
+    r = rgw_unlink_bucket(store, owner.get_id(), bucket.tenant, bucket.name);
     if (r < 0) {
-      set_err_msg(err_msg, "could not unlink policy from user " + owner.get_id());
+      set_err_msg(err_msg, "could not unlink policy from user " + owner.get_id().to_str());
       return r;
     }
 
@@ -595,7 +646,7 @@ int RGWBucket::unlink(RGWBucketAdminOpState& op_state, std::string *err_msg)
     return -EINVAL;
   }
 
-  int r = rgw_unlink_bucket(store, user_info.user_id, bucket.name);
+  int r = rgw_unlink_bucket(store, user_info.user_id, bucket.tenant, bucket.name);
   if (r < 0) {
     set_err_msg(err_msg, "error unlinking bucket" + cpp_strerror(-r));
   }
@@ -608,7 +659,7 @@ int RGWBucket::remove(RGWBucketAdminOpState& op_state, std::string *err_msg)
   bool delete_children = op_state.will_delete_children();
   rgw_bucket bucket = op_state.get_bucket();
 
-  int ret = rgw_remove_bucket(store, bucket_info.owner, bucket, delete_children);
+  int ret = rgw_remove_bucket(store, bucket, delete_children);
   if (ret < 0) {
     set_err_msg(err_msg, "unable to remove bucket" + cpp_strerror(-ret));
     return ret;
@@ -847,7 +898,7 @@ int RGWBucket::get_policy(RGWBucketAdminOpState& op_state, ostream& o)
 
   RGWBucketInfo bucket_info;
   map<string, bufferlist> attrs;
-  int ret = store->get_bucket_info(obj_ctx, bucket.name, bucket_info, NULL, &attrs);
+  int ret = store->get_bucket_info(obj_ctx, bucket.tenant, bucket.name, bucket_info, NULL, &attrs);
   if (ret < 0) {
     return ret;
   }
@@ -878,7 +929,7 @@ int RGWBucket::get_policy(RGWBucketAdminOpState& op_state, ostream& o)
 int RGWBucketAdminOp::get_policy(RGWRados *store, RGWBucketAdminOpState& op_state,
                   ostream& os)
 {
-   RGWBucket bucket;
+  RGWBucket bucket;
 
   int ret = bucket.init(store, op_state);
   if (ret < 0)
@@ -1001,7 +1052,7 @@ int RGWBucketAdminOp::remove_object(RGWRados *store, RGWBucketAdminOpState& op_s
   return bucket.remove_object(op_state);
 }
 
-static int bucket_stats(RGWRados *store, std::string&  bucket_name, Formatter *formatter)
+static int bucket_stats(RGWRados *store, const std::string& tenant_name, std::string&  bucket_name, Formatter *formatter)
 {
   RGWBucketInfo bucket_info;
   rgw_bucket bucket;
@@ -1009,7 +1060,7 @@ static int bucket_stats(RGWRados *store, std::string&  bucket_name, Formatter *f
 
   time_t mtime;
   RGWObjectCtx obj_ctx(store);
-  int r = store->get_bucket_info(obj_ctx, bucket_name, bucket_info, &mtime);
+  int r = store->get_bucket_info(obj_ctx, tenant_name, bucket_name, bucket_info, &mtime);
   if (r < 0)
     return r;
 
@@ -1031,7 +1082,7 @@ static int bucket_stats(RGWRados *store, std::string&  bucket_name, Formatter *f
   formatter->dump_string("index_pool", bucket.index_pool);
   formatter->dump_string("id", bucket.bucket_id);
   formatter->dump_string("marker", bucket.marker);
-  formatter->dump_string("owner", bucket_info.owner);
+  ::encode_json("owner", bucket_info.owner, formatter);
   formatter->dump_string("ver", bucket_ver);
   formatter->dump_string("master_ver", master_ver);
   formatter->dump_stream("mtime") << ut;
@@ -1066,6 +1117,7 @@ int RGWBucketAdminOp::info(RGWRados *store, RGWBucketAdminOpState& op_state,
   size_t max_entries = cct->_conf->rgw_list_buckets_max_chunk;
 
   bool show_stats = op_state.will_fetch_stats();
+  rgw_user user_id = op_state.get_user_id();
   if (op_state.is_user_op()) {
     formatter->open_array_section("buckets");
 
@@ -1074,7 +1126,7 @@ int RGWBucketAdminOp::info(RGWRados *store, RGWBucketAdminOpState& op_state,
     bool done;
 
     do {
-      ret = rgw_read_user_buckets(store, op_state.get_user_id(), buckets, marker, max_entries, false);
+      ret = rgw_read_user_buckets(store, user_id, buckets, marker, max_entries, false);
       if (ret < 0)
         return ret;
 
@@ -1084,7 +1136,7 @@ int RGWBucketAdminOp::info(RGWRados *store, RGWBucketAdminOpState& op_state,
       for (iter = m.begin(); iter != m.end(); ++iter) {
         std::string  obj_name = iter->first;
         if (show_stats)
-          bucket_stats(store, obj_name, formatter);
+          bucket_stats(store, user_id.tenant, obj_name, formatter);
         else
           formatter->dump_string("bucket", obj_name);
 
@@ -1097,7 +1149,7 @@ int RGWBucketAdminOp::info(RGWRados *store, RGWBucketAdminOpState& op_state,
 
     formatter->close_section();
   } else if (!bucket_name.empty()) {
-    bucket_stats(store, bucket_name, formatter);
+    bucket_stats(store, user_id.tenant, bucket_name, formatter);
   } else {
     RGWAccessHandle handle;
 
@@ -1107,7 +1159,7 @@ int RGWBucketAdminOp::info(RGWRados *store, RGWBucketAdminOpState& op_state,
       while (store->list_buckets_next(obj, &handle) >= 0) {
 	formatter->dump_string("bucket", obj.key.name);
         if (show_stats)
-          bucket_stats(store, obj.key.name, formatter);
+          bucket_stats(store, user_id.tenant, obj.key.name, formatter);
       }
     }
 
@@ -1537,7 +1589,9 @@ public:
     map<string, bufferlist> attrs;
     RGWObjectCtx obj_ctx(store);
 
-    int ret = store->get_bucket_entrypoint_info(obj_ctx, entry, be, &ot, &mtime, &attrs);
+    string tenant_name, bucket_name;
+    parse_bucket(entry, tenant_name, bucket_name);
+    int ret = store->get_bucket_entrypoint_info(obj_ctx, tenant_name, bucket_name, be, &ot, &mtime, &attrs);
     if (ret < 0)
       return ret;
 
@@ -1559,7 +1613,9 @@ public:
     RGWObjVersionTracker old_ot;
     RGWObjectCtx obj_ctx(store);
 
-    int ret = store->get_bucket_entrypoint_info(obj_ctx, entry, old_be, &old_ot, &orig_mtime, &attrs);
+    string tenant_name, bucket_name;
+    parse_bucket(entry, tenant_name, bucket_name);
+    int ret = store->get_bucket_entrypoint_info(obj_ctx, tenant_name, bucket_name, old_be, &old_ot, &orig_mtime, &attrs);
     if (ret < 0 && ret != -ENOENT)
       return ret;
 
@@ -1572,7 +1628,7 @@ public:
 
     objv_tracker.read_version = old_ot.read_version; /* maintain the obj version we just read */
 
-    ret = store->put_bucket_entrypoint_info(entry, be, false, objv_tracker, mtime, &attrs);
+    ret = store->put_bucket_entrypoint_info(tenant_name, bucket_name, be, false, objv_tracker, mtime, &attrs);
     if (ret < 0)
       return ret;
 
@@ -1580,7 +1636,7 @@ public:
     if (be.linked) {
       ret = rgw_link_bucket(store, be.owner, be.bucket, be.creation_time, false);
     } else {
-      ret = rgw_unlink_bucket(store, be.owner, be.bucket.name, false);
+      ret = rgw_unlink_bucket(store, be.owner, be.bucket.tenant, be.bucket.name, false);
     }
 
     return ret;
@@ -1595,7 +1651,9 @@ public:
     RGWBucketEntryPoint be;
     RGWObjectCtx obj_ctx(store);
 
-    int ret = store->get_bucket_entrypoint_info(obj_ctx, entry, be, &objv_tracker, NULL, NULL);
+    string tenant_name, bucket_name;
+    parse_bucket(entry, tenant_name, bucket_name);
+    int ret = store->get_bucket_entrypoint_info(obj_ctx, tenant_name, bucket_name, be, &objv_tracker, NULL, NULL);
     if (ret < 0)
       return ret;
 
@@ -1604,12 +1662,12 @@ public:
      * it immediately and don't want to invalidate our cached objv_version or the bucket obj removal
      * will incorrectly fail.
      */
-    ret = rgw_unlink_bucket(store, be.owner, entry, false);
+    ret = rgw_unlink_bucket(store, be.owner, tenant_name, bucket_name, false);
     if (ret < 0) {
       lderr(store->ctx()) << "could not unlink bucket=" << entry << " owner=" << be.owner << dendl;
     }
 
-    ret = rgw_bucket_delete_bucket_obj(store, entry, objv_tracker);
+    ret = rgw_bucket_delete_bucket_obj(store, tenant_name, bucket_name, objv_tracker);
     if (ret < 0) {
       lderr(store->ctx()) << "could not delete bucket=" << entry << dendl;
     }
@@ -1695,7 +1753,7 @@ public:
     return 0;
   }
 
-  int put(RGWRados *store, string& oid, RGWObjVersionTracker& objv_tracker,
+  int put(RGWRados *store, string& entry, RGWObjVersionTracker& objv_tracker,
           time_t mtime, JSONObj *obj, sync_type_t sync_type) {
     RGWBucketCompleteInfo bci, old_bci;
     decode_json_obj(bci, obj);
@@ -1703,16 +1761,21 @@ public:
     time_t orig_mtime;
     RGWObjectCtx obj_ctx(store);
 
-    int ret = store->get_bucket_instance_info(obj_ctx, oid, old_bci.info, &orig_mtime, &old_bci.attrs);
+    int ret = store->get_bucket_instance_info(obj_ctx, entry, old_bci.info,
+            &orig_mtime, &old_bci.attrs);
     bool exists = (ret != -ENOENT);
     if (ret < 0 && exists)
       return ret;
 
-
     if (!exists || old_bci.info.bucket.bucket_id != bci.info.bucket.bucket_id) {
       /* a new bucket, we need to select a new bucket placement for it */
+      string tenant_name;
+      string bucket_name;
+      parse_bucket(entry, tenant_name, bucket_name);
+
       rgw_bucket bucket;
-      ret = store->set_bucket_location_by_rule(bci.info.placement_rule, oid, bucket);
+      ret = store->set_bucket_location_by_rule(bci.info.placement_rule,
+                                           tenant_name, bucket_name, bucket);
       if (ret < 0) {
         ldout(store->ctx(), 0) << "ERROR: select_bucket_placement() returned " << ret << dendl;
         return ret;
diff --git a/src/rgw/rgw_bucket.h b/src/rgw/rgw_bucket.h
index 222e152..9191fbe 100644
--- a/src/rgw/rgw_bucket.h
+++ b/src/rgw/rgw_bucket.h
@@ -23,7 +23,7 @@
 using namespace std;
 
 // define as static when RGWBucket implementation compete
-extern void rgw_get_buckets_obj(const string& user_id, string& buckets_obj_id);
+extern void rgw_get_buckets_obj(const rgw_user& user_id, string& buckets_obj_id);
 
 extern int rgw_bucket_store_info(RGWRados *store, const string& bucket_name, bufferlist& bl, bool exclusive,
                                  map<string, bufferlist> *pattrs, RGWObjVersionTracker *objv_tracker,
@@ -36,10 +36,21 @@ extern int rgw_bucket_parse_bucket_instance(const string& bucket_instance, strin
 
 extern int rgw_bucket_instance_remove_entry(RGWRados *store, string& entry, RGWObjVersionTracker *objv_tracker);
 
-extern int rgw_bucket_delete_bucket_obj(RGWRados *store, string& bucket_name, RGWObjVersionTracker& objv_tracker);
+extern int rgw_bucket_delete_bucket_obj(RGWRados *store,
+                                        const string& tenant_name,
+                                        const string& bucket_name,
+                                        RGWObjVersionTracker& objv_tracker);
 
-extern int rgw_bucket_sync_user_stats(RGWRados *store, const string& user_id, rgw_bucket& bucket);
-extern int rgw_bucket_sync_user_stats(RGWRados *store, const string& bucket_name);
+extern int rgw_bucket_sync_user_stats(RGWRados *store, const rgw_user& user_id, rgw_bucket& bucket);
+extern int rgw_bucket_sync_user_stats(RGWRados *store, const string& tenant_name, const string& bucket_name);
+
+extern void rgw_make_bucket_entry_name(const string& tenant_name,
+                                       const string& bucket_name,
+                                       string& bucket_entry);
+extern string rgw_make_bucket_entry_name(const string& tenant_name,
+                                       const string& bucket_name);
+extern void rgw_parse_url_bucket(const string &bucket,
+                                 string &tenant_name, string &bucket_name);
 
 /**
  * Store a list of the user's buckets, with associated functinos.
@@ -105,28 +116,29 @@ extern void rgw_bucket_init(RGWMetadataManager *mm);
  * Returns: 0 on success, -ERR# on failure.
  */
 extern int rgw_read_user_buckets(RGWRados *store,
-                                 string user_id,
+                                 const rgw_user& user_id,
                                  RGWUserBuckets& buckets,
                                  const string& marker,
                                  uint64_t max,
                                  bool need_stats,
                                  uint64_t default_amount = 1000);
 
-extern int rgw_link_bucket(RGWRados *store, string user_id, rgw_bucket& bucket, time_t creation_time, bool update_entrypoint = true);
-extern int rgw_unlink_bucket(RGWRados *store, string user_id, const string& bucket_name, bool update_entrypoint = true);
+extern int rgw_link_bucket(RGWRados *store, const rgw_user& user_id, rgw_bucket& bucket, time_t creation_time, bool update_entrypoint = true);
+extern int rgw_unlink_bucket(RGWRados *store, const rgw_user& user_id,
+                             const string& tenant_name, const string& bucket_name, bool update_entrypoint = true);
 
 extern int rgw_remove_object(RGWRados *store, RGWBucketInfo& bucket_info, rgw_bucket& bucket, rgw_obj_key& key);
-extern int rgw_remove_bucket(RGWRados *store, const string& bucket_owner, rgw_bucket& bucket, bool delete_children);
+extern int rgw_remove_bucket(RGWRados *store, rgw_bucket& bucket, bool delete_children);
 
 extern int rgw_bucket_set_attrs(RGWRados *store, RGWBucketInfo& bucket_info,
                                 map<string, bufferlist>& attrs,
                                 map<string, bufferlist>* rmattrs,
                                 RGWObjVersionTracker *objv_tracker);
 
-extern void check_bad_user_bucket_mapping(RGWRados *store, const string& user_id, bool fix);
+extern void check_bad_user_bucket_mapping(RGWRados *store, const rgw_user& user_id, bool fix);
 
 struct RGWBucketAdminOpState {
-  std::string uid;
+  rgw_user uid;
   std::string display_name;
   std::string bucket_name;
   std::string bucket_id;
@@ -146,7 +158,7 @@ struct RGWBucketAdminOpState {
   void set_fix_index(bool value) { fix_index = value; }
   void set_delete_children(bool value) { delete_child_objects = value; }
 
-  void set_user_id(std::string& user_id) {
+  void set_user_id(rgw_user& user_id) {
     if (!user_id.empty())
       uid = user_id;
   }
@@ -157,7 +169,7 @@ struct RGWBucketAdminOpState {
     object_name = object_str;
   }
 
-  std::string& get_user_id() { return uid; }
+  rgw_user& get_user_id() { return uid; }
   std::string& get_user_display_name() { return display_name; }
   std::string& get_bucket_name() { return bucket_name; }
   std::string& get_object_name() { return object_name; }
@@ -197,14 +209,13 @@ class RGWBucket
   RGWAccessHandle handle;
 
   RGWUserInfo user_info;
+  std::string tenant;
   std::string bucket_name;
 
   bool failure;
 
   RGWBucketInfo bucket_info;
 
-private:
-
 public:
   RGWBucket() : store(NULL), handle(NULL), failure(false) {}
   int init(RGWRados *storage, RGWBucketAdminOpState& op_state);
diff --git a/src/rgw/rgw_client_io.h b/src/rgw/rgw_client_io.h
index dc90db3..1191d0c 100644
--- a/src/rgw/rgw_client_io.h
+++ b/src/rgw/rgw_client_io.h
@@ -4,6 +4,8 @@
 #ifndef CEPH_RGW_CLIENT_IO_H
 #define CEPH_RGW_CLIENT_IO_H
 
+#include <streambuf>
+#include <istream>
 #include <stdlib.h>
 
 #include "include/types.h"
@@ -50,4 +52,65 @@ public:
   uint64_t get_bytes_received() { return bytes_received; }
 };
 
+
+class RGWClientIOStreamBuf : public std::streambuf {
+protected:
+  RGWClientIO &cio;
+  std::size_t const window_size;
+  std::size_t const putback_size;
+  std::vector<char> buffer;
+
+public:
+  RGWClientIOStreamBuf(RGWClientIO &c, std::size_t ws, std::size_t ps = 1)
+    : cio(c),
+      window_size(ws),
+      putback_size(ps),
+      buffer(ws + ps)
+  {
+    setg(nullptr, nullptr, nullptr);
+  }
+
+  std::streambuf::int_type underflow() {
+    if (gptr() < egptr()) {
+      return traits_type::to_int_type(*gptr());
+    }
+
+    char * const base = buffer.data();
+    char * start;
+
+    if (nullptr != eback()) {
+      /* We need to skip moving bytes on first underflow. In such case
+       * there is simply no previous data we should preserve for unget()
+       * or something similar. */
+      std::memmove(base, egptr() - putback_size, putback_size);
+      start = base + putback_size;
+    } else {
+      start = base;
+    }
+
+    int read_len;
+    int ret = cio.read(base, window_size, &read_len);
+    if (ret < 0 || 0 == read_len) {
+      return traits_type::eof();
+    }
+
+    setg(base, start, start + read_len);
+
+    return traits_type::to_int_type(*gptr());
+  }
+};
+
+
+class RGWClientIOStream : private RGWClientIOStreamBuf, public std::istream {
+/* Inheritance from RGWClientIOStreamBuf is a kind of shadow, undirect
+ * form of composition here. We cannot do that explicitly because istream
+ * ctor is being called prior to construction of any member of this class. */
+
+public:
+  RGWClientIOStream(RGWClientIO &c)
+    : RGWClientIOStreamBuf(c, 1, 2),
+      istream(static_cast<RGWClientIOStreamBuf *>(this)) {
+  }
+};
+
 #endif
diff --git a/src/rgw/rgw_common.cc b/src/rgw/rgw_common.cc
index ae045f4..9e20fa3 100644
--- a/src/rgw/rgw_common.cc
+++ b/src/rgw/rgw_common.cc
@@ -171,7 +171,6 @@ req_state::req_state(CephContext *_cct, class RGWEnv *e) : cct(_cct), cio(NULL),
   bucket_exists = false;
   has_bad_meta = false;
   length = NULL;
-  copy_source = NULL;
   http_auth = NULL;
   local_source = false;
 
@@ -728,9 +727,11 @@ bool verify_requester_payer_permission(struct req_state *s)
   return false;
 }
 
-bool verify_bucket_permission(struct req_state *s, int perm)
+bool verify_bucket_permission(struct req_state * const s,
+                              RGWAccessControlPolicy * const bucket_acl,
+                              const int perm)
 {
-  if (!s->bucket_acl)
+  if (!bucket_acl)
     return false;
 
   if ((perm & (int)s->perm_mask) != perm)
@@ -739,21 +740,33 @@ bool verify_bucket_permission(struct req_state *s, int perm)
   if (!verify_requester_payer_permission(s))
     return false;
 
-  return s->bucket_acl->verify_permission(s->user.user_id, perm, perm);
+  return bucket_acl->verify_permission(s->user.user_id, perm, perm);
 }
 
-static inline bool check_deferred_bucket_acl(struct req_state *s, uint8_t deferred_check, int perm)
+bool verify_bucket_permission(struct req_state * const s, const int perm)
 {
-  return (s->defer_to_bucket_acls == deferred_check && verify_bucket_permission(s, perm));
+  return verify_bucket_permission(s, s->bucket_acl, perm);
 }
 
-bool verify_object_permission(struct req_state *s, RGWAccessControlPolicy *bucket_acl, RGWAccessControlPolicy *object_acl, int perm)
+static inline bool check_deferred_bucket_acl(struct req_state * const s,
+                                             RGWAccessControlPolicy * const bucket_acl,
+                                             const uint8_t deferred_check,
+                                             const int perm)
 {
   if (!verify_requester_payer_permission(s))
     return false;
 
-  if (check_deferred_bucket_acl(s, RGW_DEFER_TO_BUCKET_ACLS_RECURSE, perm) ||
-      check_deferred_bucket_acl(s, RGW_DEFER_TO_BUCKET_ACLS_FULL_CONTROL, RGW_PERM_FULL_CONTROL)) {
+  return (s->defer_to_bucket_acls == deferred_check \
+              && verify_bucket_permission(s, bucket_acl, perm));
+}
+
+bool verify_object_permission(struct req_state * const s,
+                              RGWAccessControlPolicy * const bucket_acl,
+                              RGWAccessControlPolicy * const object_acl,
+                              const int perm)
+{
+  if (check_deferred_bucket_acl(s, bucket_acl, RGW_DEFER_TO_BUCKET_ACLS_RECURSE, perm) ||
+      check_deferred_bucket_acl(s, bucket_acl, RGW_DEFER_TO_BUCKET_ACLS_FULL_CONTROL, RGW_PERM_FULL_CONTROL)) {
     return true;
   }
 
@@ -815,7 +828,7 @@ static char hex_to_num(char c)
   return hex_table.to_num(c);
 }
 
-bool url_decode(string& src_str, string& dest_str, bool in_query)
+bool url_decode(const string& src_str, string& dest_str, bool in_query)
 {
   const char *src = src_str.c_str();
   char dest[src_str.size() + 1];
@@ -996,7 +1009,7 @@ int RGWUserCaps::get_cap(const string& cap, string& type, uint32_t *pperm)
     trim_whitespace(cap.substr(0, pos), type);
   }
 
-  if (type.size() == 0)
+  if (!is_valid_cap_type(type))
     return -EINVAL;
 
   string cap_perm;
@@ -1156,6 +1169,27 @@ int RGWUserCaps::check_cap(const string& cap, uint32_t perm)
   return 0;
 }
 
+bool RGWUserCaps::is_valid_cap_type(const string& tp)
+{
+  static const char *cap_type[] = { "user",
+                                    "users",
+                                    "buckets",
+                                    "metadata",
+                                    "usage",
+                                    "zone",
+                                    "bilog",
+                                    "mdlog",
+                                    "datalog",
+                                    "opstate" };
+
+  for (unsigned int i = 0; i < sizeof(cap_type) / sizeof(char *); ++i) {
+    if (tp.compare(cap_type[i]) == 0) {
+      return true;
+    }
+  }
+
+  return false;
+}
 
 static struct rgw_name_to_flag op_type_mapping[] = { {"*",  RGW_OP_TYPE_ALL},
                   {"read",  RGW_OP_TYPE_READ},
diff --git a/src/rgw/rgw_common.h b/src/rgw/rgw_common.h
index 77bf2e8..c40f0b4 100644
--- a/src/rgw/rgw_common.h
+++ b/src/rgw/rgw_common.h
@@ -28,6 +28,7 @@
 #include "include/types.h"
 #include "include/utime.h"
 #include "rgw_acl.h"
+#include "rgw_basic_types.h"
 #include "rgw_cors.h"
 #include "rgw_quota.h"
 #include "rgw_string.h"
@@ -70,6 +71,10 @@ using ceph::crypto::MD5;
 #define RGW_ATTR_SHADOW_OBJ    	RGW_ATTR_PREFIX "shadow_name"
 #define RGW_ATTR_MANIFEST    	RGW_ATTR_PREFIX "manifest"
 #define RGW_ATTR_USER_MANIFEST  RGW_ATTR_PREFIX "user_manifest"
+#define RGW_ATTR_SLO_MANIFEST   RGW_ATTR_PREFIX "slo_manifest"
+/* Information whether an object is SLO or not must be exposed to
+ * user through custom HTTP header named X-Static-Large-Object. */
+#define RGW_ATTR_SLO_UINDICATOR RGW_ATTR_META_PREFIX "static-large-object"
 
 #define RGW_ATTR_TEMPURL_KEY1   RGW_ATTR_META_PREFIX "temp-url-key"
 #define RGW_ATTR_TEMPURL_KEY2   RGW_ATTR_META_PREFIX "temp-url-key-2"
@@ -150,6 +155,7 @@ using ceph::crypto::MD5;
 #define ERR_INVALID_ACCESS_KEY   2028
 #define ERR_MALFORMED_XML        2029
 #define ERR_USER_EXIST           2030
+#define ERR_NOT_SLO_MANIFEST     2031
 #define ERR_USER_SUSPENDED       2100
 #define ERR_INTERNAL_ERROR       2200
 #define ERR_NOT_IMPLEMENTED      2201
@@ -431,6 +437,7 @@ public:
      DECODE_FINISH(bl);
   }
   int check_cap(const string& cap, uint32_t perm);
+  bool is_valid_cap_type(const string& tp);
   void dump(Formatter *f) const;
   void dump(Formatter *f, const char *name) const;
 
@@ -446,7 +453,7 @@ void decode_json_obj(obj_version& v, JSONObj *obj);
 struct RGWUserInfo
 {
   uint64_t auid;
-  string user_id;
+  rgw_user user_id;
   string display_name;
   string user_email;
   map<string, RGWAccessKey> access_keys;
@@ -466,7 +473,7 @@ struct RGWUserInfo
   RGWUserInfo() : auid(0), suspended(0), max_buckets(RGW_DEFAULT_MAX_BUCKETS), op_mask(RGW_OP_TYPE_ALL), system(0) {}
 
   void encode(bufferlist& bl) const {
-     ENCODE_START(16, 9, bl);
+     ENCODE_START(17, 9, bl);
      ::encode(auid, bl);
      string access_key;
      string secret_key;
@@ -490,7 +497,7 @@ struct RGWUserInfo
      }
      ::encode(swift_name, bl);
      ::encode(swift_key, bl);
-     ::encode(user_id, bl);
+     ::encode(user_id.id, bl);
      ::encode(access_keys, bl);
      ::encode(subusers, bl);
      ::encode(suspended, bl);
@@ -504,10 +511,11 @@ struct RGWUserInfo
      ::encode(bucket_quota, bl);
      ::encode(temp_url_keys, bl);
      ::encode(user_quota, bl);
+     ::encode(user_id.tenant, bl);
      ENCODE_FINISH(bl);
   }
   void decode(bufferlist::iterator& bl) {
-     DECODE_START_LEGACY_COMPAT_LEN_32(16, 9, 9, bl);
+     DECODE_START_LEGACY_COMPAT_LEN_32(17, 9, 9, bl);
      if (struct_v >= 2) ::decode(auid, bl);
      else auid = CEPH_AUTH_UID_DEFAULT;
      string access_key;
@@ -522,14 +530,15 @@ struct RGWUserInfo
     }
     ::decode(display_name, bl);
     ::decode(user_email, bl);
+    /* We populate swift_keys map later nowadays, but we have to decode. */
     string swift_name;
     string swift_key;
     if (struct_v >= 3) ::decode(swift_name, bl);
     if (struct_v >= 4) ::decode(swift_key, bl);
     if (struct_v >= 5)
-      ::decode(user_id, bl);
+      ::decode(user_id.id, bl);
     else
-      user_id = access_key;
+      user_id.id = access_key;
     if (struct_v >= 6) {
       ::decode(access_keys, bl);
       ::decode(subusers, bl);
@@ -554,7 +563,6 @@ struct RGWUserInfo
     } else {
       op_mask = RGW_OP_TYPE_ALL;
     }
-    system = 0;
     if (struct_v >= 13) {
       ::decode(system, bl);
       ::decode(default_placement, bl);
@@ -569,25 +577,22 @@ struct RGWUserInfo
     if (struct_v >= 16) {
       ::decode(user_quota, bl);
     }
+    if (struct_v >= 17) {
+      ::decode(user_id.tenant, bl);
+    } else {
+      user_id.tenant.clear();
+    }
     DECODE_FINISH(bl);
   }
   void dump(Formatter *f) const;
   static void generate_test_instances(list<RGWUserInfo*>& o);
 
   void decode_json(JSONObj *obj);
-
-  void clear() {
-    user_id.clear();
-    display_name.clear();
-    user_email.clear();
-    auid = CEPH_AUTH_UID_DEFAULT;
-    access_keys.clear();
-    suspended = 0;
-  }
 };
 WRITE_CLASS_ENCODER(RGWUserInfo)
 
 struct rgw_bucket {
+  std::string tenant;
   std::string name;
   std::string data_pool;
   std::string data_extra_pool; /* if not set, then we should use data_pool instead */
@@ -613,8 +618,8 @@ struct rgw_bucket {
     data_pool = index_pool = n;
     marker = "";
   }
-  rgw_bucket(const char *n, const char *dp, const char *ip, const char *m, const char *id, const char *h) :
-    name(n), data_pool(dp), index_pool(ip), marker(m), bucket_id(id) {}
+  rgw_bucket(const char *t, const char *n, const char *dp, const char *ip, const char *m, const char *id, const char *h) :
+    tenant(t), name(n), data_pool(dp), index_pool(ip), marker(m), bucket_id(id) {}
 
   void convert(cls_user_bucket *b) {
     b->name = name;
@@ -626,17 +631,18 @@ struct rgw_bucket {
   }
 
   void encode(bufferlist& bl) const {
-     ENCODE_START(7, 3, bl);
+     ENCODE_START(8, 3, bl);
     ::encode(name, bl);
     ::encode(data_pool, bl);
     ::encode(marker, bl);
     ::encode(bucket_id, bl);
     ::encode(index_pool, bl);
     ::encode(data_extra_pool, bl);
+    ::encode(tenant, bl);
     ENCODE_FINISH(bl);
   }
   void decode(bufferlist::iterator& bl) {
-    DECODE_START_LEGACY_COMPAT_LEN(7, 3, 3, bl);
+    DECODE_START_LEGACY_COMPAT_LEN(8, 3, 3, bl);
     ::decode(name, bl);
     ::decode(data_pool, bl);
     if (struct_v >= 2) {
@@ -659,6 +665,9 @@ struct rgw_bucket {
     if (struct_v >= 7) {
       ::decode(data_extra_pool, bl);
     }
+    if (struct_v >= 8) {
+      ::decode(tenant, bl);
+    }
     DECODE_FINISH(bl);
   }
 
@@ -771,7 +780,7 @@ struct RGWBucketInfo
   };
 
   rgw_bucket bucket;
-  string owner;
+  rgw_user owner;
   uint32_t flags;
   string region;
   time_t creation_time;
@@ -796,9 +805,9 @@ struct RGWBucketInfo
   bool requester_pays;
 
   void encode(bufferlist& bl) const {
-     ENCODE_START(12, 4, bl);
+     ENCODE_START(13, 4, bl);
      ::encode(bucket, bl);
-     ::encode(owner, bl);
+     ::encode(owner.id, bl);
      ::encode(flags, bl);
      ::encode(region, bl);
      uint64_t ct = (uint64_t)creation_time;
@@ -809,13 +818,17 @@ struct RGWBucketInfo
      ::encode(num_shards, bl);
      ::encode(bucket_index_shard_hash_type, bl);
      ::encode(requester_pays, bl);
+     ::encode(owner.tenant, bl);
      ENCODE_FINISH(bl);
   }
   void decode(bufferlist::iterator& bl) {
-    DECODE_START_LEGACY_COMPAT_LEN_32(9, 4, 4, bl);
+    DECODE_START_LEGACY_COMPAT_LEN_32(13, 4, 4, bl);
      ::decode(bucket, bl);
-     if (struct_v >= 2)
-       ::decode(owner, bl);
+     if (struct_v >= 2) {
+       string s;
+       ::decode(s, bl);
+       owner.from_str(s);
+     }
      if (struct_v >= 3)
        ::decode(flags, bl);
      if (struct_v >= 5)
@@ -837,6 +850,8 @@ struct RGWBucketInfo
        ::decode(bucket_index_shard_hash_type, bl);
      if (struct_v >= 12)
        ::decode(requester_pays, bl);
+     if (struct_v >= 13)
+       ::decode(owner.tenant, bl);
      DECODE_FINISH(bl);
   }
   void dump(Formatter *f) const;
@@ -855,7 +870,7 @@ WRITE_CLASS_ENCODER(RGWBucketInfo)
 struct RGWBucketEntryPoint
 {
   rgw_bucket bucket;
-  string owner;
+  rgw_user owner;
   time_t creation_time;
   bool linked;
 
@@ -865,17 +880,18 @@ struct RGWBucketEntryPoint
   RGWBucketEntryPoint() : creation_time(0), linked(false), has_bucket_info(false) {}
 
   void encode(bufferlist& bl) const {
-    ENCODE_START(8, 8, bl);
+    ENCODE_START(9, 8, bl);
     ::encode(bucket, bl);
-    ::encode(owner, bl);
+    ::encode(owner.id, bl);
     ::encode(linked, bl);
     uint64_t ctime = (uint64_t)creation_time;
     ::encode(ctime, bl);
+    ::encode(owner, bl);
     ENCODE_FINISH(bl);
   }
   void decode(bufferlist::iterator& bl) {
     bufferlist::iterator orig_iter = bl;
-    DECODE_START_LEGACY_COMPAT_LEN_32(8, 4, 4, bl);
+    DECODE_START_LEGACY_COMPAT_LEN_32(9, 4, 4, bl);
     if (struct_v < 8) {
       /* ouch, old entry, contains the bucket info itself */
       old_bucket_info.decode(orig_iter);
@@ -884,11 +900,14 @@ struct RGWBucketEntryPoint
     }
     has_bucket_info = false;
     ::decode(bucket, bl);
-    ::decode(owner, bl);
+    ::decode(owner.id, bl);
     ::decode(linked, bl);
     uint64_t ctime;
     ::decode(ctime, bl);
     creation_time = (uint64_t)ctime;
+    if (struct_v >= 9) {
+      ::decode(owner, bl);
+    }
     DECODE_FINISH(bl);
   }
 
@@ -969,7 +988,7 @@ struct rgw_obj_key {
     instance = i;
   }
 
-  bool empty() {
+  bool empty() const {
     return name.empty();
   }
   bool operator==(const rgw_obj_key& k) const {
@@ -1033,9 +1052,13 @@ struct req_state {
    uint32_t perm_mask;
    utime_t header_time;
 
+   /* Set once when req_state is initialized and not violated thereafter */
+   string bucket_tenant;
+   string bucket_name;
+
    rgw_bucket bucket;
-   string bucket_name_str;
    rgw_obj_key object;
+   string src_tenant_name;
    string src_bucket_name;
    rgw_obj_key src_object;
    ACLOwner bucket_owner;
@@ -1058,7 +1081,6 @@ struct req_state {
 
    string canned_acl;
    bool has_acl_header;
-   const char *copy_source;
    const char *http_auth;
    bool local_source; /* source is local */
 
@@ -1088,7 +1110,7 @@ struct req_state {
 struct RGWObjEnt {
   rgw_obj_key key;
   std::string ns;
-  std::string owner;
+  rgw_user owner;
   std::string owner_display_name;
   uint64_t size;
   utime_t mtime;
@@ -1634,12 +1656,18 @@ extern string rgw_trim_quotes(const string& val);
 
 /** Check if the req_state's user has the necessary permissions
  * to do the requested action */
+extern bool verify_bucket_permission(struct req_state * s,
+                                     RGWAccessControlPolicy * bucket_acl,
+                                     int perm);
 extern bool verify_bucket_permission(struct req_state *s, int perm);
-extern bool verify_object_permission(struct req_state *s, RGWAccessControlPolicy *bucket_acl, RGWAccessControlPolicy *object_acl, int perm);
+extern bool verify_object_permission(struct req_state *s,
+                                     RGWAccessControlPolicy *bucket_acl,
+                                     RGWAccessControlPolicy *object_acl,
+                                     int perm);
 extern bool verify_object_permission(struct req_state *s, int perm);
 /** Convert an input URL into a sane object name
  * by converting %-escaped strings into characters, etc*/
-extern bool url_decode(string& src_str, string& dest_str, bool in_query = false);
+extern bool url_decode(const string& src_str, string& dest_str, bool in_query = false);
 extern void url_encode(const string& src, string& dst);
 
 extern void calc_hmac_sha1(const char *key, int key_len,
diff --git a/src/rgw/rgw_dencoder.cc b/src/rgw/rgw_dencoder.cc
index 82e3295..36fd0cc 100644
--- a/src/rgw/rgw_dencoder.cc
+++ b/src/rgw/rgw_dencoder.cc
@@ -17,7 +17,7 @@ void RGWObjManifestPart::generate_test_instances(std::list<RGWObjManifestPart*>&
   o.push_back(new RGWObjManifestPart);
 
   RGWObjManifestPart *p = new RGWObjManifestPart;
-  rgw_bucket b("bucket", ".pool", ".index_pool", "marker_", "12", "region");
+  rgw_bucket b("tenant", "bucket", ".pool", ".index_pool", "marker_", "12", "region");
   p->loc = rgw_obj(b, "object");
   p->loc_ofs = 512 * 1024;
   p->size = 128 * 1024;
@@ -133,7 +133,7 @@ void RGWObjManifest::generate_test_instances(std::list<RGWObjManifest*>& o)
   RGWObjManifest *m = new RGWObjManifest;
   for (int i = 0; i<10; i++) {
     RGWObjManifestPart p;
-    rgw_bucket b("bucket", ".pool", ".index_pool", "marker_", "12", "region");
+    rgw_bucket b("tenant", "bucket", ".pool", ".index_pool", "marker_", "12", "region");
     p.loc = rgw_obj(b, "object");
     p.loc_ofs = 0;
     p.size = 512 * 1024;
@@ -257,8 +257,8 @@ ACLGroupTypeEnum ACLGrant_S3::uri_to_group(string& uri)
 
 void ACLGrant::generate_test_instances(list<ACLGrant*>& o)
 {
-  string id, name, email;
-  id = "rgw";
+  rgw_user id("rgw");
+  string name, email;
   name = "Mr. RGW";
   email = "r@gw";
 
@@ -314,7 +314,7 @@ void RGWAccessControlPolicy::generate_test_instances(list<RGWAccessControlPolicy
     p->acl = *l;
 
     string name = "radosgw";
-    string id = "rgw";
+    rgw_user id("rgw");
     p->owner.set_name(name);
     p->owner.set_id(id);
 
@@ -404,7 +404,7 @@ void RGWUserInfo::generate_test_instances(list<RGWUserInfo*>& o)
 
 void rgw_bucket::generate_test_instances(list<rgw_bucket*>& o)
 {
-  rgw_bucket *b = new rgw_bucket("name", "pool", ".index_pool", "marker", "123", "region");
+  rgw_bucket *b = new rgw_bucket("tenant", "name", "pool", ".index_pool", "marker", "123", "region");
   o.push_back(b);
   o.push_back(new rgw_bucket);
 }
@@ -412,7 +412,7 @@ void rgw_bucket::generate_test_instances(list<rgw_bucket*>& o)
 void RGWBucketInfo::generate_test_instances(list<RGWBucketInfo*>& o)
 {
   RGWBucketInfo *i = new RGWBucketInfo;
-  i->bucket = rgw_bucket("bucket", "pool", ".index_pool", "marker", "10", "region");
+  i->bucket = rgw_bucket("tenant", "bucket", "pool", ".index_pool", "marker", "10", "region");
   i->owner = "owner";
   i->flags = BUCKET_SUSPENDED;
   o.push_back(i);
@@ -450,7 +450,7 @@ void RGWOLHInfo::generate_test_instances(list<RGWOLHInfo*> &o)
 void RGWBucketEnt::generate_test_instances(list<RGWBucketEnt*>& o)
 {
   RGWBucketEnt *e = new RGWBucketEnt;
-  e->bucket = rgw_bucket("bucket", "pool", ".index_pool", "marker", "10", "region");
+  e->bucket = rgw_bucket("tenant", "bucket", "pool", ".index_pool", "marker", "10", "region");
   e->size = 1024;
   e->size_rounded = 4096;
   e->count = 1;
@@ -470,7 +470,7 @@ void RGWUploadPartInfo::generate_test_instances(list<RGWUploadPartInfo*>& o)
 
 void rgw_obj::generate_test_instances(list<rgw_obj*>& o)
 {
-  rgw_bucket b = rgw_bucket("bucket", "pool", ".index_pool", "marker", "10", "region");
+  rgw_bucket b = rgw_bucket("tenant", "bucket", "pool", ".index_pool", "marker", "10", "region");
   rgw_obj *obj = new rgw_obj(b, "object");
   o.push_back(obj);
   o.push_back(new rgw_obj);
diff --git a/src/rgw/rgw_formats.cc b/src/rgw/rgw_formats.cc
index d7e47b0..698ec96 100644
--- a/src/rgw/rgw_formats.cc
+++ b/src/rgw/rgw_formats.cc
@@ -21,8 +21,12 @@
 
 #define dout_subsys ceph_subsys_rgw
 
-RGWFormatter_Plain::RGWFormatter_Plain()
-  : buf(NULL), len(0), max_len(0), min_stack_level(0)
+RGWFormatter_Plain::RGWFormatter_Plain(const bool ukv)
+  : buf(NULL),
+    len(0),
+    max_len(0),
+    min_stack_level(0),
+    use_kv(ukv)
 {
 }
 
@@ -64,6 +68,14 @@ void RGWFormatter_Plain::open_array_section(const char *name)
   struct plain_stack_entry new_entry;
   new_entry.is_array = true;
   new_entry.size = 0;
+
+  if (use_kv && min_stack_level > 0 && !stack.empty()) {
+    struct plain_stack_entry& entry = stack.back();
+
+    if (!entry.is_array)
+      dump_format(name, "");
+  }
+
   stack.push_back(new_entry);
 }
 
@@ -79,6 +91,10 @@ void RGWFormatter_Plain::open_object_section(const char *name)
   struct plain_stack_entry new_entry;
   new_entry.is_array = false;
   new_entry.size = 0;
+
+  if (use_kv && min_stack_level > 0)
+    dump_format(name, "");
+
   stack.push_back(new_entry);
 }
 
@@ -124,14 +140,13 @@ std::ostream& RGWFormatter_Plain::dump_stream(const char *name)
 void RGWFormatter_Plain::dump_format_va(const char *name, const char *ns, bool quoted, const char *fmt, va_list ap)
 {
   char buf[LARGE_SIZE];
-  const char *format;
 
   struct plain_stack_entry& entry = stack.back();
 
   if (!min_stack_level)
     min_stack_level = stack.size();
 
-  bool should_print = (stack.size() == min_stack_level && !entry.size);
+  bool should_print = ((stack.size() == min_stack_level && !entry.size) || use_kv);
 
   entry.size++;
 
@@ -139,12 +154,20 @@ void RGWFormatter_Plain::dump_format_va(const char *name, const char *ns, bool q
     return;
 
   vsnprintf(buf, LARGE_SIZE, fmt, ap);
-  if (len)
-    format = "\n%s";
-  else
-    format = "%s";
 
-  write_data(format, buf);
+  const char *eol;
+  if (len) {
+    if (use_kv && entry.is_array && entry.size > 1)
+      eol = ", ";
+    else
+      eol = "\n";
+  } else
+    eol = "";
+
+  if (use_kv && !entry.is_array)
+    write_data("%s%s: %s", eol, name, buf);
+  else
+    write_data("%s%s", eol, buf);
 }
 
 int RGWFormatter_Plain::get_len() const
@@ -233,7 +256,7 @@ void RGWFormatter_Plain::dump_value_int(const char *name, const char *fmt, ...)
     min_stack_level = stack.size();
 
   struct plain_stack_entry& entry = stack.back();
-  bool should_print = (stack.size() == min_stack_level && !entry.size);
+  bool should_print = ((stack.size() == min_stack_level && !entry.size) || use_kv);
 
   entry.size++;
 
@@ -250,5 +273,9 @@ void RGWFormatter_Plain::dump_value_int(const char *name, const char *fmt, ...)
   else
     eol = "";
 
-  write_data("%s%s", eol, buf);
+  if (use_kv && !entry.is_array)
+    write_data("%s%s: %s", eol, name, buf);
+  else
+    write_data("%s%s", eol, buf);
+
 }
diff --git a/src/rgw/rgw_formats.h b/src/rgw/rgw_formats.h
index 6f7925e..43c087d 100644
--- a/src/rgw/rgw_formats.h
+++ b/src/rgw/rgw_formats.h
@@ -22,7 +22,7 @@ struct plain_stack_entry {
 class RGWFormatter_Plain : public Formatter {
   void reset_buf();
 public:
-  RGWFormatter_Plain();
+  RGWFormatter_Plain(bool use_kv = false);
   virtual ~RGWFormatter_Plain();
 
   virtual void flush(ostream& os);
@@ -52,6 +52,7 @@ private:
 
   std::list<struct plain_stack_entry> stack;
   size_t min_stack_level;
+  bool use_kv;
 };
 
 class RGWFormatterFlusher {
diff --git a/src/rgw/rgw_http_errors.h b/src/rgw/rgw_http_errors.h
index aaa5312..7750d13 100644
--- a/src/rgw/rgw_http_errors.h
+++ b/src/rgw/rgw_http_errors.h
@@ -66,6 +66,7 @@ const static struct rgw_http_errors RGW_HTTP_SWIFT_ERRORS[] = {
     { ERR_USER_SUSPENDED, 401, "UserSuspended" },
     { ERR_INVALID_UTF8, 412, "Invalid UTF8" },
     { ERR_BAD_URL, 412, "Bad URL" },
+    { ERR_NOT_SLO_MANIFEST, 400, "Not an SLO manifest" }
 };
 
 struct rgw_http_status_code {
diff --git a/src/rgw/rgw_json_enc.cc b/src/rgw/rgw_json_enc.cc
index 75b0bed..e18bbb6 100644
--- a/src/rgw/rgw_json_enc.cc
+++ b/src/rgw/rgw_json_enc.cc
@@ -9,6 +9,8 @@
 #include "rgw_cache.h"
 #include "rgw_bucket.h"
 #include "rgw_keystone.h"
+#include "rgw_basic_types.h"
+#include "rgw_op.h"
 
 #include "common/ceph_json.h"
 #include "common/Formatter.h"
@@ -84,8 +86,8 @@ void RGWObjManifest::dump(Formatter *f) const
 
 void rgw_log_entry::dump(Formatter *f) const
 {
-  f->dump_string("object_owner", object_owner);
-  f->dump_string("bucket_owner", bucket_owner);
+  f->dump_string("object_owner", object_owner.to_str());
+  f->dump_string("bucket_owner", bucket_owner.to_str());
   f->dump_string("bucket", bucket);
   f->dump_stream("time") << time;
   f->dump_string("remote_addr", remote_addr);
@@ -122,7 +124,7 @@ void ACLGrant::dump(Formatter *f) const
   type.dump(f);
   f->close_section();
 
-  f->dump_string("id", id);
+  f->dump_string("id", id.to_str());
   f->dump_string("email", email);
 
   f->open_object_section("permission");
@@ -170,7 +172,7 @@ void RGWAccessControlList::dump(Formatter *f) const
 
 void ACLOwner::dump(Formatter *f) const
 {
-  encode_json("id", id, f);
+  encode_json("id", id.to_str(), f);
   encode_json("display_name", display_name, f);
 }
 
@@ -375,25 +377,25 @@ void RGWSubUser::decode_json(JSONObj *obj)
 static void user_info_dump_subuser(const char *name, const RGWSubUser& subuser, Formatter *f, void *parent)
 {
   RGWUserInfo *info = static_cast<RGWUserInfo *>(parent);
-  subuser.dump(f, info->user_id);
+  subuser.dump(f, info->user_id.to_str());
 }
 
 static void user_info_dump_key(const char *name, const RGWAccessKey& key, Formatter *f, void *parent)
 {
   RGWUserInfo *info = static_cast<RGWUserInfo *>(parent);
-  key.dump(f, info->user_id, false);
+  key.dump(f, info->user_id.to_str(), false);
 }
 
 static void user_info_dump_swift_key(const char *name, const RGWAccessKey& key, Formatter *f, void *parent)
 {
   RGWUserInfo *info = static_cast<RGWUserInfo *>(parent);
-  key.dump(f, info->user_id, true);
+  key.dump(f, info->user_id.to_str(), true);
 }
 
 void RGWUserInfo::dump(Formatter *f) const
 {
 
-  encode_json("user_id", user_id, f);
+  encode_json("user_id", user_id.to_str(), f);
   encode_json("display_name", display_name, f);
   encode_json("email", user_email, f);
   encode_json("suspended", (int)suspended, f);
@@ -445,7 +447,11 @@ static void decode_subusers(map<string, RGWSubUser>& m, JSONObj *o)
 
 void RGWUserInfo::decode_json(JSONObj *obj)
 {
-  JSONDecoder::decode_json("user_id", user_id, obj, true);
+  string uid;
+
+  JSONDecoder::decode_json("user_id", uid, obj, true);
+  user_id.from_str(uid);
+
   JSONDecoder::decode_json("display_name", display_name, obj);
   JSONDecoder::decode_json("email", user_email, obj);
   bool susp = false;
@@ -541,7 +547,7 @@ void RGWBucketInfo::dump(Formatter *f) const
 {
   encode_json("bucket", bucket, f);
   encode_json("creation_time", creation_time, f);
-  encode_json("owner", owner, f);
+  encode_json("owner", owner.to_str(), f);
   encode_json("flags", flags, f);
   encode_json("region", region, f);
   encode_json("placement_rule", placement_rule, f);
@@ -573,7 +579,7 @@ void RGWObjEnt::dump(Formatter *f) const
   encode_json("name", key.name, f);
   encode_json("instance", key.instance, f);
   encode_json("namespace", ns, f);
-  encode_json("owner", owner, f);
+  encode_json("owner", owner.to_str(), f);
   encode_json("owner_display_name", owner_display_name, f);
   encode_json("size", size, f);
   encode_json("mtime", mtime, f);
@@ -844,3 +850,10 @@ void KeystoneToken::decode_json(JSONObj *access_obj)
   JSONDecoder::decode_json("user", user, access_obj, true);
   JSONDecoder::decode_json("serviceCatalog", service_catalog, access_obj);
 }
+
+void rgw_slo_entry::decode_json(JSONObj *obj)
+{
+  JSONDecoder::decode_json("path", path, obj);
+  JSONDecoder::decode_json("etag", etag, obj);
+  JSONDecoder::decode_json("size_bytes", size_bytes, obj);
+};
diff --git a/src/rgw/rgw_log.cc b/src/rgw/rgw_log.cc
index 795d787..8f85fde 100644
--- a/src/rgw/rgw_log.cc
+++ b/src/rgw/rgw_log.cc
@@ -7,6 +7,7 @@
 #include "common/OutputDataSocket.h"
 #include "common/Formatter.h"
 
+#include "rgw_bucket.h"
 #include "rgw_log.h"
 #include "rgw_acl.h"
 #include "rgw_rados.h"
@@ -175,14 +176,15 @@ static void log_usage(struct req_state *s, const string& op_name)
   if (!usage_logger)
     return;
 
-  string user;
+  rgw_user user;
 
-  if (!s->bucket_name_str.empty())
+  if (!s->bucket_name.empty())
     user = s->bucket_owner.get_id();
   else
     user = s->user.user_id;
 
-  rgw_usage_log_entry entry(user, s->bucket.name);
+  string id = user.to_str();
+  rgw_usage_log_entry entry(id, s->bucket.name);
 
   uint64_t bytes_sent = s->cio->get_bytes_sent();
   uint64_t bytes_received = s->cio->get_bytes_received();
@@ -207,8 +209,9 @@ void rgw_format_ops_log_entry(struct rgw_log_entry& entry, Formatter *formatter)
   entry.time.gmtime(formatter->dump_stream("time"));      // UTC
   entry.time.localtime(formatter->dump_stream("time_local"));
   formatter->dump_string("remote_addr", entry.remote_addr);
-  if (entry.object_owner.length())
-    formatter->dump_string("object_owner", entry.object_owner);
+  string obj_owner = entry.object_owner.to_str();
+  if (obj_owner.length())
+    formatter->dump_string("object_owner", obj_owner);
   formatter->dump_string("user", entry.user);
   formatter->dump_string("operation", entry.op);
   formatter->dump_string("uri", entry.uri);
@@ -273,7 +276,7 @@ int rgw_log_op(RGWRados *store, struct req_state *s, const string& op_name, OpsL
   if (!s->enable_ops_log)
     return 0;
 
-  if (s->bucket_name_str.empty()) {
+  if (s->bucket_name.empty()) {
     ldout(s->cct, 5) << "nothing to log for operation" << dendl;
     return -EINVAL;
   }
@@ -286,9 +289,9 @@ int rgw_log_op(RGWRados *store, struct req_state *s, const string& op_name, OpsL
   } else {
     bucket_id = s->bucket.bucket_id;
   }
-  entry.bucket = s->bucket_name_str;
+  rgw_make_bucket_entry_name(s->bucket_tenant, s->bucket_name, entry.bucket);
 
-  if (check_utf8(s->bucket_name_str.c_str(), entry.bucket.size()) != 0) {
+  if (check_utf8(s->bucket_name.c_str(), entry.bucket.size()) != 0) {
     ldout(s->cct, 5) << "not logging op on bucket with non-utf8 name" << dendl;
     return 0;
   }
@@ -310,7 +313,7 @@ int rgw_log_op(RGWRados *store, struct req_state *s, const string& op_name, OpsL
   set_param_str(s, "REQUEST_URI", entry.uri);
   set_param_str(s, "REQUEST_METHOD", entry.op);
 
-  entry.user = s->user.user_id;
+  entry.user = s->user.user_id.to_str();
   if (s->object_acl)
     entry.object_owner = s->object_acl->get_owner().get_id();
   entry.bucket_owner = s->bucket_owner.get_id();
diff --git a/src/rgw/rgw_log.h b/src/rgw/rgw_log.h
index 3622737..51acfdf 100644
--- a/src/rgw/rgw_log.h
+++ b/src/rgw/rgw_log.h
@@ -12,8 +12,8 @@
 class RGWRados;
 
 struct rgw_log_entry {
-  string object_owner;
-  string bucket_owner;
+  rgw_user object_owner;
+  rgw_user bucket_owner;
   string bucket;
   utime_t time;
   string remote_addr;
@@ -32,9 +32,9 @@ struct rgw_log_entry {
   string bucket_id;
 
   void encode(bufferlist &bl) const {
-    ENCODE_START(7, 5, bl);
-    ::encode(object_owner, bl);
-    ::encode(bucket_owner, bl);
+    ENCODE_START(8, 5, bl);
+    ::encode(object_owner.id, bl);
+    ::encode(bucket_owner.id, bl);
     ::encode(bucket, bl);
     ::encode(time, bl);
     ::encode(remote_addr, bl);
@@ -52,13 +52,15 @@ struct rgw_log_entry {
     ::encode(bytes_received, bl);
     ::encode(bucket_id, bl);
     ::encode(obj, bl);
+    ::encode(object_owner, bl);
+    ::encode(bucket_owner, bl);
     ENCODE_FINISH(bl);
   }
   void decode(bufferlist::iterator &p) {
-    DECODE_START_LEGACY_COMPAT_LEN(7, 5, 5, p);
-    ::decode(object_owner, p);
+    DECODE_START_LEGACY_COMPAT_LEN(8, 5, 5, p);
+    ::decode(object_owner.id, p);
     if (struct_v > 3)
-      ::decode(bucket_owner, p);
+      ::decode(bucket_owner.id, p);
     ::decode(bucket, p);
     ::decode(time, p);
     ::decode(remote_addr, p);
@@ -94,6 +96,10 @@ struct rgw_log_entry {
     if (struct_v >= 7) {
       ::decode(obj, p);
     }
+    if (struct_v >= 8) {
+      ::decode(object_owner, p);
+      ::decode(bucket_owner, p);
+    }
     DECODE_FINISH(p);
   }
   void dump(Formatter *f) const;
diff --git a/src/rgw/rgw_main.cc b/src/rgw/rgw_main.cc
index fca3ede..2246b6a 100644
--- a/src/rgw/rgw_main.cc
+++ b/src/rgw/rgw_main.cc
@@ -919,13 +919,15 @@ public:
 
     pprocess = pp;
 
-    string uid;
-    conf->get_val("uid", "", &uid);
-    if (uid.empty()) {
+    string uid_str;
+    conf->get_val("uid", "", &uid_str);
+    if (uid_str.empty()) {
       derr << "ERROR: uid param must be specified for loadgen frontend" << dendl;
       return EINVAL;
     }
 
+    rgw_user uid(uid_str);
+
     RGWUserInfo user_info;
     int ret = rgw_get_user_info_by_uid(env.store, uid, user_info, NULL);
     if (ret < 0) {
diff --git a/src/rgw/rgw_metadata.h b/src/rgw/rgw_metadata.h
index 8063cf7..bb77e65 100644
--- a/src/rgw/rgw_metadata.h
+++ b/src/rgw/rgw_metadata.h
@@ -108,6 +108,21 @@ protected:
     }
     return true;
   }
+
+  /*
+   * The tenant_name is always returned on purpose. May be empty, of course.
+   */
+  static void parse_bucket(const string &bucket,
+                           string &tenant_name, string &bucket_name)
+  {
+    int pos = bucket.find('/');
+    if (pos >= 0) {
+      tenant_name = bucket.substr(0, pos);
+    } else {
+      tenant_name.clear();
+    }
+    bucket_name = bucket.substr(pos + 1);
+  }
 };
 
 #define META_LOG_OBJ_PREFIX "meta.log."
diff --git a/src/rgw/rgw_object_expirer_core.cc b/src/rgw/rgw_object_expirer_core.cc
index 5903511..14957d7 100644
--- a/src/rgw/rgw_object_expirer_core.cc
+++ b/src/rgw/rgw_object_expirer_core.cc
@@ -39,16 +39,22 @@ using namespace std;
 
 static string objexp_lock_name = "gc_process";
 
-int RGWObjectExpirer::init_bucket_info(const string& bucket_name,
-                                    const string& bucket_id,
-                                    RGWBucketInfo& bucket_info)
+int RGWObjectExpirer::init_bucket_info(const string& tenant_name,
+                                       const string& bucket_name,
+                                       const string& bucket_id,
+                                       RGWBucketInfo& bucket_info)
 {
   RGWObjectCtx obj_ctx(store);
-  const string bucket_instance_id = bucket_name + ":" + bucket_id;
 
+  /*
+   * XXX Here's where it gets tricky. We went to all the trouble of
+   * punching the tenant through the objexp_hint_entry, but now we
+   * find that our instances do not actually have tenants. They are
+   * unique thanks to IDs. So the tenant string is not needed...
+   */
+  const string bucket_instance_id = bucket_name + ":" + bucket_id;
   int ret = store->get_bucket_instance_info(obj_ctx, bucket_instance_id,
           bucket_info, NULL, NULL);
-
   return ret;
 }
 
@@ -56,7 +62,8 @@ int RGWObjectExpirer::garbage_single_object(objexp_hint_entry& hint)
 {
   RGWBucketInfo bucket_info;
 
-  int ret = init_bucket_info(hint.bucket_name, hint.bucket_id, bucket_info);
+  int ret = init_bucket_info(hint.tenant, hint.bucket_name,
+          hint.bucket_id, bucket_info);
   if (-ENOENT == ret) {
     ldout(store->ctx(), 15) << "NOTICE: cannot find bucket = " \
         << hint.bucket_name << ". The object must be already removed" << dendl;
diff --git a/src/rgw/rgw_object_expirer_core.h b/src/rgw/rgw_object_expirer_core.h
index bd137fa..c9d56da 100644
--- a/src/rgw/rgw_object_expirer_core.h
+++ b/src/rgw/rgw_object_expirer_core.h
@@ -41,7 +41,8 @@ class RGWObjectExpirer {
 protected:
   RGWRados *store;
 
-  int init_bucket_info(const string& bucket_name,
+  int init_bucket_info(const string& tenant_name,
+                       const string& bucket_name,
                        const string& bucket_id,
                        RGWBucketInfo& bucket_info);
 
diff --git a/src/rgw/rgw_op.cc b/src/rgw/rgw_op.cc
index 993659a..d1e8b71 100644
--- a/src/rgw/rgw_op.cc
+++ b/src/rgw/rgw_op.cc
@@ -321,7 +321,7 @@ static int read_policy(RGWRados *store, struct req_state *s,
     ret = get_policy_from_attr(s->cct, store, s->obj_ctx, bucket_info, bucket_attrs, &bucket_policy, no_obj);
     if (ret < 0)
       return ret;
-    string& owner = bucket_policy.get_owner().get_id();
+    rgw_user& owner = bucket_policy.get_owner().get_id();
     if (!s->system_request && owner.compare(s->user.user_id) != 0 &&
         !bucket_policy.verify_permission(s->user.user_id, s->perm_mask, RGW_PERM_READ))
       ret = -EACCES;
@@ -365,35 +365,30 @@ static int rgw_build_policies(RGWRados *store, struct req_state *s, bool only_bu
     s->bucket_acl = new RGWAccessControlPolicy(s->cct);
   }
 
-  if (s->copy_source) { /* check if copy source is within the current domain */
-    const char *src = s->copy_source;
-    if (*src == '/')
-      ++src;
-    string copy_source_str(src);
-
-    int pos = copy_source_str.find('/');
-    if (pos > 0)
-      copy_source_str = copy_source_str.substr(0, pos);
-
+  /* check if copy source is within the current domain */
+  if (!s->src_bucket_name.empty()) {
     RGWBucketInfo source_info;
 
-    ret = store->get_bucket_info(obj_ctx, copy_source_str, source_info, NULL);
+    ret = store->get_bucket_info(obj_ctx,
+        s->src_tenant_name, s->src_bucket_name, source_info, NULL);
     if (ret == 0) {
       string& region = source_info.region;
       s->local_source = store->region.equals(region);
     }
   }
 
-  if (!s->bucket_name_str.empty()) {
+  if (!s->bucket_name.empty()) {
     s->bucket_exists = true;
     if (s->bucket_instance_id.empty()) {
-      ret = store->get_bucket_info(obj_ctx, s->bucket_name_str, s->bucket_info, NULL, &s->bucket_attrs);
+      ret = store->get_bucket_info(obj_ctx, s->bucket_tenant, s->bucket_name, s->bucket_info, NULL, &s->bucket_attrs);
     } else {
       ret = store->get_bucket_instance_info(obj_ctx, s->bucket_instance_id, s->bucket_info, NULL, &s->bucket_attrs);
     }
     if (ret < 0) {
       if (ret != -ENOENT) {
-        ldout(s->cct, 0) << "NOTICE: couldn't get bucket from bucket_name (name=" << s->bucket_name_str << ")" << dendl;
+        string bucket_log;
+        rgw_make_bucket_entry_name(s->bucket_tenant, s->bucket_name, bucket_log);
+        ldout(s->cct, 0) << "NOTICE: couldn't get bucket from bucket_name (name=" << bucket_log << ")" << dendl;
         return ret;
       }
       s->bucket_exists = false;
@@ -664,7 +659,11 @@ bool RGWOp::generate_cors_headers(string& origin, string& method, string& header
   return true;
 }
 
-int RGWGetObj::read_user_manifest_part(rgw_bucket& bucket, RGWObjEnt& ent, RGWAccessControlPolicy *bucket_policy, off_t start_ofs, off_t end_ofs)
+int RGWGetObj::read_user_manifest_part(rgw_bucket& bucket,
+                                       const RGWObjEnt& ent,
+                                       RGWAccessControlPolicy * const bucket_policy,
+                                       const off_t start_ofs,
+                                       const off_t end_ofs)
 {
   ldout(s->cct, 20) << "user manifest obj=" << ent.key.name << "[" << ent.key.instance << "]" << dendl;
 
@@ -688,6 +687,7 @@ int RGWGetObj::read_user_manifest_part(rgw_bucket& bucket, RGWObjEnt& ent, RGWAc
   RGWRados::Object op_target(store, s->bucket_info, obj_ctx, part);
   RGWRados::Object::Read read_op(&op_target);
 
+  read_op.conds.if_match = ent.etag.c_str();
   read_op.params.attrs = &attrs;
   read_op.params.obj_size = &obj_size;
   read_op.params.perr = &s->err;
@@ -719,7 +719,6 @@ int RGWGetObj::read_user_manifest_part(rgw_bucket& bucket, RGWObjEnt& ent, RGWAc
 
     off_t len = bl.length();
     cur_ofs += len;
-    ofs += len;
     ret = 0;
     perfcounter->tinc(l_rgw_get_lat,
                       (ceph_clock_now(s->cct) - start_time));
@@ -731,14 +730,25 @@ int RGWGetObj::read_user_manifest_part(rgw_bucket& bucket, RGWObjEnt& ent, RGWAc
   return 0;
 }
 
-static int iterate_user_manifest_parts(CephContext *cct, RGWRados *store, off_t ofs, off_t end,
-                                       rgw_bucket& bucket, string& obj_prefix, RGWAccessControlPolicy *bucket_policy,
-                                       uint64_t *ptotal_len,
-                                       int (*cb)(rgw_bucket& bucket, RGWObjEnt& ent, RGWAccessControlPolicy *bucket_policy,
-                                                 off_t start_ofs, off_t end_ofs, void *param), void *cb_param)
+static int iterate_user_manifest_parts(CephContext * const cct,
+                                       RGWRados * const store,
+                                       const off_t ofs,
+                                       const off_t end,
+                                       rgw_bucket& bucket,
+                                       const string& obj_prefix,
+                                       RGWAccessControlPolicy * const bucket_policy,
+                                       uint64_t * const ptotal_len,
+                                       uint64_t * const pobj_size,
+                                       int (*cb)(rgw_bucket& bucket,
+                                                 const RGWObjEnt& ent,
+                                                 RGWAccessControlPolicy * const bucket_policy,
+                                                 off_t start_ofs,
+                                                 off_t end_ofs,
+                                                 void *param),
+                                       void * const cb_param)
 {
   uint64_t obj_ofs = 0, len_count = 0;
-  bool found_start = false, found_end = false;
+  bool found_start = false, found_end = false, handled_end = false;
   string delim;
   bool is_truncated;
   vector<RGWObjEnt> objs;
@@ -759,7 +769,7 @@ static int iterate_user_manifest_parts(CephContext *cct, RGWRados *store, off_t
 
     vector<RGWObjEnt>::iterator viter;
 
-    for (viter = objs.begin(); viter != objs.end() && !found_end; ++viter) {
+    for (viter = objs.begin(); viter != objs.end(); ++viter) {
       RGWObjEnt& ent = *viter;
       uint64_t cur_total_len = obj_ofs;
       uint64_t start_ofs = 0, end_ofs = ent.size;
@@ -779,28 +789,119 @@ static int iterate_user_manifest_parts(CephContext *cct, RGWRados *store, off_t
       perfcounter->tinc(l_rgw_get_lat,
                        (ceph_clock_now(cct) - start_time));
 
-      if (found_start) {
+      if (found_start && !handled_end) {
         len_count += end_ofs - start_ofs;
 
         if (cb) {
           r = cb(bucket, ent, bucket_policy, start_ofs, end_ofs, cb_param);
-          if (r < 0)
+          if (r < 0) {
             return r;
+          }
         }
       }
 
+      handled_end = found_end;
       start_time = ceph_clock_now(cct);
     }
-  } while (is_truncated && !found_end);
+  } while (is_truncated);
 
-  if (ptotal_len)
+  if (ptotal_len) {
     *ptotal_len = len_count;
+  }
+  if (pobj_size) {
+    *pobj_size = obj_ofs;
+  }
+
+  return 0;
+}
+
+struct rgw_slo_part {
+  RGWAccessControlPolicy *bucket_policy;
+  rgw_bucket bucket;
+  string obj_name;
+  uint64_t size;
+  string etag;
+
+  rgw_slo_part() : bucket_policy(NULL), size(0) {}
+};
+
+static int iterate_slo_parts(CephContext *cct,
+                             RGWRados *store,
+                             off_t ofs,
+                             off_t end,
+                             map<uint64_t, rgw_slo_part>& slo_parts,
+                             int (*cb)(rgw_bucket& bucket,
+                                       const RGWObjEnt& ent,
+                                       RGWAccessControlPolicy *bucket_policy,
+                                       off_t start_ofs,
+                                       off_t end_ofs,
+                                       void *param),
+                             void *cb_param)
+{
+  bool found_start = false, found_end = false;
+  string delim;
+  vector<RGWObjEnt> objs;
+
+  if (slo_parts.empty()) {
+    return 0;
+  }
+
+
+  utime_t start_time = ceph_clock_now(cct);
+
+  map<uint64_t, rgw_slo_part>::iterator iter = slo_parts.upper_bound(ofs);
+  if (iter != slo_parts.begin()) {
+    --iter;
+  }
+
+  uint64_t obj_ofs = iter->first;
+
+  for (; iter != slo_parts.end() && !found_end; ++iter) {
+    rgw_slo_part& part = iter->second;
+    RGWObjEnt ent;
+
+    ent.key.name = part.obj_name;
+    ent.size = part.size;
+    ent.etag = part.etag;
+
+    uint64_t cur_total_len = obj_ofs;
+    uint64_t start_ofs = 0, end_ofs = ent.size;
+
+    if (!found_start && cur_total_len + ent.size > (uint64_t)ofs) {
+      start_ofs = ofs - obj_ofs;
+      found_start = true;
+    }
+
+    obj_ofs += ent.size;
+
+    if (!found_end && obj_ofs > (uint64_t)end) {
+      end_ofs = end - cur_total_len + 1;
+      found_end = true;
+    }
+
+    perfcounter->tinc(l_rgw_get_lat,
+                      (ceph_clock_now(cct) - start_time));
+
+    if (found_start) {
+      if (cb) {
+        int r = cb(part.bucket, ent, part.bucket_policy, start_ofs, end_ofs, cb_param);
+        if (r < 0)
+          return r;
+      }
+    }
+
+    start_time = ceph_clock_now(cct);
+  }
 
   return 0;
 }
 
-static int get_obj_user_manifest_iterate_cb(rgw_bucket& bucket, RGWObjEnt& ent, RGWAccessControlPolicy *bucket_policy, off_t start_ofs, off_t end_ofs,
-                                       void *param)
+static int get_obj_user_manifest_iterate_cb(rgw_bucket& bucket,
+                                            const RGWObjEnt& ent,
+                                            RGWAccessControlPolicy * const bucket_policy,
+                                            const off_t start_ofs,
+                                            const off_t end_ofs,
+                                            void * const param)
 {
   RGWGetObj *op = static_cast<RGWGetObj *>(param);
   return op->read_user_manifest_part(bucket, ent, bucket_policy, start_ofs, end_ofs);
@@ -832,7 +933,8 @@ int RGWGetObj::handle_user_manifest(const char *prefix)
     RGWBucketInfo bucket_info;
     map<string, bufferlist> bucket_attrs;
     RGWObjectCtx obj_ctx(store);
-    int r = store->get_bucket_info(obj_ctx, bucket_name, bucket_info, NULL, &bucket_attrs);
+    int r = store->get_bucket_info(obj_ctx, s->user.user_id.tenant, bucket_name,
+                                   bucket_info, NULL, &bucket_attrs);
     if (r < 0) {
       ldout(s->cct, 0) << "could not get bucket info for bucket=" << bucket_name << dendl;
       return r;
@@ -851,11 +953,12 @@ int RGWGetObj::handle_user_manifest(const char *prefix)
   }
 
   /* dry run to find out total length */
-  int r = iterate_user_manifest_parts(s->cct, store, ofs, end, bucket, obj_prefix, bucket_policy, &total_len, NULL, NULL);
-  if (r < 0)
+  int r = iterate_user_manifest_parts(s->cct, store, ofs, end,
+        bucket, obj_prefix, bucket_policy, &total_len, &s->obj_size,
+        NULL, NULL);
+  if (r < 0) {
     return r;
-
-  s->obj_size = total_len;
+  }
 
   if (!get_data) {
     bufferlist bl;
@@ -863,9 +966,117 @@ int RGWGetObj::handle_user_manifest(const char *prefix)
     return 0;
   }
 
-  r = iterate_user_manifest_parts(s->cct, store, ofs, end, bucket, obj_prefix, bucket_policy, NULL, get_obj_user_manifest_iterate_cb, (void *)this);
-  if (r < 0)
+  r = iterate_user_manifest_parts(s->cct, store, ofs, end,
+        bucket, obj_prefix, bucket_policy, NULL, NULL,
+        get_obj_user_manifest_iterate_cb, (void *)this);
+  if (r < 0) {
+    return r;
+  }
+
+  return 0;
+}
+
+int RGWGetObj::handle_slo_manifest(bufferlist& bl)
+{
+  RGWSLOInfo slo_info;
+  bufferlist::iterator bliter = bl.begin();
+  try {
+    ::decode(slo_info, bliter);
+  } catch (buffer::error& err) {
+    ldout(s->cct, 0) << "ERROR: failed to decode slo manifest" << dendl;
+    return -EIO;
+  }
+  ldout(s->cct, 2) << "RGWGetObj::handle_slo_manifest()" << dendl;
+  /* Resolve each manifest entry to (bucket, ACL, object), index the parts by offset, then stream the range. */
+  list<RGWAccessControlPolicy> allocated_policies;
+  map<string, RGWAccessControlPolicy *> policies;
+  map<string, rgw_bucket> buckets;
+
+  map<uint64_t, rgw_slo_part> slo_parts;
+
+  total_len = 0;
+
+  for (vector<rgw_slo_entry>::iterator iter = slo_info.entries.begin(); iter != slo_info.entries.end(); ++iter) {
+    const string& path = iter->path;
+    const size_t pos = path.find('/', 1); /* skip first / */
+    if (pos == string::npos)
+      return -EINVAL;
+    /* The path is split at that slash: bucket name before it, object name after it. */
+    string bucket_name = path.substr(1, pos - 1);
+    string obj_name = path.substr(pos + 1);
+
+    rgw_bucket bucket;
+    RGWAccessControlPolicy *bucket_policy;
+    /* Buckets other than the request's own need info + ACL loaded; results are cached per name. */
+    if (bucket_name.compare(s->bucket.name) != 0) {
+      map<string, RGWAccessControlPolicy *>::iterator piter = policies.find(bucket_name);
+      if (piter != policies.end()) {
+        bucket_policy = piter->second;
+        bucket = buckets[bucket_name];
+      } else {
+        allocated_policies.push_back(RGWAccessControlPolicy(s->cct));
+        RGWAccessControlPolicy& _bucket_policy = allocated_policies.back();
+        /* std::list keeps element addresses stable, so the pointer cached below stays valid. */
+        RGWBucketInfo bucket_info;
+        map<string, bufferlist> bucket_attrs;
+        RGWObjectCtx obj_ctx(store);
+        int r = store->get_bucket_info(obj_ctx, s->user.user_id.tenant,
+              bucket_name, bucket_info, NULL, &bucket_attrs);
+        if (r < 0) {
+          ldout(s->cct, 0) << "could not get bucket info for bucket=" << bucket_name << dendl;
+          return r;
+        }
+        bucket = bucket_info.bucket;
+        rgw_obj_key no_obj;
+        bucket_policy = &_bucket_policy;
+        r = read_policy(store, s, bucket_info, bucket_attrs, bucket_policy, bucket, no_obj);
+        if (r < 0) {
+          ldout(s->cct, 0) << "failed to read bucket policy for bucket " << bucket << dendl;
+          return r;
+        }
+        buckets[bucket_name] = bucket;
+        policies[bucket_name] = bucket_policy;
+      }
+    } else {
+      bucket = s->bucket;
+      bucket_policy = s->bucket_acl;
+    }
+
+    rgw_slo_part part;
+    part.bucket_policy = bucket_policy;
+    part.bucket = bucket;
+    part.obj_name = obj_name;
+    part.size = iter->size_bytes;
+    part.etag = iter->etag;
+    ldout(s->cct, 20) << "slo_part: ofs=" << ofs
+                      << " bucket=" << part.bucket
+                      << " obj=" << part.obj_name
+                      << " size=" << part.size
+                      << " etag=" << part.etag
+                      << dendl;
+    /* Key the part by the running total, i.e. its starting offset in the assembled object. */
+    slo_parts[total_len] = part;
+    total_len += part.size;
+  }
+
+  s->obj_size = slo_info.total_size;
+  ldout(s->cct, 20) << "s->obj_size=" << s->obj_size << dendl;
+  /* A negative offset is counted back from the end, limited to the total length. */
+  if (ofs < 0) {
+    ofs = total_len - std::min(-ofs, static_cast<off_t>(total_len));
+  }
+  /* A negative or oversized end is clamped to the last byte. */
+  if (end < 0 || end >= static_cast<off_t>(total_len)) {
+    end = total_len - 1;
+  }
+  /* From here on total_len is the length of the requested range. */
+  total_len = end - ofs + 1;
+
+  int r = iterate_slo_parts(s->cct, store, ofs, end, slo_parts,
+        get_obj_user_manifest_iterate_cb, (void *)this);
+  if (r < 0) {
     return r;
+  }
 
   return 0;
 }
@@ -998,6 +1209,16 @@ void RGWGetObj::execute()
     }
     return;
   }
+  attr_iter = attrs.find(RGW_ATTR_SLO_MANIFEST);
+  if (attr_iter != attrs.end()) {
+    is_slo = true;
+    ret = handle_slo_manifest(attr_iter->second);
+    if (ret < 0) {
+      ldout(s->cct, 0) << "ERROR: failed to handle slo manifest ret=" << ret << dendl;
+      goto done_err;
+    }
+    return;
+  }
 
   /* Check whether the object has expired. Swift API documentation
    * stands that we should return 404 Not Found in such case. */
@@ -1342,6 +1563,11 @@ int RGWCreateBucket::verify_permission()
   if (!rgw_user_is_authenticated(s->user))
     return -EACCES;
 
+  if (s->user.user_id.tenant != s->bucket_tenant) {
+    ldout(s->cct, 10) << "user cannot create a bucket in a different tenant (user_id.tenant=" << s->user.user_id.tenant << " requested=" << s->bucket_tenant << ")" << dendl;
+    return -EACCES;
+  }
+
   if (s->user.max_buckets) {
     RGWUserBuckets buckets;
     string marker;
@@ -1366,8 +1592,9 @@ static int forward_request_to_master(struct req_state *s, obj_version *objv, RGW
   }
   ldout(s->cct, 0) << "sending create_bucket request to master region" << dendl;
   bufferlist response;
+  string uid_str = s->user.user_id.to_str();
 #define MAX_REST_RESPONSE (128 * 1024) // we expect a very small response
-  int ret = store->rest_master_conn->forward(s->user.user_id, s->info, objv, MAX_REST_RESPONSE, &in_data, &response);
+  int ret = store->rest_master_conn->forward(uid_str, s->info, objv, MAX_REST_RESPONSE, &in_data, &response);
   if (ret < 0)
     return ret;
 
@@ -1393,7 +1620,9 @@ void RGWCreateBucket::execute()
   bufferlist aclbl;
   bufferlist corsbl;
   bool existed;
-  rgw_obj obj(store->zone.domain_root, s->bucket_name_str);
+  string bucket_name;
+  rgw_make_bucket_entry_name(s->bucket_tenant, s->bucket_name, bucket_name);
+  rgw_obj obj(store->zone.domain_root, bucket_name);
   obj_version objv, *pobjv = NULL;
 
   ret = get_params();
@@ -1409,7 +1638,8 @@ void RGWCreateBucket::execute()
 
   /* we need to make sure we read bucket info, it's not read before for this specific request */
   RGWObjectCtx& obj_ctx = *static_cast<RGWObjectCtx *>(s->obj_ctx);
-  ret = store->get_bucket_info(obj_ctx, s->bucket_name_str, s->bucket_info, NULL, &s->bucket_attrs);
+  ret = store->get_bucket_info(obj_ctx, s->bucket_tenant, s->bucket_name,
+                               s->bucket_info, NULL, &s->bucket_attrs);
   if (ret < 0 && ret != -ENOENT)
     return;
   s->bucket_exists = (ret != -ENOENT);
@@ -1464,7 +1694,9 @@ void RGWCreateBucket::execute()
   if (s->bucket_exists) {
     string selected_placement_rule;
     rgw_bucket bucket;
-    ret = store->select_bucket_placement(s->user, region_name, placement_rule, s->bucket_name_str, bucket, &selected_placement_rule);
+    ret = store->select_bucket_placement(s->user, region_name, placement_rule,
+                                         s->bucket_tenant, s->bucket_name, bucket,
+                                         &selected_placement_rule);
     if (selected_placement_rule != s->bucket_info.placement_rule) {
       ret = -EEXIST;
       return;
@@ -1479,7 +1711,8 @@ void RGWCreateBucket::execute()
     cors_config.encode(corsbl);
     attrs[RGW_ATTR_CORS] = corsbl;
   }
-  s->bucket.name = s->bucket_name_str;
+  s->bucket.tenant = s->bucket_tenant; /* ignored if bucket exists */
+  s->bucket.name = s->bucket_name;
   ret = store->create_bucket(s->user, s->bucket, region_name, placement_rule, attrs, info, pobjv,
                              &ep_objv, creation_time, pmaster_bucket, true);
   /* continue if EEXIST and create_bucket will fail below.  this way we can recover
@@ -1507,7 +1740,7 @@ void RGWCreateBucket::execute()
 
   ret = rgw_link_bucket(store, s->user.user_id, s->bucket, info.creation_time, false);
   if (ret && !existed && ret != -EEXIST) {  /* if it exists (or previously existed), don't remove it! */
-    ret = rgw_unlink_bucket(store, s->user.user_id, s->bucket.name);
+    ret = rgw_unlink_bucket(store, s->user.user_id, s->bucket.tenant, s->bucket.name);
     if (ret < 0) {
       ldout(s->cct, 0) << "WARNING: failed to unlink bucket: ret=" << ret << dendl;
     }
@@ -1533,7 +1766,7 @@ void RGWDeleteBucket::execute()
 {
   ret = -EINVAL;
 
-  if (s->bucket_name_str.empty())
+  if (s->bucket_name.empty())
     return;
 
   RGWObjVersionTracker ot;
@@ -1559,7 +1792,7 @@ void RGWDeleteBucket::execute()
   ret = store->delete_bucket(s->bucket, ot);
 
   if (ret == 0) {
-    ret = rgw_unlink_bucket(store, s->user.user_id, s->bucket.name, false);
+    ret = rgw_unlink_bucket(store, s->user.user_id, s->bucket.tenant, s->bucket.name, false);
     if (ret < 0) {
       ldout(s->cct, 0) << "WARNING: failed to unlink bucket: ret=" << ret << dendl;
     }
@@ -1821,6 +2054,33 @@ static void encode_delete_at_attr(time_t delete_at, map<string, bufferlist>& att
   attrs[RGW_ATTR_DELETE_AT] = delatbl;
 }
 
+static int encode_dlo_manifest_attr(const char * const dlo_manifest,
+                                    map<string, bufferlist>& attrs)
+{
+  /* Reject a manifest value that lacks the '/' separator. */
+  const string manifest_str(dlo_manifest);
+  if (string::npos == manifest_str.find('/')) {
+    return -EINVAL;
+  }
+
+  /* Store the value together with its terminating NUL byte. */
+  bufferlist encoded;
+  encoded.append(manifest_str.c_str(), manifest_str.size() + 1);
+  attrs[RGW_ATTR_USER_MANIFEST] = encoded;
+  return 0;
+}
+
+static void complete_etag(MD5& hash, string *etag)
+{
+  char digest[CEPH_CRYPTO_MD5_DIGESTSIZE];
+  char digest_hex[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 16];
+  /* Finalize the running MD5 and hand its hex form back through *etag. */
+  hash.Final((byte *)digest);
+  buf_to_hex((const unsigned char *)digest, CEPH_CRYPTO_MD5_DIGESTSIZE, digest_hex);
+
+  etag->assign(digest_hex);
+}
+
 void RGWPutObj::execute()
 {
   RGWPutObjProcessor *processor = NULL;
@@ -1835,7 +2095,7 @@ void RGWPutObj::execute()
   map<string, string>::iterator iter;
   bool multipart;
 
-  bool need_calc_md5 = (obj_manifest == NULL);
+  bool need_calc_md5 = (dlo_manifest == NULL) && (slo_info == NULL);
 
 
   perfcounter->inc(l_rgw_put);
@@ -1959,6 +2219,7 @@ void RGWPutObj::execute()
     goto done;
   }
   s->obj_size = ofs;
+
   perfcounter->inc(l_rgw_put_b, s->obj_size);
 
   ret = store->check_quota(s->bucket_owner.get_id(), s->bucket,
@@ -1970,48 +2231,41 @@ void RGWPutObj::execute()
 
   if (need_calc_md5) {
     processor->complete_hash(&hash);
-    hash.Final(m);
+  }
+  hash.Final(m);
 
-    buf_to_hex(m, CEPH_CRYPTO_MD5_DIGESTSIZE, calc_md5);
-    etag = calc_md5;
+  buf_to_hex(m, CEPH_CRYPTO_MD5_DIGESTSIZE, calc_md5);
+  etag = calc_md5;
 
-    if (supplied_md5_b64 && strcmp(calc_md5, supplied_md5)) {
-      ret = -ERR_BAD_DIGEST;
-      goto done;
-    }
+  if (supplied_md5_b64 && strcmp(calc_md5, supplied_md5)) {
+    ret = -ERR_BAD_DIGEST;
+    goto done;
   }
 
   policy.encode(aclbl);
 
   attrs[RGW_ATTR_ACL] = aclbl;
-  if (obj_manifest) {
-    bufferlist manifest_bl;
-    string manifest_obj_prefix;
-    string manifest_bucket;
-
-    char etag_buf[CEPH_CRYPTO_MD5_DIGESTSIZE];
-    char etag_buf_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 16];
-
-    manifest_bl.append(obj_manifest, strlen(obj_manifest) + 1);
-    attrs[RGW_ATTR_USER_MANIFEST] = manifest_bl;
-    user_manifest_parts_hash = &hash;
-    string prefix_str = obj_manifest;
-    int pos = prefix_str.find('/');
-    if (pos < 0) {
-      ldout(s->cct, 0) << "bad user manifest, missing slash separator: " << obj_manifest << dendl;
+
+  if (dlo_manifest) {
+    ret = encode_dlo_manifest_attr(dlo_manifest, attrs);
+    if (ret < 0) {
+      ldout(s->cct, 0) << "bad user manifest: " << dlo_manifest << dendl;
       goto done;
     }
+    complete_etag(hash, &etag);
+    ldout(s->cct, 10) << __func__ << ": calculated md5 for user manifest: " << etag << dendl;
+  }
 
-    manifest_bucket = prefix_str.substr(0, pos);
-    manifest_obj_prefix = prefix_str.substr(pos + 1);
-
-    hash.Final((byte *)etag_buf);
-    buf_to_hex((const unsigned char *)etag_buf, CEPH_CRYPTO_MD5_DIGESTSIZE, etag_buf_str);
-
-    ldout(s->cct, 0) << __func__ << ": calculated md5 for user manifest: " << etag_buf_str << dendl;
+  if (slo_info) {
+    bufferlist manifest_bl;
+    ::encode(*slo_info, manifest_bl);
+    attrs[RGW_ATTR_SLO_MANIFEST] = manifest_bl;
 
-    etag = etag_buf_str;
+    hash.Update((byte *)slo_info->raw_data, slo_info->raw_data_len);
+    complete_etag(hash, &etag);
+    ldout(s->cct, 10) << __func__ << ": calculated md5 for user manifest: " << etag << dendl;
   }
+
   if (supplied_etag && etag.compare(supplied_etag) != 0) {
     ret = -ERR_UNPROCESSABLE_ENTITY;
     goto done;
@@ -2028,6 +2282,15 @@ void RGWPutObj::execute()
   rgw_get_request_metadata(s->cct, s->info, attrs);
   encode_delete_at_attr(delete_at, attrs);
 
+  /* Add a custom metadata to expose the information whether an object
+   * is an SLO or not. Appending the attribute must be performed AFTER
+   * processing any input from user in order to prohibit overwriting. */
+  if (slo_info) {
+    bufferlist slo_userindicator_bl;
+    ::encode("True", slo_userindicator_bl);
+    attrs[RGW_ATTR_SLO_UINDICATOR] = slo_userindicator_bl;
+  }
+
   ret = processor->complete(etag, &mtime, 0, attrs, delete_at, if_match, if_nomatch);
 
 done:
@@ -2243,6 +2506,9 @@ int RGWPutMetadataAccount::verify_permission()
   if (!rgw_user_is_authenticated(s->user)) {
     return -EACCES;
   }
+  // if ((s->perm_mask & RGW_PERM_WRITE) == 0) {
+  //   return -EACCES;
+  // }
   return 0;
 }
 
@@ -2279,15 +2545,9 @@ void RGWPutMetadataAccount::filter_out_temp_url(map<string, bufferlist>& add_att
 
 void RGWPutMetadataAccount::execute()
 {
-  rgw_obj obj;
   map<string, bufferlist> attrs, orig_attrs, rmattrs;
   RGWObjVersionTracker acct_op_tracker;
 
-  /* Get the name of raw object which stores the metadata in its xattrs. */
-  string buckets_obj_id;
-  rgw_get_buckets_obj(s->user.user_id, buckets_obj_id);
-  obj = rgw_obj(store->zone.user_uid_pool, buckets_obj_id);
-
   ret = get_params();
   if (ret < 0) {
     return;
@@ -2308,7 +2568,8 @@ void RGWPutMetadataAccount::execute()
     }
   }
 
-  ret = rgw_store_user_attrs(store, s->user.user_id, attrs, &rmattrs, &acct_op_tracker);
+  /* XXX tenant needed? */
+  ret = rgw_store_user_attrs(store, s->user.user_id.id, attrs, &rmattrs, &acct_op_tracker);
   if (ret < 0) {
     return;
   }
@@ -2337,7 +2598,6 @@ void RGWPutMetadataBucket::pre_exec()
 
 void RGWPutMetadataBucket::execute()
 {
-  rgw_obj obj(s->bucket, s->object);
   map<string, bufferlist> attrs, orig_attrs, rmattrs;
 
   ret = get_params();
@@ -2418,9 +2678,72 @@ void RGWPutMetadataObject::execute()
   populate_with_generic_attrs(s, attrs);
   encode_delete_at_attr(delete_at, attrs);
 
+  if (dlo_manifest) {
+    ret = encode_dlo_manifest_attr(dlo_manifest, attrs);
+    if (ret < 0) {
+      ldout(s->cct, 0) << "bad user manifest: " << dlo_manifest << dendl;
+      return;
+    }
+  }
+
   ret = store->set_attrs(s->obj_ctx, obj, attrs, &rmattrs, NULL);
 }
 
+int RGWDeleteObj::handle_slo_manifest(bufferlist& bl)
+{
+  /* Delete every entry listed in the SLO manifest held in bl, then the manifest object itself. */
+  RGWSLOInfo slo_info;
+  bufferlist::iterator bliter = bl.begin();
+  try {
+    ::decode(slo_info, bliter);
+  } catch (buffer::error& err) {
+    ldout(s->cct, 0) << "ERROR: failed to decode slo manifest" << dendl;
+    return -EIO;
+  }
+
+  try {
+    deleter.reset(new RGWBulkDelete::Deleter(store, s));
+  } catch (const std::bad_alloc&) {
+    return -ENOMEM;
+  }
+
+  list<RGWBulkDelete::acct_path_t> items;
+  for (const auto& entry : slo_info.entries) {
+    const string& path_str = entry.path;
+
+    const size_t sep_pos = path_str.find('/', 1 /* skip first slash */);
+    if (string::npos == sep_pos) {
+      return -EINVAL;
+    }
+
+    RGWBulkDelete::acct_path_t path;
+
+    string bucket_name;
+    url_decode(path_str.substr(1, sep_pos - 1), bucket_name);
+
+    string obj_name;
+    url_decode(path_str.substr(sep_pos + 1), obj_name);
+
+    path.bucket_name = bucket_name;
+    path.obj_key = obj_name;
+
+    items.push_back(path);
+  }
+
+  /* Request removal of the manifest object itself. */
+  RGWBulkDelete::acct_path_t path;
+  path.bucket_name = s->bucket_name;
+  path.obj_key = s->object;
+  items.push_back(path);
+
+  /* delete_chunk() returns a bool (always true) and records per-item
+   * failures inside the Deleter, so there is no negative error code
+   * to propagate from this call. */
+  deleter->delete_chunk(items);
+
+  return 0;
+}
+
 int RGWDeleteObj::verify_permission()
 {
   if (!verify_bucket_permission(s, RGW_PERM_WRITE))
@@ -2438,16 +2761,37 @@ void RGWDeleteObj::execute()
 {
   ret = -EINVAL;
   rgw_obj obj(s->bucket, s->object);
-  map<string, bufferlist> orig_attrs;
+  map<string, bufferlist> attrs;
+
+  ret = get_params();
+  if (ret < 0) {
+    return;
+  }
 
   if (!s->object.empty()) {
-    if (need_object_expiration()) {
+    if (need_object_expiration() || multipart_delete) {
       /* check if obj exists, read orig attrs */
-      ret = get_obj_attrs(store, s, obj, orig_attrs);
+      ret = get_obj_attrs(store, s, obj, attrs);
       if (ret < 0) {
         return;
       }
     }
+
+    if (multipart_delete) {
+      const auto slo_attr = attrs.find(RGW_ATTR_SLO_MANIFEST);
+
+      if (slo_attr != attrs.end()) {
+        ret = handle_slo_manifest(slo_attr->second);
+        if (ret < 0) {
+          ldout(s->cct, 0) << "ERROR: failed to handle slo manifest ret=" << ret << dendl;
+        }
+      } else {
+        ret = -ERR_NOT_SLO_MANIFEST;
+      }
+
+      return;
+    }
+
     RGWObjectCtx *obj_ctx = static_cast<RGWObjectCtx *>(s->obj_ctx);
 
     obj_ctx->set_atomic(obj);
@@ -2459,6 +2803,7 @@ void RGWDeleteObj::execute()
     if (ret < 0) {
       return;
     }
+
     del_op.params.bucket_owner = s->bucket_owner.get_id();
     del_op.params.versioning_status = s->bucket_info.versioning_status();
     del_op.params.obj_owner = s->owner;
@@ -2471,7 +2816,7 @@ void RGWDeleteObj::execute()
 
     /* Check whether the object has expired. Swift API documentation
      * stands that we should return 404 Not Found in such case. */
-    if (need_object_expiration() && object_is_expired(orig_attrs)) {
+    if (need_object_expiration() && object_is_expired(attrs)) {
       ret = -ENOENT;
       return;
     }
@@ -2492,7 +2837,6 @@ bool RGWCopyObj::parse_copy_location(const string& url_src, string& bucket_name,
     params_str = url_src.substr(pos + 1);
   }
 
-
   string dec_src;
 
   url_decode(name_str, dec_src);
@@ -2502,7 +2846,7 @@ bool RGWCopyObj::parse_copy_location(const string& url_src, string& bucket_name,
 
   string str(src);
 
-  pos = str.find("/");
+  pos = str.find('/');
   if (pos <= 0)
     return false;
 
@@ -2539,7 +2883,8 @@ int RGWCopyObj::verify_permission()
 
   RGWObjectCtx& obj_ctx = *static_cast<RGWObjectCtx *>(s->obj_ctx);
 
-  ret = store->get_bucket_info(obj_ctx, src_bucket_name, src_bucket_info, NULL, &src_attrs);
+  ret = store->get_bucket_info(obj_ctx, src_tenant_name, src_bucket_name,
+                               src_bucket_info, NULL, &src_attrs);
   if (ret < 0)
     return ret;
 
@@ -2568,7 +2913,8 @@ int RGWCopyObj::verify_permission()
     dest_bucket_info = src_bucket_info;
     dest_attrs = src_attrs;
   } else {
-    ret = store->get_bucket_info(obj_ctx, dest_bucket_name, dest_bucket_info, NULL, &dest_attrs);
+    ret = store->get_bucket_info(obj_ctx, dest_tenant_name, dest_bucket_name,
+                                 dest_bucket_info, NULL, &dest_attrs);
     if (ret < 0)
       return ret;
   }
@@ -3733,6 +4079,198 @@ error:
 
 }
 
+bool RGWBulkDelete::Deleter::verify_permission(RGWBucketInfo& binfo,
+                                               map<string, bufferlist>& battrs,
+                                               rgw_obj& obj,
+                                               ACLOwner& bucket_owner /* out */)
+{
+  int ret = 0;
+  /* Object variant: reads the bucket ACL and an object ACL, then checks RGW_PERM_WRITE; false on any read error. */
+  RGWAccessControlPolicy bacl(store->ctx());
+  rgw_obj_key no_obj;
+  ret = read_policy(store, s, binfo, battrs, &bacl, binfo.bucket, no_obj);
+  if (ret < 0) {
+    return false;
+  }
+  /* NOTE(review): 'obj' is never used; the object ACL below is read for s->object (the request's object) - confirm intended. */
+  RGWAccessControlPolicy oacl(s->cct);
+  ret = read_policy(store, s, binfo, battrs, &oacl, binfo.bucket, s->object);
+  if (ret < 0) {
+    return false;
+  }
+  /* The out-parameter is only written once both policies were read successfully. */
+  bucket_owner = bacl.get_owner();
+
+  return verify_object_permission(s, &bacl, &oacl, RGW_PERM_WRITE);
+}
+
+bool RGWBulkDelete::Deleter::verify_permission(RGWBucketInfo& binfo,
+                                               map<string, bufferlist>& battrs)
+{
+  /* Bucket variant: only the bucket ACL is read and checked for RGW_PERM_WRITE. */
+
+  RGWAccessControlPolicy bacl(store->ctx());
+  rgw_obj_key no_obj;
+  const int r = read_policy(store, s, binfo, battrs, &bacl, binfo.bucket, no_obj);
+  if (r < 0) {
+    return false;
+  }
+
+  return verify_bucket_permission(s, &bacl, RGW_PERM_WRITE);
+}
+
+bool RGWBulkDelete::Deleter::delete_single(const acct_path_t& path)
+{
+  int ret = 0;
+  auto& obj_ctx = *static_cast<RGWObjectCtx *>(s->obj_ctx);
+  /* Deletes one object or bucket; the bucket is looked up under the requesting user's tenant. */
+  RGWBucketInfo binfo;
+  map<string, bufferlist> battrs;
+  ret = store->get_bucket_info(obj_ctx, s->user.user_id.tenant,
+      path.bucket_name, binfo, NULL, &battrs);
+  if (ret < 0) {
+    goto binfo_fail;
+  }
+
+  if (!path.obj_key.empty()) { /* path names an object */
+    rgw_obj obj(binfo.bucket, path.obj_key);
+    obj_ctx.set_atomic(obj);
+
+    RGWRados::Object del_target(store, binfo, obj_ctx, obj);
+    RGWRados::Object::Delete del_op(&del_target);
+
+    ACLOwner owner;
+    if (!verify_permission(binfo, battrs, obj, owner)) {
+      ret = -EACCES;
+      goto auth_fail;
+    }
+
+    del_op.params.bucket_owner = binfo.owner;
+    del_op.params.versioning_status = binfo.versioning_status();
+    del_op.params.obj_owner = owner;
+
+    ret = del_op.delete_obj();
+    if (ret < 0) {
+      goto delop_fail;
+    }
+  } else { /* empty object key: path names the bucket itself */
+    RGWObjVersionTracker ot;
+    ot.read_version = binfo.ep_objv;
+
+    if (!verify_permission(binfo, battrs)) {
+      ret = -EACCES;
+      goto auth_fail;
+    }
+
+    ret = store->delete_bucket(binfo.bucket, ot);
+    if (0 == ret) {
+      ret = rgw_unlink_bucket(store, binfo.owner, binfo.bucket.tenant, binfo.bucket.name, false);
+      if (ret < 0) {
+        ldout(s->cct, 0) << "WARNING: failed to unlink bucket: ret=" << ret << dendl;
+      }
+    }
+    if (ret < 0) { /* covers a failed delete_bucket() as well as a failed unlink */
+      goto delop_fail;
+    }
+    /* When this region is not the master, the request is also forwarded to the master. */
+    if (!store->region.is_master) {
+      bufferlist in_data;
+      JSONParser jp;
+      ret = forward_request_to_master(s, &ot.read_version, store, in_data, &jp);
+      if (ret < 0) {
+        if (ret == -ENOENT) { /* adjust error,
+                               we want to return with NoSuchBucket and not NoSuchKey */
+          ret = -ERR_NO_SUCH_BUCKET;
+        }
+        goto delop_fail;
+      }
+    }
+  }
+
+  num_deleted++;
+  return true;
+
+  /* Failure exits below: each one records the outcome and returns false. */
+binfo_fail:
+    if (-ENOENT == ret) {
+      ldout(store->ctx(), 20) << "cannot find bucket = " << path.bucket_name << dendl;
+      num_unfound++;
+    } else {
+      ldout(store->ctx(), 20) << "cannot get bucket info, ret = " << ret << dendl;
+
+      fail_desc_t failed_item = {
+        .err  = ret,
+        .path = path
+      };
+      failures.push_back(failed_item);
+    }
+    return false;
+
+auth_fail:
+    ldout(store->ctx(), 20) << "wrong auth for " << path << dendl;
+    {
+      fail_desc_t failed_item = {
+        .err  = ret,
+        .path = path
+      };
+      failures.push_back(failed_item);
+    }
+    return false;
+
+delop_fail:
+    if (-ENOENT == ret) { /* a missing target counts as "unfound", not as a failure */
+      ldout(store->ctx(), 20) << "cannot find entry " << path << dendl;
+      num_unfound++;
+    } else {
+      fail_desc_t failed_item = {
+        .err  = ret,
+        .path = path
+      };
+      failures.push_back(failed_item);
+    }
+    return false;
+}
+
+bool RGWBulkDelete::Deleter::delete_chunk(const std::list<acct_path_t>& paths)
+{
+  ldout(store->ctx(), 20) << "in delete_chunk" << dendl;
+  for (const auto& path : paths) { /* by reference: no per-item copy */
+    ldout(store->ctx(), 20) << "bulk deleting path: " << path << dendl;
+    delete_single(path);
+  }
+  /* Per-item outcomes are recorded by delete_single(); this call itself always reports true. */
+  return true;
+}
+
+int RGWBulkDelete::verify_permission()
+{
+  return 0; /* no op-level check here; Deleter::delete_single() verifies permission per item */
+}
+
+void RGWBulkDelete::pre_exec()
+{
+  rgw_bucket_object_pre_exec(s); /* delegates to the shared bucket/object pre-exec helper */
+}
+
+void RGWBulkDelete::execute()
+{
+  deleter = std::unique_ptr<Deleter>(new Deleter(store, s));
+  /* Pull the list of paths chunk by chunk until get_data() stops reporting truncation. */
+  bool is_truncated = false;
+  do {
+    list<RGWBulkDelete::acct_path_t> items;
+    /* Store into the member 'ret' (no shadowing local) so a get_data() error stays visible on the op. */
+    ret = get_data(items, &is_truncated);
+    if (ret < 0) {
+      return;
+    }
+
+    deleter->delete_chunk(items); /* per-item results are kept inside the Deleter */
+  } while (!ret && is_truncated);
+
+  return;
+}
+
 RGWHandler::~RGWHandler()
 {
 }
diff --git a/src/rgw/rgw_op.h b/src/rgw/rgw_op.h
index ee6cc2a..3c3d873 100644
--- a/src/rgw/rgw_op.h
+++ b/src/rgw/rgw_op.h
@@ -13,6 +13,7 @@
 
 #include <limits.h>
 
+#include <memory>
 #include <string>
 #include <set>
 #include <map>
@@ -64,6 +65,7 @@ enum RGWOpType {
   RGW_OP_LIST_MULTIPART,
   RGW_OP_LIST_BUCKET_MULTIPARTS,
   RGW_OP_DELETE_MULTI_OBJ,
+  RGW_OP_BULK_DELETE
 };
 
 /**
@@ -140,6 +142,7 @@ protected:
   bool skip_manifest;
   rgw_obj obj;
   utime_t gc_invalidate_time;
+  bool is_slo;
 
   int init_common();
 public:
@@ -163,6 +166,7 @@ public:
     range_parsed = false;
     skip_manifest = false;
     ret = 0;
+    is_slo = false;
  }
 
   bool prefetch_data();
@@ -173,8 +177,13 @@ public:
   int verify_permission();
   void pre_exec();
   void execute();
-  int read_user_manifest_part(rgw_bucket& bucket, RGWObjEnt& ent, RGWAccessControlPolicy *bucket_policy, off_t start_ofs, off_t end_ofs);
+  int read_user_manifest_part(rgw_bucket& bucket,
+                              const RGWObjEnt& ent,
+                              RGWAccessControlPolicy *bucket_policy,
+                              off_t start_ofs,
+                              off_t end_ofs);
   int handle_user_manifest(const char *prefix);
+  int handle_slo_manifest(bufferlist& bl);
 
   int get_data_cb(bufferlist& bl, off_t ofs, off_t len);
 
@@ -188,6 +197,87 @@ public:
   virtual bool need_object_expiration() { return false; }
 };
 
+class RGWBulkDelete : public RGWOp {
+public:
+  struct acct_path_t { /* one deletion target */
+    std::string bucket_name;
+    rgw_obj_key obj_key; /* empty key: the bucket itself is the target */
+  };
+
+  struct fail_desc_t { /* one recorded failure */
+    int err; /* negative error code */
+    acct_path_t path;
+  };
+
+  class Deleter {
+  protected:
+    unsigned int num_deleted; /* items removed successfully */
+    unsigned int num_unfound; /* items that failed with -ENOENT */
+    std::list<fail_desc_t> failures; /* every other failure */
+
+    RGWRados * const store;
+    req_state * const s;
+
+  public:
+    Deleter(RGWRados * const str, req_state * const s)
+      : num_deleted(0),
+        num_unfound(0),
+        store(str),
+        s(s) {
+    }
+
+    unsigned int get_num_deleted() const {
+      return num_deleted;
+    }
+
+    unsigned int get_num_unfound() const {
+      return num_unfound;
+    }
+    /* Returned by const reference (no copy); the reference is valid only while this Deleter exists. */
+    const std::list<fail_desc_t>& get_failures() const {
+      return failures;
+    }
+
+    bool verify_permission(RGWBucketInfo& binfo,
+                           map<string, bufferlist>& battrs,
+                           rgw_obj& obj,
+                           ACLOwner& bucket_owner /* out */);
+    bool verify_permission(RGWBucketInfo& binfo,
+                           map<string, bufferlist>& battrs);
+    bool delete_single(const acct_path_t& path);
+    bool delete_chunk(const std::list<acct_path_t>& paths);
+  };
+  /* End of Deleter nested class */
+
+  static const size_t MAX_CHUNK_ENTRIES = 1024; /* presumably a per-chunk cap for get_data() implementations - confirm */
+
+protected:
+  int ret;
+  std::unique_ptr<Deleter> deleter; /* created in execute() */
+
+public:
+  RGWBulkDelete()
+    : ret(0),
+      deleter(nullptr) {
+  }
+
+  int verify_permission();
+  void pre_exec();
+  void execute();
+  /* Fills 'items' with the next chunk of paths and sets *is_truncated when more remain. */
+  virtual int get_data(std::list<acct_path_t>& items,
+                       bool * is_truncated) = 0;
+  virtual void send_response() = 0;
+
+  virtual const string name() { return "bulk_delete"; }
+  virtual RGWOpType get_type() { return RGW_OP_BULK_DELETE; }
+  virtual uint32_t op_mask() { return RGW_OP_TYPE_DELETE; }
+};
+/* Prints a deletion target as "<bucket_name>/<obj_key>". */
+inline ostream& operator<<(ostream& out, const RGWBulkDelete::acct_path_t &o) {
+  return out << o.bucket_name << "/" << o.obj_key;
+}
+
 #define RGW_LIST_BUCKETS_LIMIT_MAX 10000
 
 class RGWListBuckets : public RGWOp {
@@ -419,6 +509,62 @@ public:
   virtual uint32_t op_mask() { return RGW_OP_TYPE_DELETE; }
 };
 
+struct rgw_slo_entry { /* one entry of a static large object (SLO) manifest */
+  string path; /* parsed by handle_slo_manifest() as "/<bucket>/<object>" */
+  string etag;
+  uint64_t size_bytes; /* used as the part size when assembling the object */
+
+  rgw_slo_entry() : size_bytes(0) {}
+
+  void encode(bufferlist& bl) const { /* field order must match decode() below */
+    ENCODE_START(1, 1, bl);
+    ::encode(path, bl);
+    ::encode(etag, bl);
+    ::encode(size_bytes, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::iterator& bl) { /* reads the fields in the order encode() wrote them */
+     DECODE_START(1, bl);
+     ::decode(path, bl);
+     ::decode(etag, bl);
+     ::decode(size_bytes, bl);
+     DECODE_FINISH(bl);
+  }
+
+  void decode_json(JSONObj *obj); /* defined elsewhere */
+};
+WRITE_CLASS_ENCODER(rgw_slo_entry)
+
+struct RGWSLOInfo { /* decoded SLO manifest: the entry list plus the total object size */
+  vector<rgw_slo_entry> entries;
+  uint64_t total_size;
+
+  /* in memory only */
+  char *raw_data; /* released with free() in the destructor, so it must come from malloc() or be NULL */
+  int raw_data_len;
+  /* total_size is zeroed too, so encode() never serializes an indeterminate value. */
+  RGWSLOInfo() : total_size(0), raw_data(NULL), raw_data_len(0) {}
+  ~RGWSLOInfo() {
+    free(raw_data); /* NOTE(review): copying this struct would free raw_data twice - confirm it is never copied */
+  }
+
+  void encode(bufferlist& bl) const { /* raw_data is deliberately not serialized */
+    ENCODE_START(1, 1, bl);
+    ::encode(entries, bl);
+    ::encode(total_size, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::iterator& bl) {
+     DECODE_START(1, bl);
+     ::decode(entries, bl);
+     ::decode(total_size, bl);
+     DECODE_FINISH(bl);
+  }
+};
+WRITE_CLASS_ENCODER(RGWSLOInfo)
+
 class RGWPutObj : public RGWOp {
 
   friend class RGWPutObjProcessor;
@@ -433,30 +579,30 @@ protected:
   string etag;
   bool chunked_upload;
   RGWAccessControlPolicy policy;
-  const char *obj_manifest;
-  time_t mtime;
-
-  MD5 *user_manifest_parts_hash;
+  const char *dlo_manifest;
+  RGWSLOInfo *slo_info;
 
+  time_t mtime;
   uint64_t olh_epoch;
   string version_id;
 
   time_t delete_at;
 
 public:
-  RGWPutObj() {
-    ret = 0;
-    ofs = 0;
-    supplied_md5_b64 = NULL;
-    supplied_etag = NULL;
-    if_match = NULL;
-    if_nomatch = NULL;
-    chunked_upload = false;
-    obj_manifest = NULL;
-    mtime = 0;
-    user_manifest_parts_hash = NULL;
-    olh_epoch = 0;
-    delete_at = 0;
+  RGWPutObj() : ret(0), ofs(0),
+                supplied_md5_b64(NULL),
+                supplied_etag(NULL),
+                if_match(NULL),
+                if_nomatch(NULL),
+                chunked_upload(0),
+                dlo_manifest(NULL),
+                slo_info(NULL),
+                mtime(0),
+                olh_epoch(0),
+                delete_at(0) {}
+
+  ~RGWPutObj() {
+    delete slo_info;
   }
 
   virtual void init(RGWRados *store, struct req_state *s, RGWHandler *h) {
@@ -589,10 +735,13 @@ protected:
   RGWAccessControlPolicy policy;
   string placement_rule;
   time_t delete_at;
+  const char *dlo_manifest;
 
 public:
   RGWPutMetadataObject()
-    : ret(0), delete_at(0)
+    : ret(0),
+      delete_at(0),
+      dlo_manifest(NULL)
   {}
 
   virtual void init(RGWRados *store, struct req_state *s, RGWHandler *h) {
@@ -615,15 +764,24 @@ class RGWDeleteObj : public RGWOp {
 protected:
   int ret;
   bool delete_marker;
+  bool multipart_delete;
   string version_id;
+  std::unique_ptr<RGWBulkDelete::Deleter> deleter;
 
 public:
-  RGWDeleteObj() : ret(0), delete_marker(false) {}
+  RGWDeleteObj()
+    : ret(0),
+      delete_marker(false),
+      multipart_delete(false),
+      deleter(nullptr) {
+  }
 
   int verify_permission();
   void pre_exec();
   void execute();
+  int handle_slo_manifest(bufferlist& bl);
 
+  virtual int get_params() { return 0; };
   virtual void send_response() = 0;
   virtual const string name() { return "delete_obj"; }
   virtual RGWOpType get_type() { return RGW_OP_DELETE_OBJ; }
@@ -647,10 +805,10 @@ protected:
   time_t *unmod_ptr;
   int ret;
   map<string, bufferlist> attrs;
-  string src_bucket_name;
+  string src_tenant_name, src_bucket_name;
   rgw_bucket src_bucket;
   rgw_obj_key src_object;
-  string dest_bucket_name;
+  string dest_tenant_name, dest_bucket_name;
   rgw_bucket dest_bucket;
   string dest_object;
   time_t src_mtime;
@@ -694,7 +852,9 @@ public:
     delete_at = 0;
   }
 
-  static bool parse_copy_location(const string& src, string& bucket_name, rgw_obj_key& object);
+  static bool parse_copy_location(const string& src,
+                                  string& bucket_name,
+                                  rgw_obj_key& object);
 
   virtual void init(RGWRados *store, struct req_state *s, RGWHandler *h) {
     RGWOp::init(store, s, h);
@@ -1084,13 +1244,13 @@ public:
   virtual uint32_t op_mask() { return RGW_OP_TYPE_READ; }
 };
 
+
 class RGWDeleteMultiObj : public RGWOp {
 protected:
   int ret;
   int max_to_delete;
   size_t len;
   char *data;
-  string bucket_name;
   rgw_bucket bucket;
   bool quiet;
   bool status_dumped;
diff --git a/src/rgw/rgw_quota.cc b/src/rgw/rgw_quota.cc
index d29cdd2..03c152d 100644
--- a/src/rgw/rgw_quota.cc
+++ b/src/rgw/rgw_quota.cc
@@ -58,14 +58,14 @@ protected:
     }
   };
 
-  virtual int fetch_stats_from_storage(const string& user, rgw_bucket& bucket, RGWStorageStats& stats) = 0;
+  virtual int fetch_stats_from_storage(const rgw_user& user, rgw_bucket& bucket, RGWStorageStats& stats) = 0;
 
-  virtual bool map_find(const string& user, rgw_bucket& bucket, RGWQuotaCacheStats& qs) = 0;
+  virtual bool map_find(const rgw_user& user, rgw_bucket& bucket, RGWQuotaCacheStats& qs) = 0;
 
-  virtual bool map_find_and_update(const string& user, rgw_bucket& bucket, typename lru_map<T, RGWQuotaCacheStats>::UpdateContext *ctx) = 0;
-  virtual void map_add(const string& user, rgw_bucket& bucket, RGWQuotaCacheStats& qs) = 0;
+  virtual bool map_find_and_update(const rgw_user& user, rgw_bucket& bucket, typename lru_map<T, RGWQuotaCacheStats>::UpdateContext *ctx) = 0;
+  virtual void map_add(const rgw_user& user, rgw_bucket& bucket, RGWQuotaCacheStats& qs) = 0;
 
-  virtual void data_modified(const string& user, rgw_bucket& bucket) {}
+  virtual void data_modified(const rgw_user& user, rgw_bucket& bucket) {}
 public:
   RGWQuotaCache(RGWRados *_store, int size) : store(_store), stats_map(size) {
     async_refcount = new RefCountedWaitObject;
@@ -74,14 +74,14 @@ public:
     async_refcount->put_wait(); /* wait for all pending async requests to complete */
   }
 
-  int get_stats(const string& user, rgw_bucket& bucket, RGWStorageStats& stats, RGWQuotaInfo& quota);
-  void adjust_stats(const string& user, rgw_bucket& bucket, int objs_delta, uint64_t added_bytes, uint64_t removed_bytes);
+  int get_stats(const rgw_user& user, rgw_bucket& bucket, RGWStorageStats& stats, RGWQuotaInfo& quota);
+  void adjust_stats(const rgw_user& user, rgw_bucket& bucket, int objs_delta, uint64_t added_bytes, uint64_t removed_bytes);
 
   virtual bool can_use_cached_stats(RGWQuotaInfo& quota, RGWStorageStats& stats);
 
-  void set_stats(const string& user, rgw_bucket& bucket, RGWQuotaCacheStats& qs, RGWStorageStats& stats);
-  int async_refresh(const string& user, rgw_bucket& bucket, RGWQuotaCacheStats& qs);
-  void async_refresh_response(const string& user, rgw_bucket& bucket, RGWStorageStats& stats);
+  void set_stats(const rgw_user& user, rgw_bucket& bucket, RGWQuotaCacheStats& qs, RGWStorageStats& stats);
+  int async_refresh(const rgw_user& user, rgw_bucket& bucket, RGWQuotaCacheStats& qs);
+  void async_refresh_response(const rgw_user& user, rgw_bucket& bucket, RGWStorageStats& stats);
 
   class AsyncRefreshHandler {
   protected:
@@ -95,7 +95,7 @@ public:
     virtual void drop_reference() = 0;
   };
 
-  virtual AsyncRefreshHandler *allocate_refresh_handler(const string& user, rgw_bucket& bucket) = 0;
+  virtual AsyncRefreshHandler *allocate_refresh_handler(const rgw_user& user, rgw_bucket& bucket) = 0;
 };
 
 template<class T>
@@ -129,7 +129,7 @@ bool RGWQuotaCache<T>::can_use_cached_stats(RGWQuotaInfo& quota, RGWStorageStats
 }
 
 template<class T>
-int RGWQuotaCache<T>::async_refresh(const string& user, rgw_bucket& bucket, RGWQuotaCacheStats& qs)
+int RGWQuotaCache<T>::async_refresh(const rgw_user& user, rgw_bucket& bucket, RGWQuotaCacheStats& qs)
 {
   /* protect against multiple updates */
   StatsAsyncTestSet test_update;
@@ -154,7 +154,7 @@ int RGWQuotaCache<T>::async_refresh(const string& user, rgw_bucket& bucket, RGWQ
 }
 
 template<class T>
-void RGWQuotaCache<T>::async_refresh_response(const string& user, rgw_bucket& bucket, RGWStorageStats& stats)
+void RGWQuotaCache<T>::async_refresh_response(const rgw_user& user, rgw_bucket& bucket, RGWStorageStats& stats)
 {
   ldout(store->ctx(), 20) << "async stats refresh response for bucket=" << bucket << dendl;
 
@@ -168,7 +168,7 @@ void RGWQuotaCache<T>::async_refresh_response(const string& user, rgw_bucket& bu
 }
 
 template<class T>
-void RGWQuotaCache<T>::set_stats(const string& user, rgw_bucket& bucket, RGWQuotaCacheStats& qs, RGWStorageStats& stats)
+void RGWQuotaCache<T>::set_stats(const rgw_user& user, rgw_bucket& bucket, RGWQuotaCacheStats& qs, RGWStorageStats& stats)
 {
   qs.stats = stats;
   qs.expiration = ceph_clock_now(store->ctx());
@@ -180,7 +180,7 @@ void RGWQuotaCache<T>::set_stats(const string& user, rgw_bucket& bucket, RGWQuot
 }
 
 template<class T>
-int RGWQuotaCache<T>::get_stats(const string& user, rgw_bucket& bucket, RGWStorageStats& stats, RGWQuotaInfo& quota) {
+int RGWQuotaCache<T>::get_stats(const rgw_user& user, rgw_bucket& bucket, RGWStorageStats& stats, RGWQuotaInfo& quota) {
   RGWQuotaCacheStats qs;
   utime_t now = ceph_clock_now(store->ctx());
   if (map_find(user, bucket, qs)) {
@@ -231,7 +231,7 @@ public:
 
 
 template<class T>
-void RGWQuotaCache<T>::adjust_stats(const string& user, rgw_bucket& bucket, int objs_delta,
+void RGWQuotaCache<T>::adjust_stats(const rgw_user& user, rgw_bucket& bucket, int objs_delta,
                                  uint64_t added_bytes, uint64_t removed_bytes)
 {
   RGWQuotaStatsUpdate<T> update(objs_delta, added_bytes, removed_bytes);
@@ -242,10 +242,10 @@ void RGWQuotaCache<T>::adjust_stats(const string& user, rgw_bucket& bucket, int
 
 class BucketAsyncRefreshHandler : public RGWQuotaCache<rgw_bucket>::AsyncRefreshHandler,
                                   public RGWGetBucketStats_CB {
-  string user;
+  rgw_user user;
 public:
   BucketAsyncRefreshHandler(RGWRados *_store, RGWQuotaCache<rgw_bucket> *_cache,
-                            const string& _user, rgw_bucket& _bucket) :
+                            const rgw_user& _user, rgw_bucket& _bucket) :
                                       RGWQuotaCache<rgw_bucket>::AsyncRefreshHandler(_store, _cache),
                                       RGWGetBucketStats_CB(_bucket), user(_user) {}
 
@@ -291,30 +291,30 @@ void BucketAsyncRefreshHandler::handle_response(int r)
 
 class RGWBucketStatsCache : public RGWQuotaCache<rgw_bucket> {
 protected:
-  bool map_find(const string& user, rgw_bucket& bucket, RGWQuotaCacheStats& qs) {
+  bool map_find(const rgw_user& user, rgw_bucket& bucket, RGWQuotaCacheStats& qs) {
     return stats_map.find(bucket, qs);
   }
 
-  bool map_find_and_update(const string& user, rgw_bucket& bucket, lru_map<rgw_bucket, RGWQuotaCacheStats>::UpdateContext *ctx) {
+  bool map_find_and_update(const rgw_user& user, rgw_bucket& bucket, lru_map<rgw_bucket, RGWQuotaCacheStats>::UpdateContext *ctx) {
     return stats_map.find_and_update(bucket, NULL, ctx);
   }
 
-  void map_add(const string& user, rgw_bucket& bucket, RGWQuotaCacheStats& qs) {
+  void map_add(const rgw_user& user, rgw_bucket& bucket, RGWQuotaCacheStats& qs) {
     stats_map.add(bucket, qs);
   }
 
-  int fetch_stats_from_storage(const string& user, rgw_bucket& bucket, RGWStorageStats& stats);
+  int fetch_stats_from_storage(const rgw_user& user, rgw_bucket& bucket, RGWStorageStats& stats);
 
 public:
   RGWBucketStatsCache(RGWRados *_store) : RGWQuotaCache<rgw_bucket>(_store, _store->ctx()->_conf->rgw_bucket_quota_cache_size) {
   }
 
-  AsyncRefreshHandler *allocate_refresh_handler(const string& user, rgw_bucket& bucket) {
+  AsyncRefreshHandler *allocate_refresh_handler(const rgw_user& user, rgw_bucket& bucket) {
     return new BucketAsyncRefreshHandler(store, this, user, bucket);
   }
 };
 
-int RGWBucketStatsCache::fetch_stats_from_storage(const string& user, rgw_bucket& bucket, RGWStorageStats& stats)
+int RGWBucketStatsCache::fetch_stats_from_storage(const rgw_user& user, rgw_bucket& bucket, RGWStorageStats& stats)
 {
   RGWBucketInfo bucket_info;
 
@@ -341,13 +341,13 @@ int RGWBucketStatsCache::fetch_stats_from_storage(const string& user, rgw_bucket
   return 0;
 }
 
-class UserAsyncRefreshHandler : public RGWQuotaCache<string>::AsyncRefreshHandler,
+class UserAsyncRefreshHandler : public RGWQuotaCache<rgw_user>::AsyncRefreshHandler,
                                 public RGWGetUserStats_CB {
   rgw_bucket bucket;
 public:
-  UserAsyncRefreshHandler(RGWRados *_store, RGWQuotaCache<string> *_cache,
-                          const string& _user, rgw_bucket& _bucket) :
-                          RGWQuotaCache<string>::AsyncRefreshHandler(_store, _cache),
+  UserAsyncRefreshHandler(RGWRados *_store, RGWQuotaCache<rgw_user> *_cache,
+                          const rgw_user& _user, rgw_bucket& _bucket) :
+                          RGWQuotaCache<rgw_user>::AsyncRefreshHandler(_store, _cache),
                           RGWGetUserStats_CB(_user),
                           bucket(_bucket) {}
 
@@ -380,10 +380,10 @@ void UserAsyncRefreshHandler::handle_response(int r)
   cache->async_refresh_response(user, bucket, stats);
 }
 
-class RGWUserStatsCache : public RGWQuotaCache<string> {
+class RGWUserStatsCache : public RGWQuotaCache<rgw_user> {
   atomic_t down_flag;
   RWLock rwlock;
-  map<rgw_bucket, string> modified_buckets;
+  map<rgw_bucket, rgw_user> modified_buckets;
 
   /* thread, sync recent modified buckets info */
   class BucketsSyncThread : public Thread {
@@ -399,13 +399,13 @@ class RGWUserStatsCache : public RGWQuotaCache<string> {
     void *entry() {
       ldout(cct, 20) << "BucketsSyncThread: start" << dendl;
       do {
-        map<rgw_bucket, string> buckets;
+        map<rgw_bucket, rgw_user> buckets;
 
         stats->swap_modified_buckets(buckets);
 
-        for (map<rgw_bucket, string>::iterator iter = buckets.begin(); iter != buckets.end(); ++iter) {
+        for (map<rgw_bucket, rgw_user>::iterator iter = buckets.begin(); iter != buckets.end(); ++iter) {
           rgw_bucket bucket = iter->first;
-          string& user = iter->second;
+          rgw_user& user = iter->second;
           ldout(cct, 20) << "BucketsSyncThread: sync user=" << user << " bucket=" << bucket << dendl;
           int r = stats->sync_bucket(user, bucket);
           if (r < 0) {
@@ -474,26 +474,26 @@ class RGWUserStatsCache : public RGWQuotaCache<string> {
   BucketsSyncThread *buckets_sync_thread;
   UserSyncThread *user_sync_thread;
 protected:
-  bool map_find(const string& user, rgw_bucket& bucket, RGWQuotaCacheStats& qs) {
+  bool map_find(const rgw_user& user, rgw_bucket& bucket, RGWQuotaCacheStats& qs) {
     return stats_map.find(user, qs);
   }
 
-  bool map_find_and_update(const string& user, rgw_bucket& bucket, lru_map<string, RGWQuotaCacheStats>::UpdateContext *ctx) {
+  bool map_find_and_update(const rgw_user& user, rgw_bucket& bucket, lru_map<rgw_user, RGWQuotaCacheStats>::UpdateContext *ctx) {
     return stats_map.find_and_update(user, NULL, ctx);
   }
 
-  void map_add(const string& user, rgw_bucket& bucket, RGWQuotaCacheStats& qs) {
+  void map_add(const rgw_user& user, rgw_bucket& bucket, RGWQuotaCacheStats& qs) {
     stats_map.add(user, qs);
   }
 
-  int fetch_stats_from_storage(const string& user, rgw_bucket& bucket, RGWStorageStats& stats);
-  int sync_bucket(const string& user, rgw_bucket& bucket);
-  int sync_user(const string& user);
+  int fetch_stats_from_storage(const rgw_user& user, rgw_bucket& bucket, RGWStorageStats& stats);
+  int sync_bucket(const rgw_user& rgw_user, rgw_bucket& bucket);
+  int sync_user(const rgw_user& user);
   int sync_all_users();
 
-  void data_modified(const string& user, rgw_bucket& bucket);
+  void data_modified(const rgw_user& user, rgw_bucket& bucket);
 
-  void swap_modified_buckets(map<rgw_bucket, string>& out) {
+  void swap_modified_buckets(map<rgw_bucket, rgw_user>& out) {
     rwlock.get_write();
     modified_buckets.swap(out);
     rwlock.unlock();
@@ -512,7 +512,7 @@ protected:
   }
 
 public:
-  RGWUserStatsCache(RGWRados *_store, bool quota_threads) : RGWQuotaCache<string>(_store, _store->ctx()->_conf->rgw_bucket_quota_cache_size),
+  RGWUserStatsCache(RGWRados *_store, bool quota_threads) : RGWQuotaCache<rgw_user>(_store, _store->ctx()->_conf->rgw_bucket_quota_cache_size),
                                         rwlock("RGWUserStatsCache::rwlock") {
     if (quota_threads) {
       buckets_sync_thread = new BucketsSyncThread(store->ctx(), this);
@@ -528,7 +528,7 @@ public:
     stop();
   }
 
-  AsyncRefreshHandler *allocate_refresh_handler(const string& user, rgw_bucket& bucket) {
+  AsyncRefreshHandler *allocate_refresh_handler(const rgw_user& user, rgw_bucket& bucket) {
     return new UserAsyncRefreshHandler(store, this, user, bucket);
   }
 
@@ -552,7 +552,7 @@ public:
   }
 };
 
-int RGWUserStatsCache::fetch_stats_from_storage(const string& user, rgw_bucket& bucket, RGWStorageStats& stats)
+int RGWUserStatsCache::fetch_stats_from_storage(const rgw_user& user, rgw_bucket& bucket, RGWStorageStats& stats)
 {
   int r = store->get_user_stats(user, stats);
   if (r < 0) {
@@ -563,7 +563,7 @@ int RGWUserStatsCache::fetch_stats_from_storage(const string& user, rgw_bucket&
   return 0;
 }
 
-int RGWUserStatsCache::sync_bucket(const string& user, rgw_bucket& bucket)
+int RGWUserStatsCache::sync_bucket(const rgw_user& user, rgw_bucket& bucket)
 {
   int r = rgw_bucket_sync_user_stats(store, user, bucket);
   if (r < 0) {
@@ -574,10 +574,11 @@ int RGWUserStatsCache::sync_bucket(const string& user, rgw_bucket& bucket)
   return 0;
 }
 
-int RGWUserStatsCache::sync_user(const string& user)
+int RGWUserStatsCache::sync_user(const rgw_user& user)
 {
   cls_user_header header;
-  int ret = store->cls_user_get_header(user, &header);
+  string user_str = user.to_str();
+  int ret = store->cls_user_get_header(user_str, &header);
   if (ret < 0) {
     ldout(store->ctx(), 0) << "ERROR: can't read user header: ret=" << ret << dendl;
     return ret;
@@ -627,7 +628,7 @@ int RGWUserStatsCache::sync_all_users()
     for (list<string>::iterator iter = keys.begin();
          iter != keys.end() && !going_down(); 
          ++iter) {
-      string& user = *iter;
+      rgw_user user(*iter);
       ldout(store->ctx(), 20) << "RGWUserStatsCache: sync user=" << user << dendl;
       int ret = sync_user(user);
       if (ret < 0) {
@@ -645,7 +646,7 @@ done:
   return ret;
 }
 
-void RGWUserStatsCache::data_modified(const string& user, rgw_bucket& bucket)
+void RGWUserStatsCache::data_modified(const rgw_user& user, rgw_bucket& bucket)
 {
   /* racy, but it's ok */
   rwlock.get_read();
@@ -710,7 +711,7 @@ public:
       def_user_quota.enabled = true;
     }
   }
-  virtual int check_quota(const string& user, rgw_bucket& bucket,
+  virtual int check_quota(const rgw_user& user, rgw_bucket& bucket,
                           RGWQuotaInfo& user_quota, RGWQuotaInfo& bucket_quota,
 			  uint64_t num_objs, uint64_t size) {
 
@@ -764,7 +765,7 @@ public:
     return 0;
   }
 
-  virtual void update_stats(const string& user, rgw_bucket& bucket, int obj_delta, uint64_t added_bytes, uint64_t removed_bytes) {
+  virtual void update_stats(const rgw_user& user, rgw_bucket& bucket, int obj_delta, uint64_t added_bytes, uint64_t removed_bytes) {
     bucket_stats_cache.adjust_stats(user, bucket, obj_delta, added_bytes, removed_bytes);
     user_stats_cache.adjust_stats(user, bucket, obj_delta, added_bytes, removed_bytes);
   }
diff --git a/src/rgw/rgw_quota.h b/src/rgw/rgw_quota.h
index c74cdce..d951288 100644
--- a/src/rgw/rgw_quota.h
+++ b/src/rgw/rgw_quota.h
@@ -62,11 +62,11 @@ public:
   RGWQuotaHandler() {}
   virtual ~RGWQuotaHandler() {
   }
-  virtual int check_quota(const string& bucket_owner, rgw_bucket& bucket,
+  virtual int check_quota(const rgw_user& bucket_owner, rgw_bucket& bucket,
                           RGWQuotaInfo& user_quota, RGWQuotaInfo& bucket_quota,
 			  uint64_t num_objs, uint64_t size) = 0;
 
-  virtual void update_stats(const string& bucket_owner, rgw_bucket& bucket, int obj_delta, uint64_t added_bytes, uint64_t removed_bytes) = 0;
+  virtual void update_stats(const rgw_user& bucket_owner, rgw_bucket& bucket, int obj_delta, uint64_t added_bytes, uint64_t removed_bytes) = 0;
 
   static RGWQuotaHandler *generate_handler(RGWRados *store, bool quota_threads);
   static void free_handler(RGWQuotaHandler *handler);
diff --git a/src/rgw/rgw_rados.cc b/src/rgw/rgw_rados.cc
index 6e93bee..3e38e9e 100644
--- a/src/rgw/rgw_rados.cc
+++ b/src/rgw/rgw_rados.cc
@@ -1305,7 +1305,6 @@ class RGWWatcher : public librados::WatchCtx2 {
         watcher->reinit();
       }
   };
-  shared_ptr<C_ReinitWatch> reinit_watch;
 public:
   RGWWatcher(RGWRados *r, int i, const string& o) : rados(r), index(i), oid(o), watch_handle(0) {}
   void handle_notify(uint64_t notify_id,
@@ -1326,8 +1325,7 @@ public:
     lderr(rados->ctx()) << "RGWWatcher::handle_error cookie " << cookie
 			<< " err " << cpp_strerror(err) << dendl;
     rados->remove_watcher(index);
-    reinit_watch.reset(new C_ReinitWatch(this));
-    rados->schedule_context(reinit_watch.get());
+    rados->schedule_context(new C_ReinitWatch(this));
   }
 
   void reinit() {
@@ -1881,7 +1879,7 @@ void RGWRados::pick_control_oid(const string& key, string& notify_oid)
   notify_oid.append(buf);
 }
 
-int RGWRados::open_bucket_pool_ctx(const string& bucket_name, const string& pool, librados::IoCtx&  io_ctx)
+int RGWRados::open_bucket_pool_ctx(const string& pool, librados::IoCtx&  io_ctx)
 {
   librados::Rados *rad = get_rados_handle();
   int r = rad->ioctx_create(pool.c_str(), io_ctx);
@@ -1902,7 +1900,7 @@ int RGWRados::open_bucket_pool_ctx(const string& bucket_name, const string& pool
 
 int RGWRados::open_bucket_data_ctx(rgw_bucket& bucket, librados::IoCtx& data_ctx)
 {
-  int r = open_bucket_pool_ctx(bucket.name, bucket.data_pool, data_ctx);
+  int r = open_bucket_pool_ctx(bucket.data_pool, data_ctx);
   if (r < 0)
     return r;
 
@@ -1912,7 +1910,7 @@ int RGWRados::open_bucket_data_ctx(rgw_bucket& bucket, librados::IoCtx& data_ctx
 int RGWRados::open_bucket_data_extra_ctx(rgw_bucket& bucket, librados::IoCtx& data_ctx)
 {
   string& pool = (!bucket.data_extra_pool.empty() ? bucket.data_extra_pool : bucket.data_pool);
-  int r = open_bucket_pool_ctx(bucket.name, pool, data_ctx);
+  int r = open_bucket_pool_ctx(pool, data_ctx);
   if (r < 0)
     return r;
 
@@ -1930,7 +1928,7 @@ void RGWRados::build_bucket_index_marker(const string& shard_id_str, const strin
 
 int RGWRados::open_bucket_index_ctx(rgw_bucket& bucket, librados::IoCtx& index_ctx)
 {
-  int r = open_bucket_pool_ctx(bucket.name, bucket.index_pool, index_ctx);
+  int r = open_bucket_pool_ctx(bucket.index_pool, index_ctx);
   if (r < 0)
     return r;
 
@@ -2171,15 +2169,16 @@ int RGWRados::log_usage(map<rgw_user_bucket, RGWUsageBatch>& usage_info)
   return 0;
 }
 
-int RGWRados::read_usage(string& user, uint64_t start_epoch, uint64_t end_epoch, uint32_t max_entries,
+int RGWRados::read_usage(const rgw_user& user, uint64_t start_epoch, uint64_t end_epoch, uint32_t max_entries,
                          bool *is_truncated, RGWUsageIter& usage_iter, map<rgw_user_bucket, rgw_usage_log_entry>& usage)
 {
   uint32_t num = max_entries;
   string hash, first_hash;
-  usage_log_hash(cct, user, first_hash, 0);
+  string user_str = user.to_str();
+  usage_log_hash(cct, user_str, first_hash, 0);
 
   if (usage_iter.index) {
-    usage_log_hash(cct, user, hash, usage_iter.index);
+    usage_log_hash(cct, user_str, hash, usage_iter.index);
   } else {
     hash = first_hash;
   }
@@ -2190,7 +2189,7 @@ int RGWRados::read_usage(string& user, uint64_t start_epoch, uint64_t end_epoch,
     map<rgw_user_bucket, rgw_usage_log_entry> ret_usage;
     map<rgw_user_bucket, rgw_usage_log_entry>::iterator iter;
 
-    int ret =  cls_obj_usage_log_read(hash, user, start_epoch, end_epoch, num,
+    int ret =  cls_obj_usage_log_read(hash, user_str, start_epoch, end_epoch, num,
                                     usage_iter.read_iter, ret_usage, is_truncated);
     if (ret == -ENOENT)
       goto next;
@@ -2207,22 +2206,23 @@ int RGWRados::read_usage(string& user, uint64_t start_epoch, uint64_t end_epoch,
 next:
     if (!*is_truncated) {
       usage_iter.read_iter.clear();
-      usage_log_hash(cct, user, hash, ++usage_iter.index);
+      usage_log_hash(cct, user_str, hash, ++usage_iter.index);
     }
   } while (num && !*is_truncated && hash != first_hash);
   return 0;
 }
 
-int RGWRados::trim_usage(string& user, uint64_t start_epoch, uint64_t end_epoch)
+int RGWRados::trim_usage(rgw_user& user, uint64_t start_epoch, uint64_t end_epoch)
 {
   uint32_t index = 0;
   string hash, first_hash;
-  usage_log_hash(cct, user, first_hash, index);
+  string user_str = user.to_str();
+  usage_log_hash(cct, user_str, first_hash, index);
 
   hash = first_hash;
 
   do {
-    int ret =  cls_obj_usage_log_trim(hash, user, start_epoch, end_epoch);
+    int ret =  cls_obj_usage_log_trim(hash, user_str, start_epoch, end_epoch);
     if (ret == -ENOENT)
       goto next;
 
@@ -2230,7 +2230,7 @@ int RGWRados::trim_usage(string& user, uint64_t start_epoch, uint64_t end_epoch)
       return ret;
 
 next:
-    usage_log_hash(cct, user, hash, ++index);
+    usage_log_hash(cct, user_str, hash, ++index);
   } while (hash != first_hash);
 
   return 0;
@@ -2394,21 +2394,25 @@ int RGWRados::objexp_key_shard(const rgw_obj_key& key)
   return sid % num_shards;
 }
 
-static string objexp_hint_get_keyext(const string& bucket_name,
+static string objexp_hint_get_keyext(const string& tenant_name,
+                                     const string& bucket_name,
                                      const string& bucket_id,
                                      const rgw_obj_key& obj_key)
 {
-  return bucket_name + ":" + bucket_id + ":" + obj_key.name + ":" + obj_key.instance;
+  return tenant_name + (tenant_name.empty() ? "" : ":") + bucket_name + ":" + bucket_id +
+      ":" + obj_key.name + ":" + obj_key.instance;
 }
 
 int RGWRados::objexp_hint_add(const utime_t& delete_at,
+                              const string& tenant_name,
                               const string& bucket_name,
                               const string& bucket_id,
                               const rgw_obj_key& obj_key)
 {
-  const string keyext = objexp_hint_get_keyext(bucket_name,
+  const string keyext = objexp_hint_get_keyext(tenant_name, bucket_name,
           bucket_id, obj_key);
   objexp_hint_entry he = {
+      .tenant = tenant_name,
       .bucket_name = bucket_name,
       .bucket_id = bucket_id,
       .obj_key = obj_key,
@@ -2805,7 +2809,9 @@ int RGWRados::create_bucket(RGWUserInfo& owner, rgw_bucket& bucket,
   string selected_placement_rule;
   for (int i = 0; i < MAX_CREATE_RETRIES; i++) {
     int ret = 0;
-    ret = select_bucket_placement(owner, region_name, placement_rule, bucket.name, bucket, &selected_placement_rule);
+    ret = select_bucket_placement(owner, region_name, placement_rule,
+                                  bucket.tenant, bucket.name, bucket,
+                                  &selected_placement_rule);
     if (ret < 0)
       return ret;
     bufferlist bl;
@@ -2832,9 +2838,6 @@ int RGWRados::create_bucket(RGWUserInfo& owner, rgw_bucket& bucket,
       bucket.bucket_id = pmaster_bucket->bucket_id;
     }
 
-    string dir_oid =  dir_oid_prefix;
-    dir_oid.append(bucket.marker);
-
     r = init_bucket_index(bucket, bucket_index_max_shards);
     if (r < 0)
       return r;
@@ -2864,7 +2867,7 @@ int RGWRados::create_bucket(RGWUserInfo& owner, rgw_bucket& bucket,
       RGWObjVersionTracker instance_ver = info.objv_tracker;
       info.objv_tracker.clear();
       RGWObjectCtx obj_ctx(this);
-      r = get_bucket_info(obj_ctx, bucket.name, info, NULL, NULL);
+      r = get_bucket_info(obj_ctx, bucket.tenant, bucket.name, info, NULL, NULL);
       if (r < 0) {
         if (r == -ENOENT) {
           continue;
@@ -2906,7 +2909,7 @@ int RGWRados::create_bucket(RGWUserInfo& owner, rgw_bucket& bucket,
 }
 
 int RGWRados::select_new_bucket_location(RGWUserInfo& user_info, const string& region_name, const string& request_rule,
-                                         const string& bucket_name, rgw_bucket& bucket, string *pselected_rule)
+                                         const string& tenant_name, const string& bucket_name, rgw_bucket& bucket, string *pselected_rule)
 {
   /* first check that rule exists within the specific region */
   map<string, RGWRegion>::iterator riter = region_map.regions.find(region_name);
@@ -2948,18 +2951,19 @@ int RGWRados::select_new_bucket_location(RGWUserInfo& user_info, const string& r
   if (pselected_rule)
     *pselected_rule = rule;
   
-  return set_bucket_location_by_rule(rule, bucket_name, bucket);
+  return set_bucket_location_by_rule(rule, tenant_name, bucket_name, bucket);
 }
 
-int RGWRados::set_bucket_location_by_rule(const string& location_rule, const std::string& bucket_name, rgw_bucket& bucket)
+int RGWRados::set_bucket_location_by_rule(const string& location_rule, const string& tenant_name, const string& bucket_name, rgw_bucket& bucket)
 {
+  bucket.tenant = tenant_name;
   bucket.name = bucket_name;
 
   if (location_rule.empty()) {
     /* we can only reach here if we're trying to set a bucket location from a bucket
      * created on a different zone, using a legacy / default pool configuration
      */
-    return select_legacy_bucket_placement(bucket_name, bucket);
+    return select_legacy_bucket_placement(tenant_name, bucket_name, bucket);
   }
 
   /*
@@ -2987,23 +2991,24 @@ int RGWRados::set_bucket_location_by_rule(const string& location_rule, const std
   bucket.index_pool = placement_info.index_pool;
 
   return 0;
-
 }
 
 int RGWRados::select_bucket_placement(RGWUserInfo& user_info, const string& region_name, const string& placement_rule,
-                                      const string& bucket_name, rgw_bucket& bucket, string *pselected_rule)
+                                      const string& tenant_name, const string& bucket_name, rgw_bucket& bucket,
+                                      string *pselected_rule)
 {
   if (!zone.placement_pools.empty()) {
-    return select_new_bucket_location(user_info, region_name, placement_rule, bucket_name, bucket, pselected_rule);
+    return select_new_bucket_location(user_info, region_name, placement_rule,
+                                      tenant_name, bucket_name, bucket, pselected_rule);
   }
 
   if (pselected_rule)
     pselected_rule->clear();
 
-  return select_legacy_bucket_placement(bucket_name, bucket);
+  return select_legacy_bucket_placement(tenant_name, bucket_name, bucket);
 }
 
-int RGWRados::select_legacy_bucket_placement(const string& bucket_name, rgw_bucket& bucket)
+int RGWRados::select_legacy_bucket_placement(const string& tenant_name, const string& bucket_name, rgw_bucket& bucket)
 {
   bufferlist map_bl;
   map<string, bufferlist> m;
@@ -3076,7 +3081,6 @@ read_omap:
   }
   bucket.data_pool = pool_name;
   bucket.index_pool = pool_name;
-  bucket.name = bucket_name;
 
   return 0;
 
@@ -3638,7 +3642,8 @@ int RGWRados::Object::Write::write_meta(uint64_t size,
     rgw_obj_key obj_key;
     obj.get_index_key(&obj_key);
 
-    r = store->objexp_hint_add(utime_t(meta.delete_at, 0), bucket.name, bucket.bucket_id, obj_key);
+    r = store->objexp_hint_add(utime_t(meta.delete_at, 0),
+            bucket.tenant, bucket.name, bucket.bucket_id, obj_key);
     if (r < 0) {
       ldout(store->ctx(), 0) << "ERROR: objexp_hint_add() returned r=" << r << ", object will not get removed" << dendl;
       /* ignoring error, nothing we can do at this point */
@@ -3950,7 +3955,7 @@ int RGWRados::rewrite_obj(RGWBucketInfo& dest_bucket_info, rgw_obj& obj)
 }
 
 int RGWRados::fetch_remote_obj(RGWObjectCtx& obj_ctx,
-               const string& user_id,
+               const rgw_user& user_id,
                const string& client_id,
                const string& op_id,
                req_info *info,
@@ -4108,7 +4113,7 @@ set_err_state:
 int RGWRados::copy_obj_to_remote_dest(RGWObjState *astate,
                                       map<string, bufferlist>& src_attrs,
                                       RGWRados::Object::Read& read_op,
-                                      const string& user_id,
+                                      const rgw_user& user_id,
                                       rgw_obj& dest_obj,
                                       time_t *mtime)
 {
@@ -4147,7 +4152,7 @@ int RGWRados::copy_obj_to_remote_dest(RGWObjState *astate,
  * Returns: 0 on success, -ERR# otherwise.
  */
 int RGWRados::copy_obj(RGWObjectCtx& obj_ctx,
-               const string& user_id,
+               const rgw_user& user_id,
                const string& client_id,
                const string& op_id,
                req_info *info,
@@ -4538,7 +4543,7 @@ int RGWRados::delete_bucket(rgw_bucket& bucket, RGWObjVersionTracker& objv_track
     }
   } while (is_truncated);
 
-  r = rgw_bucket_delete_bucket_obj(this, bucket.name, objv_tracker);
+  r = rgw_bucket_delete_bucket_obj(this, bucket.tenant, bucket.name, objv_tracker);
   if (r < 0)
     return r;
 
@@ -4561,7 +4566,7 @@ int RGWRados::set_bucket_owner(rgw_bucket& bucket, ACLOwner& owner)
   RGWBucketInfo info;
   map<string, bufferlist> attrs;
   RGWObjectCtx obj_ctx(this);
-  int r = get_bucket_info(obj_ctx, bucket.name, info, NULL, &attrs);
+  int r = get_bucket_info(obj_ctx, bucket.tenant, bucket.name, info, NULL, &attrs);
   if (r < 0) {
     ldout(cct, 0) << "NOTICE: get_bucket_info on bucket=" << bucket.name << " returned err=" << r << dendl;
     return r;
@@ -4595,7 +4600,7 @@ int RGWRados::set_buckets_enabled(vector<rgw_bucket>& buckets, bool enabled)
     RGWBucketInfo info;
     map<string, bufferlist> attrs;
     RGWObjectCtx obj_ctx(this);
-    int r = get_bucket_info(obj_ctx, bucket.name, info, NULL, &attrs);
+    int r = get_bucket_info(obj_ctx, bucket.tenant, bucket.name, info, NULL, &attrs);
     if (r < 0) {
       ldout(cct, 0) << "NOTICE: get_bucket_info on bucket=" << bucket.name << " returned err=" << r << ", skipping bucket" << dendl;
       ret = r;
@@ -4621,7 +4626,7 @@ int RGWRados::bucket_suspended(rgw_bucket& bucket, bool *suspended)
 {
   RGWBucketInfo bucket_info;
   RGWObjectCtx obj_ctx(this);
-  int ret = get_bucket_info(obj_ctx, bucket.name, bucket_info, NULL);
+  int ret = get_bucket_info(obj_ctx, bucket.tenant, bucket.name, bucket_info, NULL);
   if (ret < 0) {
     return ret;
   }
@@ -4901,7 +4906,7 @@ int RGWRados::Object::Delete::delete_obj()
 
       struct rgw_bucket_dir_entry_meta meta;
 
-      meta.owner = params.obj_owner.get_id();
+      meta.owner = params.obj_owner.get_id().to_str();
       meta.owner_display_name = params.obj_owner.get_display_name();
       meta.mtime = ceph_clock_now(store->ctx());
 
@@ -5551,7 +5556,7 @@ int RGWRados::set_attrs(void *ctx, rgw_obj& obj,
         rgw_obj_key obj_key;
         obj.get_index_key(&obj_key);
 
-        objexp_hint_add(ts, bucket.name, bucket.bucket_id, obj_key);
+        objexp_hint_add(ts, bucket.tenant, bucket.name, bucket.bucket_id, obj_key);
       } catch (buffer::error& err) {
 	ldout(cct, 0) << "ERROR: failed to decode " RGW_ATTR_DELETE_AT << " attr" << dendl;
       }
@@ -5641,6 +5646,7 @@ int RGWRados::set_attrs(void *ctx, rgw_obj& obj,
  *          (if get_data==true) length of read data,
  *          (if get_data==false) length of the object
  */
+// P3 XXX get_data is not seen used anywhere.
 int RGWRados::Object::Read::prepare(int64_t *pofs, int64_t *pend)
 {
   RGWRados *store = source->get_store();
@@ -7432,10 +7438,12 @@ public:
   }
 };
 
-int RGWRados::get_user_stats(const string& user, RGWStorageStats& stats)
+int RGWRados::get_user_stats(const rgw_user& user, RGWStorageStats& stats)
 {
+  string user_str = user.to_str();
+
   cls_user_header header;
-  int r = cls_user_get_header(user, &header);
+  int r = cls_user_get_header(user_str, &header);
   if (r < 0)
     return r;
 
@@ -7448,10 +7456,12 @@ int RGWRados::get_user_stats(const string& user, RGWStorageStats& stats)
   return 0;
 }
 
-int RGWRados::get_user_stats_async(const string& user, RGWGetUserStats_CB *ctx)
+int RGWRados::get_user_stats_async(const rgw_user& user, RGWGetUserStats_CB *ctx)
 {
+  string user_str = user.to_str();
+
   RGWGetUserStatsContext *get_ctx = new RGWGetUserStatsContext(ctx);
-  int r = cls_user_get_header_async(user, get_ctx);
+  int r = cls_user_get_header_async(user_str, get_ctx);
   if (r < 0) {
     ctx->put();
     delete get_ctx;
@@ -7463,7 +7473,11 @@ int RGWRados::get_user_stats_async(const string& user, RGWGetUserStats_CB *ctx)
 
 void RGWRados::get_bucket_instance_entry(rgw_bucket& bucket, string& entry)
 {
-  entry = bucket.name + ":" + bucket.bucket_id;
+  if (bucket.tenant.empty()) {
+    entry = bucket.name + ":" + bucket.bucket_id;
+  } else {
+    entry = bucket.tenant + ":" + bucket.name + ":" + bucket.bucket_id;
+  }
 }
 
 void RGWRados::get_bucket_meta_oid(rgw_bucket& bucket, string& oid)
@@ -7533,7 +7547,9 @@ int RGWRados::get_bucket_instance_from_oid(RGWObjectCtx& obj_ctx, string& oid, R
   return 0;
 }
 
-int RGWRados::get_bucket_entrypoint_info(RGWObjectCtx& obj_ctx, const string& bucket_name,
+int RGWRados::get_bucket_entrypoint_info(RGWObjectCtx& obj_ctx,
+                                         const string& tenant_name,
+                                         const string& bucket_name,
                                          RGWBucketEntryPoint& entry_point,
                                          RGWObjVersionTracker *objv_tracker,
                                          time_t *pmtime,
@@ -7541,8 +7557,10 @@ int RGWRados::get_bucket_entrypoint_info(RGWObjectCtx& obj_ctx, const string& bu
                                          rgw_cache_entry_info *cache_info)
 {
   bufferlist bl;
+  string bucket_entry;
 
-  int ret = rgw_get_system_obj(this, obj_ctx, zone.domain_root, bucket_name, bl, objv_tracker, pmtime, pattrs, cache_info);
+  rgw_make_bucket_entry_name(tenant_name, bucket_name, bucket_entry);
+  int ret = rgw_get_system_obj(this, obj_ctx, zone.domain_root, bucket_entry, bl, objv_tracker, pmtime, pattrs, cache_info);
   if (ret < 0) {
     return ret;
   }
@@ -7557,7 +7575,9 @@ int RGWRados::get_bucket_entrypoint_info(RGWObjectCtx& obj_ctx, const string& bu
   return 0;
 }
 
-int RGWRados::convert_old_bucket_info(RGWObjectCtx& obj_ctx, string& bucket_name)
+int RGWRados::convert_old_bucket_info(RGWObjectCtx& obj_ctx,
+                                      const string& tenant_name,
+                                      const string& bucket_name)
 {
   RGWBucketEntryPoint entry_point;
   time_t ep_mtime;
@@ -7567,7 +7587,7 @@ int RGWRados::convert_old_bucket_info(RGWObjectCtx& obj_ctx, string& bucket_name
 
   ldout(cct, 10) << "RGWRados::convert_old_bucket_info(): bucket=" << bucket_name << dendl;
 
-  int ret = get_bucket_entrypoint_info(obj_ctx, bucket_name, entry_point, &ot, &ep_mtime, &attrs);
+  int ret = get_bucket_entrypoint_info(obj_ctx, tenant_name, bucket_name, entry_point, &ot, &ep_mtime, &attrs);
   if (ret < 0) {
     ldout(cct, 0) << "ERROR: get_bucket_entrypont_info() returned " << ret << " bucket=" << bucket_name << dendl;
     return ret;
@@ -7592,11 +7612,15 @@ int RGWRados::convert_old_bucket_info(RGWObjectCtx& obj_ctx, string& bucket_name
   return 0;
 }
 
-int RGWRados::get_bucket_info(RGWObjectCtx& obj_ctx, const string& bucket_name, RGWBucketInfo& info,
+int RGWRados::get_bucket_info(RGWObjectCtx& obj_ctx,
+                              const string& tenant, const string& bucket_name, RGWBucketInfo& info,
                               time_t *pmtime, map<string, bufferlist> *pattrs)
 {
   bucket_info_entry e;
-  if (binfo_cache.find(bucket_name, &e)) {
+  string bucket_entry;
+  rgw_make_bucket_entry_name(tenant, bucket_name, bucket_entry);
+
+  if (binfo_cache.find(bucket_entry, &e)) {
     info = e.info;
     if (pattrs)
       *pattrs = e.attrs;
@@ -7611,15 +7635,18 @@ int RGWRados::get_bucket_info(RGWObjectCtx& obj_ctx, const string& bucket_name,
   time_t ep_mtime;
   RGWObjVersionTracker ot;
   rgw_cache_entry_info entry_cache_info;
-  int ret = get_bucket_entrypoint_info(obj_ctx, bucket_name, entry_point, &ot, &ep_mtime, pattrs, &entry_cache_info);
+  int ret = get_bucket_entrypoint_info(obj_ctx, tenant, bucket_name, entry_point, &ot, &ep_mtime, pattrs, &entry_cache_info);
   if (ret < 0) {
-    info.bucket.name = bucket_name; /* only init this field */
+    /* only init these fields */
+    info.bucket.tenant = tenant;
+    info.bucket.name = bucket_name;
     return ret;
   }
 
   if (entry_point.has_bucket_info) {
     info = entry_point.old_bucket_info;
     info.bucket.oid = bucket_name;
+    info.bucket.tenant = tenant;
     info.ep_objv = ot.read_version;
     ldout(cct, 20) << "rgw_get_bucket_info: old bucket info, bucket=" << info.bucket << " owner " << info.owner << dendl;
     return 0;
@@ -7648,7 +7675,9 @@ int RGWRados::get_bucket_info(RGWObjectCtx& obj_ctx, const string& bucket_name,
   e.info.ep_objv = ot.read_version;
   info = e.info;
   if (ret < 0) {
+    info.bucket.tenant = tenant;
     info.bucket.name = bucket_name;
+    // XXX and why return anything in case of an error anyway?
     return ret;
   }
 
@@ -7663,20 +7692,22 @@ int RGWRados::get_bucket_info(RGWObjectCtx& obj_ctx, const string& bucket_name,
 
 
   /* chain to both bucket entry point and bucket instance */
-  if (!binfo_cache.put(this, bucket_name, &e, cache_info_entries)) {
+  if (!binfo_cache.put(this, bucket_entry, &e, cache_info_entries)) {
     ldout(cct, 20) << "couldn't put binfo cache entry, might have raced with data changes" << dendl;
   }
 
   return 0;
 }
 
-int RGWRados::put_bucket_entrypoint_info(const string& bucket_name, RGWBucketEntryPoint& entry_point,
+int RGWRados::put_bucket_entrypoint_info(const string& tenant_name, const string& bucket_name, RGWBucketEntryPoint& entry_point,
                                          bool exclusive, RGWObjVersionTracker& objv_tracker, time_t mtime,
                                          map<string, bufferlist> *pattrs)
 {
   bufferlist epbl;
   ::encode(entry_point, epbl);
-  return rgw_bucket_store_info(this, bucket_name, epbl, exclusive, pattrs, &objv_tracker, mtime);
+  string bucket_entry;
+  rgw_make_bucket_entry_name(tenant_name, bucket_name, bucket_entry);
+  return rgw_bucket_store_info(this, bucket_entry, epbl, exclusive, pattrs, &objv_tracker, mtime);
 }
 
 int RGWRados::put_bucket_instance_info(RGWBucketInfo& info, bool exclusive,
@@ -7721,7 +7752,7 @@ int RGWRados::put_linked_bucket_info(RGWBucketInfo& info, bool exclusive, time_t
       *pep_objv = ot.write_version;
     }
   }
-  ret = put_bucket_entrypoint_info(info.bucket.name, entry_point, exclusive, ot, mtime, NULL); 
+  ret = put_bucket_entrypoint_info(info.bucket.tenant, info.bucket.name, entry_point, exclusive, ot, mtime, NULL); 
   if (ret < 0)
     return ret;
 
@@ -8220,7 +8251,7 @@ int RGWRados::cls_obj_complete_op(BucketShard& bs, RGWModifyOp op, string& tag,
   dir_meta.accounted_size = ent.size;
   dir_meta.mtime = utime_t(ent.mtime, 0);
   dir_meta.etag = ent.etag;
-  dir_meta.owner = ent.owner;
+  dir_meta.owner = ent.owner.to_str();
   dir_meta.owner_display_name = ent.owner_display_name;
   dir_meta.content_type = ent.content_type;
   dir_meta.category = category;
@@ -8580,7 +8611,7 @@ int RGWRados::check_disk_state(librados::IoCtx io_ctx,
   list_state.meta.content_type = content_type;
   if (astate->obj_tag.length() > 0)
     list_state.tag = astate->obj_tag.c_str();
-  list_state.meta.owner = owner.get_id();
+  list_state.meta.owner = owner.get_id().to_str();
   list_state.meta.owner_display_name = owner.get_display_name();
 
   list_state.exists = true;
@@ -8779,7 +8810,7 @@ int RGWRados::cls_user_update_buckets(rgw_obj& obj, list<cls_user_bucket_entry>&
   return 0;
 }
 
-int RGWRados::complete_sync_user_stats(const string& user_id)
+int RGWRados::complete_sync_user_stats(const rgw_user& user_id)
 {
   string buckets_obj_id;
   rgw_get_buckets_obj(user_id, buckets_obj_id);
@@ -8831,7 +8862,7 @@ int RGWRados::cls_user_remove_bucket(rgw_obj& obj, const cls_user_bucket& bucket
   return 0;
 }
 
-int RGWRados::check_quota(const string& bucket_owner, rgw_bucket& bucket,
+int RGWRados::check_quota(const rgw_user& bucket_owner, rgw_bucket& bucket,
                           RGWQuotaInfo& user_quota, RGWQuotaInfo& bucket_quota, uint64_t obj_size)
 {
   return quota_handler->check_quota(bucket_owner, bucket, user_quota, bucket_quota, 1, obj_size);
diff --git a/src/rgw/rgw_rados.h b/src/rgw/rgw_rados.h
index 27787ec..67adc60 100644
--- a/src/rgw/rgw_rados.h
+++ b/src/rgw/rgw_rados.h
@@ -51,7 +51,7 @@ static inline void get_obj_bucket_and_oid_loc(const rgw_obj& obj, rgw_bucket& bu
   prepend_bucket_marker(bucket, obj.get_object(), oid);
   const string& loc = obj.get_loc();
   if (!loc.empty()) {
-    prepend_bucket_marker(bucket, obj.get_loc(), locator);
+    prepend_bucket_marker(bucket, obj.get_loc(), locator); // XXX get_loc twice
   } else {
     locator.clear();
   }
@@ -996,26 +996,34 @@ struct RGWRegionMap {
 WRITE_CLASS_ENCODER(RGWRegionMap)
 
 struct objexp_hint_entry {
+  string tenant;
   string bucket_name;
   string bucket_id;
   rgw_obj_key obj_key;
   utime_t exp_time;
 
   void encode(bufferlist& bl) const {
-    ENCODE_START(1, 1, bl);
+    ENCODE_START(2, 1, bl);
     ::encode(bucket_name, bl);
     ::encode(bucket_id, bl);
     ::encode(obj_key, bl);
     ::encode(exp_time, bl);
+    ::encode(tenant, bl);
     ENCODE_FINISH(bl);
   }
 
   void decode(bufferlist::iterator& bl) {
-    DECODE_START(1, bl);
+    // XXX Do we want DECODE_START_LEGACY_COMPAT_LEN(2, 1, 1, bl); ?
+    DECODE_START(2, bl);
     ::decode(bucket_name, bl);
     ::decode(bucket_id, bl);
     ::decode(obj_key, bl);
     ::decode(exp_time, bl);
+    if (struct_v >= 2) {
+      ::decode(tenant, bl);
+    } else {
+      tenant.clear();
+    }
     DECODE_FINISH(bl);
   }
 };
@@ -1143,10 +1151,10 @@ public:
 
 class RGWGetUserStats_CB : public RefCountedObject {
 protected:
-  string user;
+  rgw_user user;
   RGWStorageStats stats;
 public:
-  RGWGetUserStats_CB(const string& _user) : user(_user) {}
+  RGWGetUserStats_CB(const rgw_user& _user) : user(_user) {}
   virtual ~RGWGetUserStats_CB() {}
   virtual void handle_response(int r) = 0;
   virtual void set_response(RGWStorageStats& _stats) {
@@ -1208,7 +1216,7 @@ class RGWRados
   int open_gc_pool_ctx();
   int open_objexp_pool_ctx();
 
-  int open_bucket_pool_ctx(const string& bucket_name, const string& pool, librados::IoCtx&  io_ctx);
+  int open_bucket_pool_ctx(const string& pool, librados::IoCtx&  io_ctx);
   int open_bucket_index_ctx(rgw_bucket& bucket, librados::IoCtx&  index_ctx);
   int open_bucket_data_ctx(rgw_bucket& bucket, librados::IoCtx&  io_ctx);
   int open_bucket_data_extra_ctx(rgw_bucket& bucket, librados::IoCtx&  io_ctx);
@@ -1406,9 +1414,9 @@ public:
 
   // log bandwidth info
   int log_usage(map<rgw_user_bucket, RGWUsageBatch>& usage_info);
-  int read_usage(string& user, uint64_t start_epoch, uint64_t end_epoch, uint32_t max_entries,
+  int read_usage(const rgw_user& user, uint64_t start_epoch, uint64_t end_epoch, uint32_t max_entries,
                  bool *is_truncated, RGWUsageIter& read_iter, map<rgw_user_bucket, rgw_usage_log_entry>& usage);
-  int trim_usage(string& user, uint64_t start_epoch, uint64_t end_epoch);
+  int trim_usage(rgw_user& user, uint64_t start_epoch, uint64_t end_epoch);
 
   virtual int create_pool(rgw_bucket& bucket);
 
@@ -1417,12 +1425,12 @@ public:
    * returns 0 on success, -ERR# otherwise.
    */
   virtual int init_bucket_index(rgw_bucket& bucket, int num_shards);
-  int select_bucket_placement(RGWUserInfo& user_info, const string& region_name, const std::string& rule,
-                              const std::string& bucket_name, rgw_bucket& bucket, string *pselected_rule);
-  int select_legacy_bucket_placement(const string& bucket_name, rgw_bucket& bucket);
+  int select_bucket_placement(RGWUserInfo& user_info, const string& region_name, const string& rule,
+                              const string& tenant_name, const string& bucket_name, rgw_bucket& bucket, string *pselected_rule);
+  int select_legacy_bucket_placement(const string& tenant_name, const string& bucket_name, rgw_bucket& bucket);
   int select_new_bucket_location(RGWUserInfo& user_info, const string& region_name, const string& rule,
-                                 const std::string& bucket_name, rgw_bucket& bucket, string *pselected_rule);
-  int set_bucket_location_by_rule(const string& location_rule, const std::string& bucket_name, rgw_bucket& bucket);
+                                 const string& tenant_name, const string& bucket_name, rgw_bucket& bucket, string *pselected_rule);
+  int set_bucket_location_by_rule(const string& location_rule, const string& tenant_name, const string& bucket_name, rgw_bucket& bucket);
   virtual int create_bucket(RGWUserInfo& owner, rgw_bucket& bucket,
                             const string& region_name,
                             const string& placement_rule,
@@ -1600,7 +1608,7 @@ public:
         const string *ptag;
         list<rgw_obj_key> *remove_objs;
         time_t set_mtime;
-        string owner;
+        rgw_user owner;
         RGWObjCategory category;
         int flags;
         const char *if_match;
@@ -1623,7 +1631,7 @@ public:
       RGWRados::Object *target;
 
       struct DeleteParams {
-        string bucket_owner;
+        rgw_user bucket_owner;
         int versioning_status;
         ACLOwner obj_owner; /* needed for creation of deletion marker */
         uint64_t olh_epoch;
@@ -1790,7 +1798,7 @@ public:
 
   int rewrite_obj(RGWBucketInfo& dest_bucket_info, rgw_obj& obj);
   int fetch_remote_obj(RGWObjectCtx& obj_ctx,
-                       const string& user_id,
+                       const rgw_user& user_id,
                        const string& client_id,
                        const string& op_id,
                        req_info *info,
@@ -1819,7 +1827,7 @@ public:
   int copy_obj_to_remote_dest(RGWObjState *astate,
                               map<string, bufferlist>& src_attrs,
                               RGWRados::Object::Read& read_op,
-                              const string& user_id,
+                              const rgw_user& user_id,
                               rgw_obj& dest_obj,
                               time_t *mtime);
   /**
@@ -1838,7 +1846,7 @@ public:
    * Returns: 0 on success, -ERR# otherwise.
    */
   virtual int copy_obj(RGWObjectCtx& obj_ctx,
-               const string& user_id,
+               const rgw_user& user_id,
                const string& client_id,
                const string& op_id,
                req_info *info,
@@ -2049,24 +2057,29 @@ public:
   int get_bucket_stats(rgw_bucket& bucket, string *bucket_ver, string *master_ver,
       map<RGWObjCategory, RGWStorageStats>& stats, string *max_marker);
   int get_bucket_stats_async(rgw_bucket& bucket, RGWGetBucketStats_CB *cb);
-  int get_user_stats(const string& user, RGWStorageStats& stats);
-  int get_user_stats_async(const string& user, RGWGetUserStats_CB *cb);
+  int get_user_stats(const rgw_user& user, RGWStorageStats& stats);
+  int get_user_stats_async(const rgw_user& user, RGWGetUserStats_CB *cb);
   void get_bucket_instance_obj(rgw_bucket& bucket, rgw_obj& obj);
   void get_bucket_instance_entry(rgw_bucket& bucket, string& entry);
   void get_bucket_meta_oid(rgw_bucket& bucket, string& oid);
 
-  int put_bucket_entrypoint_info(const string& bucket_name, RGWBucketEntryPoint& entry_point, bool exclusive, RGWObjVersionTracker& objv_tracker, time_t mtime,
+  int put_bucket_entrypoint_info(const string& tenant_name, const string& bucket_name, RGWBucketEntryPoint& entry_point,
+                                 bool exclusive, RGWObjVersionTracker& objv_tracker, time_t mtime,
                                  map<string, bufferlist> *pattrs);
   int put_bucket_instance_info(RGWBucketInfo& info, bool exclusive, time_t mtime, map<string, bufferlist> *pattrs);
-  int get_bucket_entrypoint_info(RGWObjectCtx& obj_ctx, const string& bucket_name, RGWBucketEntryPoint& entry_point, RGWObjVersionTracker *objv_tracker, time_t *pmtime,
-                                 map<string, bufferlist> *pattrs, rgw_cache_entry_info *cache_info = NULL);
+  int get_bucket_entrypoint_info(RGWObjectCtx& obj_ctx, const string& tenant_name, const string& bucket_name,
+                                 RGWBucketEntryPoint& entry_point, RGWObjVersionTracker *objv_tracker,
+                                 time_t *pmtime, map<string, bufferlist> *pattrs, rgw_cache_entry_info *cache_info = NULL);
   int get_bucket_instance_info(RGWObjectCtx& obj_ctx, const string& meta_key, RGWBucketInfo& info, time_t *pmtime, map<string, bufferlist> *pattrs);
   int get_bucket_instance_info(RGWObjectCtx& obj_ctx, rgw_bucket& bucket, RGWBucketInfo& info, time_t *pmtime, map<string, bufferlist> *pattrs);
   int get_bucket_instance_from_oid(RGWObjectCtx& obj_ctx, string& oid, RGWBucketInfo& info, time_t *pmtime, map<string, bufferlist> *pattrs,
                                    rgw_cache_entry_info *cache_info = NULL);
 
-  int convert_old_bucket_info(RGWObjectCtx& obj_ctx, string& bucket_name);
-  virtual int get_bucket_info(RGWObjectCtx& obj_ctx, const string& bucket_name, RGWBucketInfo& info,
+  int convert_old_bucket_info(RGWObjectCtx& obj_ctx, const string& tenant_name, const string& bucket_name);
+  static void make_bucket_entry_name(const string& tenant_name, const string& bucket_name, string& bucket_entry);
+  virtual int get_bucket_info(RGWObjectCtx& obj_ctx,
+                              const string& tenant_name, const string& bucket_name,
+                              RGWBucketInfo& info,
                               time_t *pmtime, map<string, bufferlist> *pattrs = NULL);
   virtual int put_linked_bucket_info(RGWBucketInfo& info, bool exclusive, time_t mtime, obj_version *pep_objv,
                                      map<string, bufferlist> *pattrs, bool create_entry_point);
@@ -2118,6 +2131,7 @@ public:
   void objexp_get_shard(int shard_num,
                         string& shard);                       /* out */
   int objexp_hint_add(const utime_t& delete_at,
+                      const string& tenant_name,
                       const string& bucket_name,
                       const string& bucket_id,
                       const rgw_obj_key& obj_key);
@@ -2174,11 +2188,11 @@ public:
   int cls_user_add_bucket(rgw_obj& obj, const cls_user_bucket_entry& entry);
   int cls_user_update_buckets(rgw_obj& obj, list<cls_user_bucket_entry>& entries, bool add);
   int cls_user_complete_stats_sync(rgw_obj& obj);
-  int complete_sync_user_stats(const string& user_id);
+  int complete_sync_user_stats(const rgw_user& user_id);
   int cls_user_add_bucket(rgw_obj& obj, list<cls_user_bucket_entry>& entries);
   int cls_user_remove_bucket(rgw_obj& obj, const cls_user_bucket& bucket);
 
-  int check_quota(const string& bucket_owner, rgw_bucket& bucket,
+  int check_quota(const rgw_user& bucket_owner, rgw_bucket& bucket,
                   RGWQuotaInfo& user_quota, RGWQuotaInfo& bucket_quota, uint64_t obj_size);
 
   string unique_id(uint64_t unique_num) {
diff --git a/src/rgw/rgw_resolve.cc b/src/rgw/rgw_resolve.cc
index a696589..f29a005 100644
--- a/src/rgw/rgw_resolve.cc
+++ b/src/rgw/rgw_resolve.cc
@@ -18,12 +18,13 @@
 #define dout_subsys ceph_subsys_rgw
 
 class RGWDNSResolver {
-  list<res_state> states;
   Mutex lock;
+#ifdef HAVE_RES_NQUERY
+  list<res_state> states;
 
   int get_state(res_state *ps);
   void put_state(res_state s);
-
+#endif
 
 public:
   ~RGWDNSResolver();
@@ -33,14 +34,16 @@ public:
 
 RGWDNSResolver::~RGWDNSResolver()
 {
+#ifdef HAVE_RES_NQUERY
   list<res_state>::iterator iter;
   for (iter = states.begin(); iter != states.end(); ++iter) {
     struct __res_state *s = *iter;
     delete s;
   }
+#endif
 }
 
-
+#ifdef HAVE_RES_NQUERY
 int RGWDNSResolver::get_state(res_state *ps)
 {
   lock.Lock();
@@ -68,18 +71,19 @@ void RGWDNSResolver::put_state(res_state s)
   Mutex::Locker l(lock);
   states.push_back(s);
 }
-
+#endif
 
 int RGWDNSResolver::resolve_cname(const string& hostname, string& cname, bool *found)
 {
-  res_state res;
-
   *found = false;
 
+#ifdef HAVE_RES_NQUERY
+  res_state res;
   int r = get_state(&res);
   if (r < 0) {
     return r;
   }
+#endif
 
   int ret;
 
@@ -91,7 +95,18 @@ int RGWDNSResolver::resolve_cname(const string& hostname, string& cname, bool *f
   const char *origname = hostname.c_str();
   unsigned char *pt, *answer;
   unsigned char *answend;
-  int len = res_nquery(res, origname, C_IN, T_CNAME, buf, sizeof(buf));
+  int len;
+
+#ifdef HAVE_RES_NQUERY
+  len = res_nquery(res, origname, C_IN, T_CNAME, buf, sizeof(buf));
+#else
+  {
+# ifndef HAVE_THREAD_SAFE_RES_QUERY
+    Mutex::Locker l(lock);
+# endif
+    len = res_query(origname, C_IN, T_CNAME, buf, sizeof(buf));
+  }
+#endif
   if (len < 0) {
     dout(20) << "res_query() failed" << dendl;
     ret = 0;
@@ -157,7 +172,9 @@ int RGWDNSResolver::resolve_cname(const string& hostname, string& cname, bool *f
   *found = true;
   ret = 0;
 done:
+#ifdef HAVE_RES_NQUERY
   put_state(res);
+#endif
   return ret;
 }
 
diff --git a/src/rgw/rgw_rest.cc b/src/rgw/rgw_rest.cc
index a73f12e..c557bc5 100644
--- a/src/rgw/rgw_rest.cc
+++ b/src/rgw/rgw_rest.cc
@@ -277,31 +277,51 @@ void rgw_flush_formatter(struct req_state *s, Formatter *formatter)
   }
 }
 
-void set_req_state_err(struct req_state *s, int err_no)
+void set_req_state_err(struct rgw_err& err,     /* out */
+                       int err_no,              /* in  */
+                       const int prot_flags)    /* in  */
 {
   const struct rgw_http_errors *r;
 
   if (err_no < 0)
     err_no = -err_no;
-  s->err.ret = -err_no;
-  if (s->prot_flags & RGW_REST_SWIFT) {
+  err.ret = -err_no;
+  if (prot_flags & RGW_REST_SWIFT) {
     r = search_err(err_no, RGW_HTTP_SWIFT_ERRORS, ARRAY_LEN(RGW_HTTP_SWIFT_ERRORS));
     if (r) {
-      s->err.http_ret = r->http_ret;
-      s->err.s3_code = r->s3_code;
+      err.http_ret = r->http_ret;
+      err.s3_code = r->s3_code;
       return;
     }
   }
   r = search_err(err_no, RGW_HTTP_ERRORS, ARRAY_LEN(RGW_HTTP_ERRORS));
   if (r) {
-    s->err.http_ret = r->http_ret;
-    s->err.s3_code = r->s3_code;
+    err.http_ret = r->http_ret;
+    err.s3_code = r->s3_code;
     return;
   }
   dout(0) << "WARNING: set_req_state_err err_no=" << err_no << " resorting to 500" << dendl;
 
-  s->err.http_ret = 500;
-  s->err.s3_code = "UnknownError";
+  err.http_ret = 500;
+  err.s3_code = "UnknownError";
+}
+
+void set_req_state_err(struct req_state * const s, const int err_no)
+{
+  if (s) {  // a null request state is ignored; nothing is recorded in that case
+    set_req_state_err(s->err, err_no, s->prot_flags);  // fill s->err using this request's protocol flags
+  }
+}
+
+void dump_errno(int http_ret, string& out) {
+  stringstream ss;
+  // Write "<http_ret> <status name>" into out; the name is looked up in http_status_names.
+  ss <<  http_ret << " " << http_status_names[http_ret];  // NOTE(review): result for a code not in the table depends on the container type -- confirm
+  out = ss.str();
+}
+
+void dump_errno(const struct rgw_err &err, string& out) {
+  dump_errno(err.http_ret, out);  // format the HTTP status held in err via the status-code overload
+}
 
 void dump_errno(struct req_state *s)
@@ -311,11 +331,11 @@ void dump_errno(struct req_state *s)
   dump_status(s, buf, http_status_names[s->err.http_ret]);
 }
 
-void dump_errno(struct req_state *s, int err)
+void dump_errno(struct req_state *s, int http_ret)
 {
   char buf[32];
-  snprintf(buf, sizeof(buf), "%d", err);
-  dump_status(s, buf, http_status_names[s->err.http_ret]);
+  snprintf(buf, sizeof(buf), "%d", http_ret);
+  dump_status(s, buf, http_status_names[http_ret]);
 }
 
 void dump_string_header(struct req_state *s, const char *name, const char *val)
@@ -366,9 +386,14 @@ void dump_bucket_from_state(struct req_state *s)
 {
   int expose_bucket = g_conf->rgw_expose_bucket;
   if (expose_bucket) {
-    if (!s->bucket_name_str.empty()) {
+    if (!s->bucket_name.empty()) {
       string b;
-      url_encode(s->bucket_name_str, b);
+      if (!s->bucket_tenant.empty()) {
+        string g = s->bucket_tenant + "/" + s->bucket_name;
+        url_encode(g, b);
+      } else {
+        url_encode(s->bucket_name, b);
+      }
       s->cio->print("Bucket: %s\r\n", b.c_str());
     }
   }
@@ -382,8 +407,12 @@ void dump_uri_from_state(struct req_state *s)
     string server = s->info.env->get("SERVER_NAME", "<SERVER_NAME>");
     location.append(server);
     location += "/";
-    if (!s->bucket_name_str.empty()) {
-      location += s->bucket_name_str;
+    if (!s->bucket_name.empty()) {
+      if (!s->bucket_tenant.empty()) {
+        location += s->bucket_tenant;
+        location += ":";
+      }
+      location += s->bucket_name;
       location += "/";
       if (!s->object.empty()) {
         location += s->object.name;
@@ -452,12 +481,12 @@ void dump_time(struct req_state *s, const char *name, time_t *t)
   s->formatter->dump_string(name, buf);
 }
 
-void dump_owner(struct req_state *s, string& id, string& name, const char *section)
+void dump_owner(struct req_state *s, rgw_user& id, string& name, const char *section)
 {
   if (!section)
     section = "Owner";
   s->formatter->open_object_section(section);
-  s->formatter->dump_string("ID", id);
+  s->formatter->dump_string("ID", id.to_str());
   s->formatter->dump_string("DisplayName", name);
   s->formatter->close_section();
 }
@@ -1072,9 +1101,8 @@ int RGWListBucketMultiparts_ObjStore::get_params()
 
 int RGWDeleteMultiObj_ObjStore::get_params()
 {
-  bucket_name = s->bucket_name_str;
 
-  if (bucket_name.empty()) {
+  if (s->bucket_name.empty()) {
     ret = -EINVAL;
     return ret;
   }
@@ -1149,13 +1177,22 @@ int RGWHandler_ObjStore::allocate_formatter(struct req_state *s, int default_typ
     }
   }
 
+  const string& mm = s->info.args.get("multipart-manifest");
+  const bool multipart_delete = (mm.compare("delete") == 0);
+
   switch (s->format) {
     case RGW_FORMAT_PLAIN:
-      s->formatter = new RGWFormatter_Plain;
-      break;
+      {
+        const bool use_kv_syntax = s->info.args.exists("bulk-delete") || multipart_delete;
+        s->formatter = new RGWFormatter_Plain(use_kv_syntax);
+        break;
+      }
     case RGW_FORMAT_XML:
-      s->formatter = new XMLFormatter(false);
-      break;
+      {
+        const bool lowercase_underscore = s->info.args.exists("bulk-delete") || multipart_delete;
+        s->formatter = new XMLFormatter(false, lowercase_underscore);
+        break;
+      }
     case RGW_FORMAT_JSON:
       s->formatter = new JSONFormatter(false);
       break;
@@ -1168,6 +1205,17 @@ int RGWHandler_ObjStore::allocate_formatter(struct req_state *s, int default_typ
   return 0;
 }
 
+// Returns 0 if t consists only of alphanumerics and '_' (empty passes), else
+// -ERR_INVALID_BUCKET_NAME. The cast keeps isalnum() defined for bytes >= 0x80.
+int RGWHandler_ObjStore::validate_tenant_name(string const& t)
+{
+  for (string::const_iterator it = t.begin(); it != t.end(); ++it) {
+    if (!isalnum(static_cast<unsigned char>(*it)) && *it != '_')
+      return -ERR_INVALID_BUCKET_NAME;
+  }
+  return 0;
+}
+
 // This function enforces Amazon's spec for bucket names.
 // (The requirements, not the recommendations.)
 int RGWHandler_ObjStore::validate_bucket_name(const string& bucket)
@@ -1392,7 +1440,7 @@ int RGWREST::preprocess(struct req_state *s, RGWClientIO *cio)
       string encoded_bucket = "/";
       encoded_bucket.append(subdomain);
       if (s->info.request_uri[0] != '/')
-        encoded_bucket.append("/'");
+        encoded_bucket.append("/");
       encoded_bucket.append(s->info.request_uri);
       s->info.request_uri = encoded_bucket;
     }
diff --git a/src/rgw/rgw_rest.h b/src/rgw/rgw_rest.h
index c3dc847..9b49a1e 100644
--- a/src/rgw/rgw_rest.h
+++ b/src/rgw/rgw_rest.h
@@ -59,6 +59,35 @@ int rgw_rest_get_json_input(CephContext *cct, req_state *s, T& out, int max_len,
   return 0;
 }
 
+template <class T>
+int rgw_rest_get_json_input_keep_data(CephContext *cct, req_state *s, T& out, int max_len, char **pdata, int *len)
+{
+  int rv, data_len;
+  char *data;
+  // Purpose: read the request body, decode it as JSON into 'out', and keep the raw bytes for the caller.
+  if ((rv = rgw_rest_read_all_input(s, &data, &data_len, max_len)) < 0) {
+    return rv;
+  }
+  // A zero-length body is rejected with -EINVAL.
+  if (!data_len) {
+    return -EINVAL;
+  }
+  // NOTE: *len is stored before parsing, so it stays set even if the parse below fails.
+  *len = data_len;
+
+  JSONParser parser;
+  // On a parse failure the buffer is freed and *pdata is left untouched.
+  if (!parser.parse(data, data_len)) {
+    free(data);
+    return -EINVAL;
+  }
+  // Decode the parsed document into 'out'.
+  decode_json_obj(out, &parser);
+  // Success: *pdata now refers to the raw buffer; presumably the caller must free() it -- TODO confirm.
+  *pdata = data;
+  return 0;
+}
+
 
 class RESTArgs {
 public:
@@ -270,6 +299,12 @@ public:
   int get_params();
 };
 
+class RGWBulkDelete_ObjStore : public RGWBulkDelete {  // subclass of RGWBulkDelete; declares only an empty ctor/dtor
+public:
+  RGWBulkDelete_ObjStore() {}
+  ~RGWBulkDelete_ObjStore() {}
+};
+
 class RGWDeleteMultiObj_ObjStore : public RGWDeleteMultiObj {
 public:
   RGWDeleteMultiObj_ObjStore() {}
@@ -304,6 +339,7 @@ protected:
   virtual RGWOp *op_copy() { return NULL; }
   virtual RGWOp *op_options() { return NULL; }
 
+  virtual int validate_tenant_name(const string& bucket);
   virtual int validate_bucket_name(const string& bucket);
   virtual int validate_object_name(const string& object);
 
@@ -367,9 +403,12 @@ public:
 
 static const int64_t NO_CONTENT_LENGTH = -1;
 
+extern void set_req_state_err(struct rgw_err &err, int err_no, int prot_flags);
 extern void set_req_state_err(struct req_state *s, int err_no);
+extern void dump_errno(int http_ret, string& out);
+extern void dump_errno(const struct rgw_err &err, string& out);
 extern void dump_errno(struct req_state *s);
-extern void dump_errno(struct req_state *s, int ret);
+extern void dump_errno(struct req_state *s, int http_ret);
 extern void end_header(struct req_state *s,
                        RGWOp *op = NULL,
                        const char *content_type = NULL,
@@ -377,7 +416,7 @@ extern void end_header(struct req_state *s,
 		       bool force_content_type = false);
 extern void dump_start(struct req_state *s);
 extern void list_all_buckets_start(struct req_state *s);
-extern void dump_owner(struct req_state *s, string& id, string& name, const char *section = NULL);
+extern void dump_owner(struct req_state *s, rgw_user& id, string& name, const char *section = NULL);
 extern void dump_string_header(struct req_state *s, const char *name, const char *val);
 extern void dump_content_length(struct req_state *s, uint64_t len);
 extern void dump_etag(struct req_state *s, const char *etag);
diff --git a/src/rgw/rgw_rest_bucket.cc b/src/rgw/rgw_rest_bucket.cc
index 772f286..bf14b1e 100644
--- a/src/rgw/rgw_rest_bucket.cc
+++ b/src/rgw/rgw_rest_bucket.cc
@@ -29,14 +29,16 @@ void RGWOp_Bucket_Info::execute()
 
   bool fetch_stats;
 
-  std::string uid;
   std::string bucket;
 
-  RESTArgs::get_string(s, "uid", uid, &uid);
+  string uid_str;
+
+  RESTArgs::get_string(s, "uid", uid_str, &uid_str);
+  rgw_user uid(uid_str);
+
   RESTArgs::get_string(s, "bucket", bucket, &bucket);
   RESTArgs::get_bool(s, "stats", false, &fetch_stats);
 
-
   op_state.set_user_id(uid);
   op_state.set_bucket_name(bucket);
   op_state.set_fetch_stats(fetch_stats);
@@ -124,16 +126,17 @@ public:
 
 void RGWOp_Bucket_Link::execute()
 {
-  std::string uid;
+  std::string uid_str;
   std::string bucket;
   std::string bucket_id;
 
   RGWBucketAdminOpState op_state;
 
-  RESTArgs::get_string(s, "uid", uid, &uid);
+  RESTArgs::get_string(s, "uid", uid_str, &uid_str);
   RESTArgs::get_string(s, "bucket", bucket, &bucket);
   RESTArgs::get_string(s, "bucket-id", bucket_id, &bucket_id);
 
+  rgw_user uid(uid_str);
   op_state.set_user_id(uid);
   op_state.set_bucket_name(bucket);
   op_state.set_bucket_id(bucket_id);
@@ -157,12 +160,14 @@ public:
 
 void RGWOp_Bucket_Unlink::execute()
 {
-  std::string uid;
+  std::string uid_str;
   std::string bucket;
 
   RGWBucketAdminOpState op_state;
 
-  RESTArgs::get_string(s, "uid", uid, &uid);
+  RESTArgs::get_string(s, "uid", uid_str, &uid_str);
+  rgw_user uid(uid_str);
+
   RESTArgs::get_string(s, "bucket", bucket, &bucket);
 
   op_state.set_user_id(uid);
diff --git a/src/rgw/rgw_rest_client.cc b/src/rgw/rgw_rest_client.cc
index 6ec44ea..4eaf42e 100644
--- a/src/rgw/rgw_rest_client.cc
+++ b/src/rgw/rgw_rest_client.cc
@@ -329,9 +329,9 @@ static void grants_by_type_add_one_grant(map<int, string>& grants_by_type, int p
     default:
       id_type_str = "id";
   }
-  string id;
+  rgw_user id;
   grant.get_id(id);
-  s.append(id_type_str + "=\"" + id + "\"");
+  s.append(id_type_str + "=\"" + id.to_str() + "\"");
 }
 
 struct grant_type_to_header {
diff --git a/src/rgw/rgw_rest_conn.cc b/src/rgw/rgw_rest_conn.cc
index cbffad0..f15d649 100644
--- a/src/rgw/rgw_rest_conn.cc
+++ b/src/rgw/rgw_rest_conn.cc
@@ -30,14 +30,15 @@ int RGWRESTConn::get_url(string& endpoint)
   return 0;
 }
 
-int RGWRESTConn::forward(const string& uid, req_info& info, obj_version *objv, size_t max_response, bufferlist *inbl, bufferlist *outbl)
+int RGWRESTConn::forward(const rgw_user& uid, req_info& info, obj_version *objv, size_t max_response, bufferlist *inbl, bufferlist *outbl)
 {
   string url;
   int ret = get_url(url);
   if (ret < 0)
     return ret;
+  string uid_str = uid.to_str();
   list<pair<string, string> > params;
-  params.push_back(pair<string, string>(RGW_SYS_PARAM_PREFIX "uid", uid));
+  params.push_back(pair<string, string>(RGW_SYS_PARAM_PREFIX "uid", uid_str));
   params.push_back(pair<string, string>(RGW_SYS_PARAM_PREFIX "region", region));
   if (objv) {
     params.push_back(pair<string, string>(RGW_SYS_PARAM_PREFIX "tag", objv->tag));
@@ -55,7 +56,7 @@ public:
     StreamObjData(rgw_obj& _obj) : obj(_obj) {}
 };
 
-int RGWRESTConn::put_obj_init(const string& uid, rgw_obj& obj, uint64_t obj_size,
+int RGWRESTConn::put_obj_init(const rgw_user& uid, rgw_obj& obj, uint64_t obj_size,
                                       map<string, bufferlist>& attrs, RGWRESTStreamWriteRequest **req)
 {
   string url;
@@ -63,8 +64,9 @@ int RGWRESTConn::put_obj_init(const string& uid, rgw_obj& obj, uint64_t obj_size
   if (ret < 0)
     return ret;
 
+  string uid_str = uid.to_str();
   list<pair<string, string> > params;
-  params.push_back(pair<string, string>(RGW_SYS_PARAM_PREFIX "uid", uid));
+  params.push_back(pair<string, string>(RGW_SYS_PARAM_PREFIX "uid", uid_str));
   params.push_back(pair<string, string>(RGW_SYS_PARAM_PREFIX "region", region));
   *req = new RGWRESTStreamWriteRequest(cct, url, NULL, &params);
   return (*req)->put_obj_init(key, obj, obj_size, attrs);
@@ -78,7 +80,7 @@ int RGWRESTConn::complete_request(RGWRESTStreamWriteRequest *req, string& etag,
   return ret;
 }
 
-int RGWRESTConn::get_obj(const string& uid, req_info *info /* optional */, rgw_obj& obj, bool prepend_metadata,
+int RGWRESTConn::get_obj(const rgw_user& uid, req_info *info /* optional */, rgw_obj& obj, bool prepend_metadata,
                                  RGWGetDataCB *cb, RGWRESTStreamReadRequest **req)
 {
   string url;
@@ -86,8 +88,9 @@ int RGWRESTConn::get_obj(const string& uid, req_info *info /* optional */, rgw_o
   if (ret < 0)
     return ret;
 
+  string uid_str = uid.to_str();
   list<pair<string, string> > params;
-  params.push_back(pair<string, string>(RGW_SYS_PARAM_PREFIX "uid", uid));
+  params.push_back(pair<string, string>(RGW_SYS_PARAM_PREFIX "uid", uid_str));
   params.push_back(pair<string, string>(RGW_SYS_PARAM_PREFIX "region", region));
   if (prepend_metadata) {
     params.push_back(pair<string, string>(RGW_SYS_PARAM_PREFIX "prepend-metadata", region));
diff --git a/src/rgw/rgw_rest_conn.h b/src/rgw/rgw_rest_conn.h
index 209ddcf..d7620da 100644
--- a/src/rgw/rgw_rest_conn.h
+++ b/src/rgw/rgw_rest_conn.h
@@ -23,14 +23,14 @@ public:
   int get_url(string& endpoint);
 
   /* sync request */
-  int forward(const string& uid, req_info& info, obj_version *objv, size_t max_response, bufferlist *inbl, bufferlist *outbl);
+  int forward(const rgw_user& uid, req_info& info, obj_version *objv, size_t max_response, bufferlist *inbl, bufferlist *outbl);
 
   /* async request */
-  int put_obj_init(const string& uid, rgw_obj& obj, uint64_t obj_size,
+  int put_obj_init(const rgw_user& uid, rgw_obj& obj, uint64_t obj_size,
                    map<string, bufferlist>& attrs, RGWRESTStreamWriteRequest **req);
   int complete_request(RGWRESTStreamWriteRequest *req, string& etag, time_t *mtime);
 
-  int get_obj(const string& uid, req_info *info /* optional */, rgw_obj& obj, bool prepend_metadata, RGWGetDataCB *cb, RGWRESTStreamReadRequest **req);
+  int get_obj(const rgw_user& uid, req_info *info /* optional */, rgw_obj& obj, bool prepend_metadata, RGWGetDataCB *cb, RGWRESTStreamReadRequest **req);
   int complete_request(RGWRESTStreamReadRequest *req, string& etag, time_t *mtime, map<string, string>& attrs);
 };
 
diff --git a/src/rgw/rgw_rest_log.cc b/src/rgw/rgw_rest_log.cc
index 7e3707f..2ba2ee7 100644
--- a/src/rgw/rgw_rest_log.cc
+++ b/src/rgw/rgw_rest_log.cc
@@ -269,7 +269,8 @@ void RGWOp_MDLog_Unlock::execute() {
 }
 
 void RGWOp_BILog_List::execute() {
-  string bucket_name = s->info.args.get("bucket"),
+  string tenant_name = s->info.args.get("tenant"),
+         bucket_name = s->info.args.get("bucket"),
          marker = s->info.args.get("marker"),
          max_entries_str = s->info.args.get("max-entries"),
          bucket_instance = s->info.args.get("bucket-instance");
@@ -297,7 +298,7 @@ void RGWOp_BILog_List::execute() {
       return;
     }
   } else { /* !bucket_name.empty() */
-    http_ret = store->get_bucket_info(obj_ctx, bucket_name, bucket_info, NULL, NULL);
+    http_ret = store->get_bucket_info(obj_ctx, tenant_name, bucket_name, bucket_info, NULL, NULL);
     if (http_ret < 0) {
       dout(5) << "could not get bucket info for bucket=" << bucket_name << dendl;
       return;
@@ -364,7 +365,8 @@ void RGWOp_BILog_List::send_response_end() {
 }
       
 void RGWOp_BILog_Info::execute() {
-  string bucket_name = s->info.args.get("bucket"),
+  string tenant_name = s->info.args.get("tenant"),
+         bucket_name = s->info.args.get("bucket"),
          bucket_instance = s->info.args.get("bucket-instance");
   RGWBucketInfo bucket_info;
 
@@ -383,7 +385,7 @@ void RGWOp_BILog_Info::execute() {
       return;
     }
   } else { /* !bucket_name.empty() */
-    http_ret = store->get_bucket_info(obj_ctx, bucket_name, bucket_info, NULL, NULL);
+    http_ret = store->get_bucket_info(obj_ctx, tenant_name, bucket_name, bucket_info, NULL, NULL);
     if (http_ret < 0) {
       dout(5) << "could not get bucket info for bucket=" << bucket_name << dendl;
       return;
@@ -415,7 +417,8 @@ void RGWOp_BILog_Info::send_response() {
 }
 
 void RGWOp_BILog_Delete::execute() {
-  string bucket_name = s->info.args.get("bucket"),
+  string tenant_name = s->info.args.get("tenant"),
+         bucket_name = s->info.args.get("bucket"),
          start_marker = s->info.args.get("start-marker"),
          end_marker = s->info.args.get("end-marker"),
          bucket_instance = s->info.args.get("bucket-instance");
@@ -445,7 +448,7 @@ void RGWOp_BILog_Delete::execute() {
       return;
     }
   } else { /* !bucket_name.empty() */
-    http_ret = store->get_bucket_info(obj_ctx, bucket_name, bucket_info, NULL, NULL);
+    http_ret = store->get_bucket_info(obj_ctx, tenant_name, bucket_name, bucket_info, NULL, NULL);
     if (http_ret < 0) {
       dout(5) << "could not get bucket info for bucket=" << bucket_name << dendl;
       return;
diff --git a/src/rgw/rgw_rest_metadata.cc b/src/rgw/rgw_rest_metadata.cc
index 1068076..f97fdb4 100644
--- a/src/rgw/rgw_rest_metadata.cc
+++ b/src/rgw/rgw_rest_metadata.cc
@@ -32,8 +32,8 @@ static inline void frame_metadata_key(req_state *s, string& out) {
   string key = s->info.args.get("key", &exists);
 
   string section;
-  if (!s->bucket_name_str.empty()) {
-    section = s->bucket_name_str;
+  if (!s->bucket_name.empty()) {
+    section = s->bucket_name;
   } else {
     section = key;
     key.clear();
diff --git a/src/rgw/rgw_rest_s3.cc b/src/rgw/rgw_rest_s3.cc
index 5831869..8c00e19 100644
--- a/src/rgw/rgw_rest_s3.cc
+++ b/src/rgw/rgw_rest_s3.cc
@@ -257,7 +257,9 @@ void RGWListBucket_ObjStore_S3::send_versioned_response()
 {
   s->formatter->open_object_section_in_ns("ListVersionsResult",
 					  "http://s3.amazonaws.com/doc/2006-03-01/");
-  s->formatter->dump_string("Name", s->bucket_name_str);
+  if (!s->bucket_tenant.empty())
+    s->formatter->dump_string("Tenant", s->bucket_tenant);
+  s->formatter->dump_string("Name", s->bucket_name);
   s->formatter->dump_string("Prefix", prefix);
   s->formatter->dump_string("KeyMarker", marker.name);
   if (is_truncated && !next_marker.empty())
@@ -334,7 +336,9 @@ void RGWListBucket_ObjStore_S3::send_response()
 
   s->formatter->open_object_section_in_ns("ListBucketResult",
 					  "http://s3.amazonaws.com/doc/2006-03-01/");
-  s->formatter->dump_string("Name", s->bucket_name_str);
+  if (!s->bucket_tenant.empty())
+    s->formatter->dump_string("Tenant", s->bucket_tenant);
+  s->formatter->dump_string("Name", s->bucket_name);
   s->formatter->dump_string("Prefix", prefix);
   s->formatter->dump_string("Marker", marker.name);
   if (is_truncated && !next_marker.empty())
@@ -1195,8 +1199,9 @@ int RGWPostObj_ObjStore_S3::get_policy()
       user_info.user_id = keystone_validator.response.token.tenant.id;
       user_info.display_name = keystone_validator.response.token.tenant.name;
 
+      rgw_user uid(keystone_validator.response.token.tenant.id);
       /* try to store user if it not already exists */
-      if (rgw_get_user_info_by_uid(store, keystone_validator.response.token.tenant.id, user_info) < 0) {
+      if (rgw_get_user_info_by_uid(store, uid, user_info) < 0) {
         int ret = rgw_store_user_info(store, user_info, NULL, NULL, 0, true);
         if (ret < 0) {
           dout(10) << "NOTICE: failed to store new user's info: ret=" << ret << dendl;
@@ -1346,6 +1351,7 @@ void RGWPostObj_ObjStore_S3::send_response()
 
     part_str("success_action_redirect", &redirect);
 
+    string tenant;
     string bucket;
     string key;
     string etag_str = "\"";
@@ -1355,12 +1361,28 @@ void RGWPostObj_ObjStore_S3::send_response()
 
     string etag_url;
 
-    url_encode(s->bucket_name_str, bucket);
+    url_encode(s->bucket_tenant, tenant); /* surely overkill, but cheap */
+    url_encode(s->bucket_name, bucket);
     url_encode(s->object.name, key);
     url_encode(etag_str, etag_url);
 
-    redirect.append("?bucket=");
-    redirect.append(bucket);
+    if (!s->bucket_tenant.empty()) {
+      /*
+       * What we really would like is to qualify the bucket name, so
+       * that the client could simply copy it and paste into the next request.
+       * Unfortunately, in S3 we cannot know if the client will decide
+       * to come through DNS, with "bucket.tenant" syntax, or through
+       * URL with "tenant\bucket" syntax. Therefore, we provide the
+       * tenant separately.
+       */
+      redirect.append("?tenant=");
+      redirect.append(tenant);
+      redirect.append("&bucket=");
+      redirect.append(bucket);
+    } else {
+      redirect.append("?bucket=");
+      redirect.append(bucket);
+    }
     redirect.append("&key=");
     redirect.append(key);
     redirect.append("&etag=");
@@ -1404,7 +1426,9 @@ done:
     s->formatter->open_object_section("PostResponse");
     if (g_conf->rgw_dns_name.length())
       s->formatter->dump_format("Location", "%s/%s", s->info.script_uri.c_str(), s->object.name.c_str());
-    s->formatter->dump_string("Bucket", s->bucket_name_str);
+    if (!s->bucket_tenant.empty())
+      s->formatter->dump_string("Tenant", s->bucket_tenant);
+    s->formatter->dump_string("Bucket", s->bucket_name);
     s->formatter->dump_string("Key", s->object.name);
     s->formatter->close_section();
   }
@@ -1466,8 +1490,10 @@ int RGWCopyObj_ObjStore_S3::get_params()
   if_match = s->info.env->get("HTTP_X_AMZ_COPY_IF_MATCH");
   if_nomatch = s->info.env->get("HTTP_X_AMZ_COPY_IF_NONE_MATCH");
 
+  src_tenant_name = s->src_tenant_name;
   src_bucket_name = s->src_bucket_name;
   src_object = s->src_object;
+  dest_tenant_name = s->bucket.tenant;
   dest_bucket_name = s->bucket.name;
   dest_object = s->object.name;
 
@@ -1499,6 +1525,7 @@ int RGWCopyObj_ObjStore_S3::get_params()
   }
 
   if (source_zone.empty() &&
+      (dest_tenant_name.compare(src_tenant_name) == 0) &&
       (dest_bucket_name.compare(src_bucket_name) == 0) &&
       (dest_object.compare(src_object.name) == 0) &&
       src_object.instance.empty() &&
@@ -1815,7 +1842,9 @@ void RGWInitMultipart_ObjStore_S3::send_response()
     dump_start(s);
     s->formatter->open_object_section_in_ns("InitiateMultipartUploadResult",
 		  "http://s3.amazonaws.com/doc/2006-03-01/");
-    s->formatter->dump_string("Bucket", s->bucket_name_str);
+    if (!s->bucket_tenant.empty())
+      s->formatter->dump_string("Tenant", s->bucket_tenant);
+    s->formatter->dump_string("Bucket", s->bucket_name);
     s->formatter->dump_string("Key", s->object.name);
     s->formatter->dump_string("UploadId", upload_id);
     s->formatter->close_section();
@@ -1833,9 +1862,22 @@ void RGWCompleteMultipart_ObjStore_S3::send_response()
     dump_start(s);
     s->formatter->open_object_section_in_ns("CompleteMultipartUploadResult",
 			  "http://s3.amazonaws.com/doc/2006-03-01/");
-    if (s->info.domain.length())
-      s->formatter->dump_format("Location", "%s.%s", s->bucket_name_str.c_str(), s->info.domain.c_str());
-    s->formatter->dump_string("Bucket", s->bucket_name_str);
+    if (!s->bucket_tenant.empty()) {
+      if (s->info.domain.length()) {
+        s->formatter->dump_format("Location", "%s.%s.%s",
+          s->bucket_name.c_str(),
+          s->bucket_tenant.c_str(),
+          s->info.domain.c_str());
+      }
+      s->formatter->dump_string("Tenant", s->bucket_tenant);
+    } else {
+      if (s->info.domain.length()) {
+        s->formatter->dump_format("Location", "%s.%s",
+          s->bucket_name.c_str(),
+          s->info.domain.c_str());
+      }
+    }
+    s->formatter->dump_string("Bucket", s->bucket_name);
     s->formatter->dump_string("Key", s->object.name);
     s->formatter->dump_string("ETag", etag);
     s->formatter->close_section();
@@ -1874,7 +1916,9 @@ void RGWListMultipart_ObjStore_S3::send_response()
     if (test_iter != parts.rend()) {
       cur_max = test_iter->first;
     }
-    s->formatter->dump_string("Bucket", s->bucket_name_str);
+    if (!s->bucket_tenant.empty())
+      s->formatter->dump_string("Tenant", s->bucket_tenant);
+    s->formatter->dump_string("Bucket", s->bucket_name);
     s->formatter->dump_string("Key", s->object.name);
     s->formatter->dump_string("UploadId", upload_id);
     s->formatter->dump_string("StorageClass", "STANDARD");
@@ -1922,7 +1966,9 @@ void RGWListBucketMultiparts_ObjStore_S3::send_response()
     return;
 
   s->formatter->open_object_section("ListMultipartUploadsResult");
-  s->formatter->dump_string("Bucket", s->bucket_name_str);
+  if (!s->bucket_tenant.empty())
+    s->formatter->dump_string("Tenant", s->bucket_tenant);
+  s->formatter->dump_string("Bucket", s->bucket_name);
   if (!prefix.empty())
     s->formatter->dump_string("ListMultipartUploadsResult.Prefix", prefix);
   string& key_marker = marker.get_key();
@@ -2160,7 +2206,7 @@ RGWOp *RGWHandler_ObjStore_Obj_S3::op_put()
   if (is_acl_op()) {
     return new RGWPutACLs_ObjStore_S3;
   }
-  if (!s->copy_source)
+  if (s->src_bucket_name.empty())
     return new RGWPutObj_ObjStore_S3;
   else
     return new RGWCopyObj_ObjStore_S3;
@@ -2230,8 +2276,22 @@ int RGWHandler_ObjStore_S3::init_from_header(struct req_state *s, int default_fo
     first = req;
   }
 
-  if (s->bucket_name_str.empty()) {
-    s->bucket_name_str = first;
+  /*
+   * XXX The intent of the check for empty is apparently to let the bucket
+   * name from DNS be set ahead of time. However, we currently take the DNS
+   * bucket and re-insert it into the URL in rgw_rest.cc:RGWREST::preprocess().
+   * So, this check is meaningless.
+   *
+   * Rather than dropping this, the code needs to be changed to put
+   * the bucket (and its tenant) from DNS and the Host: header (HTTP_HOST)
+   * into req_state.bucket_name directly.
+   */
+  if (s->bucket_name.empty()) {
+    rgw_parse_url_bucket(first, s->bucket_tenant, s->bucket_name);
+    if (s->bucket_tenant.empty())
+      s->bucket_tenant = s->user.user_id.tenant;
+
+    ldout(s->cct, 20) << "s->user.user_id=" << s->user.user_id << " s->bucket_tenant=" << s->bucket_tenant << " s->bucket_name=" << s->bucket_name << dendl;
 
     if (pos >= 0) {
       string encoded_obj_str = req.substr(pos+1);
@@ -2303,10 +2363,15 @@ int RGWHandler_ObjStore_S3::validate_bucket_name(const string& bucket, bool rela
 
 int RGWHandler_ObjStore_S3::init(RGWRados *store, struct req_state *s, RGWClientIO *cio)
 {
-  dout(10) << "s->object=" << (!s->object.empty() ? s->object : rgw_obj_key("<NULL>")) << " s->bucket=" << (!s->bucket_name_str.empty() ? s->bucket_name_str : "<NULL>") << dendl;
+  dout(10) << "s->object=" << (!s->object.empty() ? s->object : rgw_obj_key("<NULL>"))
+           << " s->bucket=" << rgw_make_bucket_entry_name(s->bucket_tenant, s->bucket_name) << dendl;
 
+  int ret;
+  ret = validate_tenant_name(s->bucket_tenant);
+  if (ret)
+    return ret;
   bool relaxed_names = s->cct->_conf->rgw_relaxed_s3_bucket_names;
-  int ret = validate_bucket_name(s->bucket_name_str, relaxed_names);
+  ret = validate_bucket_name(s->bucket_name, relaxed_names);
   if (ret)
     return ret;
   ret = validate_object_name(s->object.name);
@@ -2319,13 +2384,17 @@ int RGWHandler_ObjStore_S3::init(RGWRados *store, struct req_state *s, RGWClient
 
   s->has_acl_header = s->info.env->exists_prefix("HTTP_X_AMZ_GRANT");
 
-  s->copy_source = s->info.env->get("HTTP_X_AMZ_COPY_SOURCE");
-  if (s->copy_source) {
-    ret = RGWCopyObj::parse_copy_location(s->copy_source, s->src_bucket_name, s->src_object);
+  const char *copy_source = s->info.env->get("HTTP_X_AMZ_COPY_SOURCE");
+  if (copy_source) {
+    string src_bucket_str;
+    ret = RGWCopyObj::parse_copy_location(copy_source, src_bucket_str, s->src_object);
     if (!ret) {
       ldout(s->cct, 0) << "failed to parse copy location" << dendl;
-      return -EINVAL;
+      return -EINVAL; // XXX why not -ERR_INVALID_BUCKET_NAME or -ERR_BAD_URL?
     }
+    rgw_parse_url_bucket(src_bucket_str, s->src_tenant_name, s->src_bucket_name);
+    if (s->src_tenant_name.empty())
+      s->src_tenant_name = s->user.user_id.tenant;
   }
 
   s->dialect = "s3";
@@ -2487,14 +2556,19 @@ int RGW_Auth_S3::authorize(RGWRados *store, struct req_state *s)
 	s->user.user_id = keystone_validator.response.token.tenant.id;
         s->user.display_name = keystone_validator.response.token.tenant.name; // wow.
 
+        rgw_user uid(keystone_validator.response.token.tenant.id);
         /* try to store user if it not already exists */
-        if (rgw_get_user_info_by_uid(store, keystone_validator.response.token.tenant.id, s->user) < 0) {
+        if (rgw_get_user_info_by_uid(store, uid, s->user) < 0) {
           int ret = rgw_store_user_info(store, s->user, NULL, NULL, 0, true);
           if (ret < 0)
             dout(10) << "NOTICE: failed to store new user's info: ret=" << ret << dendl;
         }
 
         s->perm_mask = RGW_PERM_FULL_CONTROL;
+
+        if (s->bucket_tenant.empty()) {
+          s->bucket_tenant = s->user.user_id.tenant;
+        }
       }
     }
   }
@@ -2512,6 +2586,10 @@ int RGW_Auth_S3::authorize(RGWRados *store, struct req_state *s)
       return -ERR_INVALID_ACCESS_KEY;
     }
 
+    if (s->bucket_tenant.empty()) {
+      s->bucket_tenant = s->user.user_id.tenant;
+    }
+
     /* now verify signature */
 
     string auth_hdr;
@@ -2568,7 +2646,8 @@ int RGW_Auth_S3::authorize(RGWRados *store, struct req_state *s)
       string effective_uid = s->info.args.get(RGW_SYS_PARAM_PREFIX "uid");
       RGWUserInfo effective_user;
       if (!effective_uid.empty()) {
-        ret = rgw_get_user_info_by_uid(store, effective_uid, effective_user);
+        rgw_user euid(effective_uid);
+        ret = rgw_get_user_info_by_uid(store, euid, effective_user);
         if (ret < 0) {
           ldout(s->cct, 0) << "User lookup failed!" << dendl;
           return -ENOENT;
@@ -2602,7 +2681,7 @@ RGWHandler *RGWRESTMgr_S3::get_handler(struct req_state *s)
   if (ret < 0)
     return NULL;
 
-  if (s->bucket_name_str.empty())
+  if (s->bucket_name.empty())
     return new RGWHandler_ObjStore_Service_S3;
 
   if (s->object.empty())
diff --git a/src/rgw/rgw_rest_swift.cc b/src/rgw/rgw_rest_swift.cc
index 35aa146..6fed37c 100644
--- a/src/rgw/rgw_rest_swift.cc
+++ b/src/rgw/rgw_rest_swift.cc
@@ -558,14 +558,47 @@ int RGWPutObj_ObjStore_SWIFT::get_params()
 
   policy.create_default(s->user.user_id, s->user.display_name);
 
-  obj_manifest = s->info.env->get("HTTP_X_OBJECT_MANIFEST");
-
   int r = get_delete_at_param(s, &delete_at);
   if (r < 0) {
     ldout(s->cct, 5) << "ERROR: failed to get Delete-At param" << dendl;
     return r;
   }
 
+  dlo_manifest = s->info.env->get("HTTP_X_OBJECT_MANIFEST");
+  bool exists;
+  string multipart_manifest = s->info.args.get("multipart-manifest", &exists);
+  if (exists) {
+    if (multipart_manifest != "put") {
+      ldout(s->cct, 5) << "invalid multipart-manifest http param: " << multipart_manifest << dendl;
+      return -EINVAL;
+    }
+
+#define MAX_SLO_ENTRY_SIZE (1024 + 128) // 1024 - max obj name, 128 - enough extra for other info
+    uint64_t max_len = s->cct->_conf->rgw_max_slo_entries * MAX_SLO_ENTRY_SIZE;
+    
+    slo_info = new RGWSLOInfo;
+    
+    int r = rgw_rest_get_json_input_keep_data(s->cct, s, slo_info->entries, max_len, &slo_info->raw_data, &slo_info->raw_data_len);
+    if (r < 0) {
+      ldout(s->cct, 5) << "failed to read input for slo r=" << r << dendl;
+      return r;
+    }
+
+    if ((int64_t)slo_info->entries.size() > s->cct->_conf->rgw_max_slo_entries) {
+      ldout(s->cct, 5) << "too many entries in slo request: " << slo_info->entries.size() << dendl;
+      return -EINVAL;
+    }
+
+    uint64_t total_size = 0;
+    for (vector<rgw_slo_entry>::iterator iter = slo_info->entries.begin(); iter != slo_info->entries.end(); ++iter) {
+      total_size += iter->size_bytes;
+      ldout(s->cct, 20) << "slo_part: " << iter->path << " size=" << iter->size_bytes << dendl;
+    }
+    slo_info->total_size = total_size;
+
+    ofs = slo_info->raw_data_len;
+  }
+
   return RGWPutObj_ObjStore::get_params();
 }
 
@@ -679,6 +712,8 @@ int RGWPutMetadataObject_ObjStore_SWIFT::get_params()
   }
 
   placement_rule = s->info.env->get("HTTP_X_STORAGE_POLICY", "");
+  dlo_manifest = s->info.env->get("HTTP_X_OBJECT_MANIFEST");
+
   return 0;
 }
 
@@ -696,16 +731,107 @@ void RGWPutMetadataObject_ObjStore_SWIFT::send_response()
   rgw_flush_formatter_and_reset(s, s->formatter);
 }
 
+static void bulkdelete_respond(const unsigned num_deleted,
+                               const unsigned int num_unfound,
+                               const std::list<RGWBulkDelete::fail_desc_t>& failures,
+                               const int prot_flags,                  /* in  */
+                               ceph::Formatter& formatter)            /* out */
+{
+  /* Write the bulk-delete report (counters, overall status, per-path errors). */
+  formatter.open_object_section("delete");
+  string resp_status;
+  string resp_body;
+
+  if (!failures.empty()) {
+    int reason = ERR_INVALID_REQUEST;
+    for (const auto& fail_desc : failures) {
+      if (-ENOENT != fail_desc.err && -EACCES != fail_desc.err) {
+        reason = fail_desc.err;
+      }
+    }
+    /* reason is now the last error other than ENOENT/EACCES, if there was one */
+    rgw_err err;
+    set_req_state_err(err, reason, prot_flags);
+    dump_errno(err, resp_status);
+  } else if (0 == num_deleted && 0 == num_unfound) {
+    /* 400 Bad Request */
+    dump_errno(400, resp_status);
+    resp_body = "Invalid bulk delete.";
+  } else {
+    /* 200 OK */
+    dump_errno(200, resp_status);
+  }
+
+  formatter.dump_int("Number Deleted", num_deleted);
+  formatter.dump_int("Number Not Found", num_unfound);
+  formatter.dump_string("Response Body", resp_body);
+  formatter.dump_string("Response Status", resp_status);
+  formatter.open_array_section("Errors");
+  for (const auto& fail_desc : failures) {
+    formatter.open_array_section("object");
+    /* the failed path is rendered through its stream operator */
+    stringstream ss_name;
+    ss_name << fail_desc.path;
+    formatter.dump_string("Name", ss_name.str());
+    /* per-entry status, produced the same way as the overall one above */
+    rgw_err err;
+    set_req_state_err(err, fail_desc.err, prot_flags);
+    string status;
+    dump_errno(err, status);
+    formatter.dump_string("Status", status);
+    formatter.close_section();
+  }
+  formatter.close_section(); /* "Errors" */
+
+  formatter.close_section(); /* "delete" */
+}
+
+int RGWDeleteObj_ObjStore_SWIFT::get_params()
+{
+  /* multipart_delete is set only when multipart-manifest is exactly "delete". */
+  const string& manifest_arg = s->info.args.get("multipart-manifest");
+  multipart_delete = (manifest_arg == "delete");
+  return RGWDeleteObj_ObjStore::get_params();
+}
+
 void RGWDeleteObj_ObjStore_SWIFT::send_response()
 {
   int r = ret;
-  if (!r)
+
+  if (multipart_delete) {
+    r = 0;
+  } else if(!r) {
     r = STATUS_NO_CONTENT;
+  }
 
   set_req_state_err(s, r);
   dump_errno(s);
   end_header(s, this);
+
+  if (multipart_delete) {
+    if (deleter) {
+      bulkdelete_respond(deleter->get_num_deleted(),
+                         deleter->get_num_unfound(),
+                         deleter->get_failures(),
+                         s->prot_flags,
+                         *s->formatter);
+    } else if (-ENOENT == ret) {
+      bulkdelete_respond(0, 1, {}, s->prot_flags, *s->formatter);
+    } else {
+      RGWBulkDelete::acct_path_t path;
+      path.bucket_name = s->bucket_name;
+      path.obj_key = s->object;
+
+      RGWBulkDelete::fail_desc_t fail_desc;
+      fail_desc.err = ret;
+      fail_desc.path = path;
+
+      bulkdelete_respond(0, 0, { fail_desc }, s->prot_flags, *s->formatter);
+    }
+  }
+
   rgw_flush_formatter_and_reset(s, s->formatter);
+
 }
 
 static void get_contype_from_attrs(map<string, bufferlist>& attrs,
@@ -766,9 +892,12 @@ int RGWCopyObj_ObjStore_SWIFT::get_params()
   if_match = s->info.env->get("HTTP_COPY_IF_MATCH");
   if_nomatch = s->info.env->get("HTTP_COPY_IF_NONE_MATCH");
 
+  /* XXX why copy this? just use req_state in rgw_op.cc:verify_permission */
+  src_tenant_name = s->src_tenant_name;
   src_bucket_name = s->src_bucket_name;
   src_object = s->src_object;
-  dest_bucket_name = s->bucket_name_str;
+  dest_tenant_name = s->bucket_tenant;
+  dest_bucket_name = s->bucket_name;
   dest_object = s->object.name;
 
   const char * const fresh_meta = s->info.env->get("HTTP_X_FRESH_METADATA");
@@ -819,7 +948,7 @@ void RGWCopyObj_ObjStore_SWIFT::dump_copy_info()
 
   /* Dump X-Copied-From-Account */
   string account_name;
-  url_encode(s->user.user_id, account_name);
+  url_encode(s->user.user_id.id, account_name); // XXX tenant
   s->cio->print("X-Copied-From-Account: %s\r\n", account_name.c_str());
 
   /* Dump X-Copied-From-Last-Modified. */
@@ -882,6 +1011,9 @@ int RGWGetObj_ObjStore_SWIFT::send_response_data(bufferlist& bl, off_t bl_ofs, o
   dump_content_length(s, total_len);
   dump_last_modified(s, lastmod);
   s->cio->print("X-Timestamp: %lld.00000\r\n", (long long)lastmod);
+  if (is_slo) {
+    s->cio->print("X-Static-Large-Object: True\r\n");
+  }
 
   if (!ret) {
     map<string, bufferlist>::iterator iter = attrs.find(RGW_ATTR_ETAG);
@@ -933,6 +1065,62 @@ void RGWOptionsCORS_ObjStore_SWIFT::send_response()
   end_header(s, NULL);
 }
 
+int RGWBulkDelete_ObjStore_SWIFT::get_data(list<RGWBulkDelete::acct_path_t>& items,
+                                           bool * const is_truncated)
+{
+  /* Read up to MAX_CHUNK_ENTRIES newline-separated, URL-encoded paths into items. */
+  const size_t MAX_LINE_SIZE = 2048;
+  RGWClientIOStreamBuf ciosb(*s->cio, (size_t)s->cct->_conf->rgw_max_chunk_size);
+  istream cioin(&ciosb);
+  /* NOTE(review): a line of MAX_LINE_SIZE or more chars makes getline() fail and ends the loop. */
+  char buf[MAX_LINE_SIZE];
+  while (cioin.getline(buf, sizeof(buf))) {
+    string path_str(buf);
+    ldout(s->cct, 20) << "extracted Bulk Delete entry: " << path_str << dendl;
+    /* Swift sends entries as "/container/object": skip the leading slashes
+     * so the container is not mistaken for part of the object name. */
+    const size_t start_pos = path_str.find_first_not_of('/');
+    if (string::npos == start_pos) {
+      continue; /* empty line or slashes only: nothing to delete */
+    }
+    RGWBulkDelete::acct_path_t path;
+    const size_t sep_pos = path_str.find('/', start_pos);
+    if (string::npos == sep_pos) {
+      url_decode(path_str.substr(start_pos), path.bucket_name);
+    } else {
+      string bucket_name;
+      url_decode(path_str.substr(start_pos, sep_pos - start_pos), bucket_name);
+      string obj_name;
+      url_decode(path_str.substr(sep_pos + 1), obj_name);
+      path.bucket_name = bucket_name;
+      path.obj_key = obj_name;
+    }
+    items.push_back(path);
+
+    if (items.size() == MAX_CHUNK_ENTRIES) {
+      *is_truncated = true;
+      return 0;
+    }
+  }
+
+  *is_truncated = false;
+  return 0;
+}
+
+void RGWBulkDelete_ObjStore_SWIFT::send_response()
+{
+  /* Set and dump the request status, end the headers, then emit the report.
+   * NOTE(review): deleter is dereferenced unchecked here, unlike in
+   * RGWDeleteObj_ObjStore_SWIFT::send_response() - confirm it is always set. */
+  set_req_state_err(s, ret);
+  dump_errno(s);
+  end_header(s, NULL);
+
+  bulkdelete_respond(deleter->get_num_deleted(), deleter->get_num_unfound(),
+                     deleter->get_failures(), s->prot_flags, *s->formatter);
+  rgw_flush_formatter_and_reset(s, s->formatter);
+}
+
 RGWOp *RGWHandler_ObjStore_Service_SWIFT::op_get()
 {
   return new RGWListBuckets_ObjStore_SWIFT;
@@ -945,9 +1133,20 @@ RGWOp *RGWHandler_ObjStore_Service_SWIFT::op_head()
 
 RGWOp *RGWHandler_ObjStore_Service_SWIFT::op_post()
 {
+  if (s->info.args.exists("bulk-delete")) {
+    return new RGWBulkDelete_ObjStore_SWIFT;
+  }
   return new RGWPutMetadataAccount_ObjStore_SWIFT;
 }
 
+RGWOp *RGWHandler_ObjStore_Service_SWIFT::op_delete()
+{
+  RGWOp *op = NULL; /* stays NULL unless the "bulk-delete" arg is present */
+  if (s->info.args.exists("bulk-delete"))
+    op = new RGWBulkDelete_ObjStore_SWIFT;
+  return op;
+}
+
 RGWOp *RGWHandler_ObjStore_Bucket_SWIFT::get_obj_op(bool get_data)
 {
   if (is_acl_op()) {
@@ -1197,9 +1396,11 @@ int RGWHandler_ObjStore_SWIFT::init_from_header(struct req_state *s)
   if (first.size() == 0)
     return 0;
 
-  s->bucket_name_str = first;
+  s->info.effective_uri = "/" + first;
 
-  s->info.effective_uri = "/" + s->bucket_name_str;
+  /* XXX Temporarily not parsing URL until Auth puts something in there. */
+  s->bucket_tenant = s->user.user_id.tenant;
+  s->bucket_name = first;
 
   if (req.size()) {
     s->object = rgw_obj_key(req, s->info.env->get("HTTP_X_OBJECT_VERSION_ID", "")); /* rgw swift extension */
@@ -1211,20 +1412,26 @@ int RGWHandler_ObjStore_SWIFT::init_from_header(struct req_state *s)
 
 int RGWHandler_ObjStore_SWIFT::init(RGWRados *store, struct req_state *s, RGWClientIO *cio)
 {
-  dout(10) << "s->object=" << (!s->object.empty() ? s->object : rgw_obj_key("<NULL>")) << " s->bucket=" << (!s->bucket_name_str.empty() ? s->bucket_name_str : "<NULL>") << dendl;
+  dout(10) << "s->object=" << (!s->object.empty() ? s->object : rgw_obj_key("<NULL>"))
+           << " s->bucket=" << rgw_make_bucket_entry_name(s->bucket_tenant, s->bucket_name) << dendl;
 
-  int ret = validate_bucket_name(s->bucket_name_str.c_str());
+  int ret;
+  ret = validate_tenant_name(s->bucket_tenant);
+  if (ret)
+    return ret;
+  ret = validate_bucket_name(s->bucket_name);
   if (ret)
     return ret;
   ret = validate_object_name(s->object.name);
   if (ret)
     return ret;
 
-  s->copy_source = s->info.env->get("HTTP_X_COPY_FROM");
-  if (s->copy_source) {
-    bool result = RGWCopyObj::parse_copy_location(s->copy_source, s->src_bucket_name, s->src_object);
+  const char *copy_source = s->info.env->get("HTTP_X_COPY_FROM");
+  if (copy_source) {
+    bool result = RGWCopyObj::parse_copy_location(copy_source, s->src_bucket_name, s->src_object);
     if (!result)
        return -ERR_BAD_URL;
+    s->src_tenant_name = s->user.user_id.tenant;
   }
 
   s->dialect = "swift";
@@ -1234,23 +1441,30 @@ int RGWHandler_ObjStore_SWIFT::init(RGWRados *store, struct req_state *s, RGWCli
     if (!req_dest)
       return -ERR_BAD_URL;
 
-    string dest_bucket_name;
+    string dest_tenant_name, dest_bucket_name;
     rgw_obj_key dest_obj_key;
     bool result = RGWCopyObj::parse_copy_location(req_dest, dest_bucket_name, dest_obj_key);
     if (!result)
        return -ERR_BAD_URL;
+    dest_tenant_name = s->user.user_id.tenant;
 
     string dest_object = dest_obj_key.name;
-    if (dest_bucket_name != s->bucket_name_str) {
-      ret = validate_bucket_name(dest_bucket_name.c_str());
+    if (dest_bucket_name != s->bucket_name) {
+      ret = validate_bucket_name(dest_bucket_name);
       if (ret < 0)
         return ret;
     }
 
+    ret = validate_tenant_name(dest_tenant_name);
+    if (ret < 0)
+      return ret;
+
     /* convert COPY operation into PUT */
-    s->src_bucket_name = s->bucket_name_str;
+    s->src_tenant_name = s->bucket_tenant;
+    s->src_bucket_name = s->bucket_name;
     s->src_object = s->object;
-    s->bucket_name_str = dest_bucket_name;
+    s->bucket_tenant = dest_tenant_name;
+    s->bucket_name = dest_bucket_name;
     s->object = rgw_obj_key(dest_object);
     s->op = OP_PUT;
   }
@@ -1265,7 +1479,7 @@ RGWHandler *RGWRESTMgr_SWIFT::get_handler(struct req_state *s)
   if (ret < 0)
     return NULL;
 
-  if (s->bucket_name_str.empty())
+  if (s->bucket_name.empty())
     return new RGWHandler_ObjStore_Service_SWIFT;
   if (s->object.empty())
     return new RGWHandler_ObjStore_Bucket_SWIFT;
diff --git a/src/rgw/rgw_rest_swift.h b/src/rgw/rgw_rest_swift.h
index 66d8c81..a64c996 100644
--- a/src/rgw/rgw_rest_swift.h
+++ b/src/rgw/rgw_rest_swift.h
@@ -125,6 +125,7 @@ public:
   RGWDeleteObj_ObjStore_SWIFT() {}
   ~RGWDeleteObj_ObjStore_SWIFT() {}
 
+  int get_params();
   bool need_object_expiration() { return true; }
   void send_response();
 };
@@ -167,6 +168,16 @@ public:
   void send_response();
 };
 
+class RGWBulkDelete_ObjStore_SWIFT : public RGWBulkDelete_ObjStore {
+public:
+  RGWBulkDelete_ObjStore_SWIFT() {}
+  ~RGWBulkDelete_ObjStore_SWIFT() {}
+
+  int get_data(std::list<RGWBulkDelete::acct_path_t>& items,
+               bool * is_truncated);
+  void send_response();
+};
+
 class RGWHandler_ObjStore_SWIFT : public RGWHandler_ObjStore {
   friend class RGWRESTMgr_SWIFT;
 protected:
@@ -193,6 +204,7 @@ protected:
   RGWOp *op_get();
   RGWOp *op_head();
   RGWOp *op_post();
+  RGWOp *op_delete();
 public:
   RGWHandler_ObjStore_Service_SWIFT() {}
   virtual ~RGWHandler_ObjStore_Service_SWIFT() {}
diff --git a/src/rgw/rgw_rest_usage.cc b/src/rgw/rgw_rest_usage.cc
index 8472297..6073429 100644
--- a/src/rgw/rgw_rest_usage.cc
+++ b/src/rgw/rgw_rest_usage.cc
@@ -25,12 +25,14 @@ public:
 void RGWOp_Usage_Get::execute() {
   map<std::string, bool> categories;
 
-  string uid;
+  string uid_str;
   uint64_t start, end;
   bool show_entries;
   bool show_summary;
 
-  RESTArgs::get_string(s, "uid", uid, &uid);
+  RESTArgs::get_string(s, "uid", uid_str, &uid_str);
+  rgw_user uid(uid_str);
+
   RESTArgs::get_epoch(s, "start", 0, &start);
   RESTArgs::get_epoch(s, "end", (uint64_t)-1, &end);
   RESTArgs::get_bool(s, "show-entries", true, &show_entries);
@@ -65,10 +67,12 @@ public:
 };
 
 void RGWOp_Usage_Delete::execute() {
-  string uid;
+  string uid_str;
   uint64_t start, end;
 
-  RESTArgs::get_string(s, "uid", uid, &uid);
+  RESTArgs::get_string(s, "uid", uid_str, &uid_str);
+  rgw_user uid(uid_str);
+
   RESTArgs::get_epoch(s, "start", 0, &start);
   RESTArgs::get_epoch(s, "end", (uint64_t)-1, &end);
 
diff --git a/src/rgw/rgw_rest_user.cc b/src/rgw/rgw_rest_user.cc
index 6cd2591..6086e76 100644
--- a/src/rgw/rgw_rest_user.cc
+++ b/src/rgw/rgw_rest_user.cc
@@ -29,10 +29,12 @@ void RGWOp_User_Info::execute()
 {
   RGWUserAdminOpState op_state;
 
-  std::string uid;
+  std::string uid_str;
   bool fetch_stats;
 
-  RESTArgs::get_string(s, "uid", uid, &uid);
+  RESTArgs::get_string(s, "uid", uid_str, &uid_str);
+  rgw_user uid(uid_str);
+
   RESTArgs::get_bool(s, "stats", false, &fetch_stats);
 
   op_state.set_user_id(uid);
@@ -57,7 +59,7 @@ public:
 
 void RGWOp_User_Create::execute()
 {
-  std::string uid;
+  std::string uid_str;
   std::string display_name;
   std::string email;
   std::string access_key;
@@ -75,7 +77,9 @@ void RGWOp_User_Create::execute()
 
   RGWUserAdminOpState op_state;
 
-  RESTArgs::get_string(s, "uid", uid, &uid);
+  RESTArgs::get_string(s, "uid", uid_str, &uid_str);
+  rgw_user uid(uid_str);
+
   RESTArgs::get_string(s, "display-name", display_name, &display_name);
   RESTArgs::get_string(s, "email", email, &email);
   RESTArgs::get_string(s, "access-key", access_key, &access_key);
@@ -157,7 +161,7 @@ public:
 
 void RGWOp_User_Modify::execute()
 {
-  std::string uid;
+  std::string uid_str;
   std::string display_name;
   std::string email;
   std::string access_key;
@@ -173,7 +177,9 @@ void RGWOp_User_Modify::execute()
 
   RGWUserAdminOpState op_state;
 
-  RESTArgs::get_string(s, "uid", uid, &uid);
+  RESTArgs::get_string(s, "uid", uid_str, &uid_str);
+  rgw_user uid(uid_str);
+
   RESTArgs::get_string(s, "display-name", display_name, &display_name);
   RESTArgs::get_string(s, "email", email, &email);
   RESTArgs::get_string(s, "access-key", access_key, &access_key);
@@ -251,12 +257,14 @@ public:
 
 void RGWOp_User_Remove::execute()
 {
-  std::string uid;
+  std::string uid_str;
   bool purge_data;
 
   RGWUserAdminOpState op_state;
 
-  RESTArgs::get_string(s, "uid", uid, &uid);
+  RESTArgs::get_string(s, "uid", uid_str, &uid_str);
+  rgw_user uid(uid_str);
+
   RESTArgs::get_bool(s, "purge-data", false, &purge_data);
 
   // FIXME: no double checking
@@ -284,7 +292,7 @@ public:
 
 void RGWOp_Subuser_Create::execute()
 {
-  std::string uid;
+  std::string uid_str;
   std::string subuser;
   std::string secret_key;
   std::string perm_str;
@@ -298,7 +306,9 @@ void RGWOp_Subuser_Create::execute()
 
   RGWUserAdminOpState op_state;
 
-  RESTArgs::get_string(s, "uid", uid, &uid);
+  RESTArgs::get_string(s, "uid", uid_str, &uid_str);
+  rgw_user uid(uid_str);
+
   RESTArgs::get_string(s, "subuser", subuser, &subuser);
   RESTArgs::get_string(s, "secret-key", secret_key, &secret_key);
   RESTArgs::get_string(s, "access", perm_str, &perm_str);
@@ -351,7 +361,7 @@ public:
 
 void RGWOp_Subuser_Modify::execute()
 {
-  std::string uid;
+  std::string uid_str;
   std::string subuser;
   std::string secret_key;
   std::string key_type_str;
@@ -364,7 +374,9 @@ void RGWOp_Subuser_Modify::execute()
 
   bool gen_secret;
 
-  RESTArgs::get_string(s, "uid", uid, &uid);
+  RESTArgs::get_string(s, "uid", uid_str, &uid_str);
+  rgw_user uid(uid_str);
+
   RESTArgs::get_string(s, "subuser", subuser, &subuser);
   RESTArgs::get_string(s, "secret-key", secret_key, &secret_key);
   RESTArgs::get_string(s, "access", perm_str, &perm_str);
@@ -414,13 +426,15 @@ public:
 
 void RGWOp_Subuser_Remove::execute()
 {
-  std::string uid;
+  std::string uid_str;
   std::string subuser;
   bool purge_keys;
 
   RGWUserAdminOpState op_state;
 
-  RESTArgs::get_string(s, "uid", uid, &uid);
+  RESTArgs::get_string(s, "uid", uid_str, &uid_str);
+  rgw_user uid(uid_str);
+
   RESTArgs::get_string(s, "subuser", subuser, &subuser);
   RESTArgs::get_bool(s, "purge-keys", true, &purge_keys);
 
@@ -453,7 +467,7 @@ public:
 
 void RGWOp_Key_Create::execute()
 {
-  std::string uid;
+  std::string uid_str;
   std::string subuser;
   std::string access_key;
   std::string secret_key;
@@ -463,7 +477,9 @@ void RGWOp_Key_Create::execute()
 
   RGWUserAdminOpState op_state;
 
-  RESTArgs::get_string(s, "uid", uid, &uid);
+  RESTArgs::get_string(s, "uid", uid_str, &uid_str);
+  rgw_user uid(uid_str);
+
   RESTArgs::get_string(s, "subuser", subuser, &subuser);
   RESTArgs::get_string(s, "access-key", access_key, &access_key);
   RESTArgs::get_string(s, "secret-key", secret_key, &secret_key);
@@ -515,14 +531,16 @@ public:
 
 void RGWOp_Key_Remove::execute()
 {
-  std::string uid;
+  std::string uid_str;
   std::string subuser;
   std::string access_key;
   std::string key_type_str;
 
   RGWUserAdminOpState op_state;
 
-  RESTArgs::get_string(s, "uid", uid, &uid);
+  RESTArgs::get_string(s, "uid", uid_str, &uid_str);
+  rgw_user uid(uid_str);
+
   RESTArgs::get_string(s, "subuser", subuser, &subuser);
   RESTArgs::get_string(s, "access-key", access_key, &access_key);
   RESTArgs::get_string(s, "key-type", key_type_str, &key_type_str);
@@ -566,12 +584,14 @@ public:
 
 void RGWOp_Caps_Add::execute()
 {
-  std::string uid;
+  std::string uid_str;
   std::string caps;
 
   RGWUserAdminOpState op_state;
 
-  RESTArgs::get_string(s, "uid", uid, &uid);
+  RESTArgs::get_string(s, "uid", uid_str, &uid_str);
+  rgw_user uid(uid_str);
+
   RESTArgs::get_string(s, "user-caps", caps, &caps);
 
   // FIXME: no double checking
@@ -600,12 +620,14 @@ public:
 
 void RGWOp_Caps_Remove::execute()
 {
-  std::string uid;
+  std::string uid_str;
   std::string caps;
 
   RGWUserAdminOpState op_state;
 
-  RESTArgs::get_string(s, "uid", uid, &uid);
+  RESTArgs::get_string(s, "uid", uid_str, &uid_str);
+  rgw_user uid(uid_str);
+
   RESTArgs::get_string(s, "user-caps", caps, &caps);
 
   // FIXME: no double checking
@@ -656,17 +678,19 @@ void RGWOp_Quota_Info::execute()
 {
   RGWUserAdminOpState op_state;
 
-  std::string uid;
+  std::string uid_str;
   std::string quota_type;
 
-  RESTArgs::get_string(s, "uid", uid, &uid);
+  RESTArgs::get_string(s, "uid", uid_str, &uid_str);
   RESTArgs::get_string(s, "quota-type", quota_type, &quota_type);
 
-  if (uid.empty()) {
+  if (uid_str.empty()) {
     http_ret = -EINVAL;
     return;
   }
 
+  rgw_user uid(uid_str);
+
   bool show_all = quota_type.empty();
   bool show_bucket = show_all || (quota_type == "bucket");
   bool show_user = show_all || (quota_type == "user");
@@ -768,17 +792,19 @@ void RGWOp_Quota_Set::execute()
 {
   RGWUserAdminOpState op_state;
 
-  std::string uid;
+  std::string uid_str;
   std::string quota_type;
 
-  RESTArgs::get_string(s, "uid", uid, &uid);
+  RESTArgs::get_string(s, "uid", uid_str, &uid_str);
   RESTArgs::get_string(s, "quota-type", quota_type, &quota_type);
 
-  if (uid.empty()) {
+  if (uid_str.empty()) {
     http_ret = -EINVAL;
     return;
   }
 
+  rgw_user uid(uid_str);
+
   bool set_all = quota_type.empty();
   bool set_bucket = set_all || (quota_type == "bucket");
   bool set_user = set_all || (quota_type == "user");
diff --git a/src/rgw/rgw_swift.cc b/src/rgw/rgw_swift.cc
index 0a8d373..8d73602 100644
--- a/src/rgw/rgw_swift.cc
+++ b/src/rgw/rgw_swift.cc
@@ -542,7 +542,7 @@ int RGWSwift::validate_keystone_token(RGWRados *store, const string& token, stru
 int authenticate_temp_url(RGWRados *store, req_state *s)
 {
   /* temp url requires bucket and object specified in the requets */
-  if (s->bucket_name_str.empty())
+  if (s->bucket_name.empty())
     return -EPERM;
 
   if (s->object.empty())
@@ -559,7 +559,9 @@ int authenticate_temp_url(RGWRados *store, req_state *s)
   /* need to get user info of bucket owner */
   RGWBucketInfo bucket_info;
 
-  int ret = store->get_bucket_info(*static_cast<RGWObjectCtx *>(s->obj_ctx), s->bucket_name_str, bucket_info, NULL);
+  int ret = store->get_bucket_info(*static_cast<RGWObjectCtx *>(s->obj_ctx),
+                                   s->bucket_tenant, s->bucket_name,
+                                   bucket_info, NULL);
   if (ret < 0)
     return -EPERM;
 
@@ -638,8 +640,8 @@ bool RGWSwift::verify_swift_token(RGWRados *store, req_state *s)
     s->perm_mask = 0;
     map<string, RGWSubUser>::iterator iter = s->user.subusers.find(subuser);
     if (iter != s->user.subusers.end()) {
-      RGWSubUser& subuser = iter->second;
-      s->perm_mask = subuser.perm_mask;
+      RGWSubUser& subuser_ = iter->second;
+      s->perm_mask = subuser_.perm_mask;
     }
   } else {
     s->perm_mask = RGW_PERM_FULL_CONTROL;
@@ -684,7 +686,7 @@ bool RGWSwift::do_verify_swift_token(RGWRados *store, req_state *s)
     return false;
   }
 
-  s->swift_user = info.user;
+  s->swift_user = info.user.to_str();
   s->swift_groups = info.auth_groups;
 
   string swift_user = s->swift_user;
diff --git a/src/rgw/rgw_swift.h b/src/rgw/rgw_swift.h
index efc8d71..63596e0 100644
--- a/src/rgw/rgw_swift.h
+++ b/src/rgw/rgw_swift.h
@@ -14,7 +14,7 @@ class KeystoneToken;
 struct rgw_swift_auth_info {
   int status;
   string auth_groups;
-  string user;
+  rgw_user user;
   string display_name;
   long long ttl;
 
diff --git a/src/rgw/rgw_usage.cc b/src/rgw/rgw_usage.cc
index 486d96a..f8495c5 100644
--- a/src/rgw/rgw_usage.cc
+++ b/src/rgw/rgw_usage.cc
@@ -30,7 +30,7 @@ static void dump_usage_categories_info(Formatter *formatter, const rgw_usage_log
   formatter->close_section(); // categories
 }
 
-int RGWUsage::show(RGWRados *store, string& uid, uint64_t start_epoch,
+int RGWUsage::show(RGWRados *store, rgw_user& uid, uint64_t start_epoch,
 		   uint64_t end_epoch, bool show_log_entries, bool show_log_sum,
 		   map<string, bool> *categories,
 		   RGWFormatterFlusher& flusher)
@@ -135,7 +135,7 @@ int RGWUsage::show(RGWRados *store, string& uid, uint64_t start_epoch,
   return 0;
 }
 
-int RGWUsage::trim(RGWRados *store, string& uid, uint64_t start_epoch,
+int RGWUsage::trim(RGWRados *store, rgw_user& uid, uint64_t start_epoch,
 		   uint64_t end_epoch)
 {
   return store->trim_usage(uid, start_epoch, end_epoch);
diff --git a/src/rgw/rgw_usage.h b/src/rgw/rgw_usage.h
index 89dfdf7..e2de1a9 100644
--- a/src/rgw/rgw_usage.h
+++ b/src/rgw/rgw_usage.h
@@ -16,12 +16,12 @@ class RGWRados;
 class RGWUsage
 {
 public:
-  static int show(RGWRados *store, std::string& uid, uint64_t start_epoch,
+  static int show(RGWRados *store, rgw_user& uid, uint64_t start_epoch,
 	          uint64_t end_epoch, bool show_log_entries, bool show_log_sum,
 		  std::map<std::string, bool> *categories,
 	          RGWFormatterFlusher& flusher);
 
-  static int trim(RGWRados *store, std::string& uid, uint64_t start_epoch,
+  static int trim(RGWRados *store, rgw_user& uid, uint64_t start_epoch,
 	          uint64_t end_epoch);
 };
 
diff --git a/src/rgw/rgw_user.cc b/src/rgw/rgw_user.cc
index 9a115a2..5063cd0 100644
--- a/src/rgw/rgw_user.cc
+++ b/src/rgw/rgw_user.cc
@@ -42,10 +42,10 @@ void rgw_get_anon_user(RGWUserInfo& info)
 
 bool rgw_user_is_authenticated(RGWUserInfo& info)
 {
-  return (info.user_id != RGW_USER_ANON_ID);
+  return (info.user_id.id != RGW_USER_ANON_ID);
 }
 
-int rgw_user_sync_all_stats(RGWRados *store, const string& user_id)
+int rgw_user_sync_all_stats(RGWRados *store, const rgw_user& user_id)
 {
   CephContext *cct = store->ctx();
   size_t max_entries = cct->_conf->rgw_list_buckets_max_chunk;
@@ -148,6 +148,8 @@ int rgw_store_user_info(RGWRados *store,
 
   RGWUID ui;
   ui.user_id = info.user_id;
+  // P3
+  ldout(store->ctx(), 0) << "DEBUG: rgw_store_user_info: user_id " << ui.user_id << dendl;
 
   bufferlist link_bl;
   ::encode(ui, link_bl);
@@ -156,7 +158,10 @@ int rgw_store_user_info(RGWRados *store,
   ::encode(ui, data_bl);
   ::encode(info, data_bl);
 
-  ret = store->meta_mgr->put_entry(user_meta_handler, info.user_id, data_bl, exclusive, &ot, mtime, pattrs);
+  string key;
+  info.user_id.to_str(key);
+
+  ret = store->meta_mgr->put_entry(user_meta_handler, key, data_bl, exclusive, &ot, mtime, pattrs);
   if (ret < 0)
     return ret;
 
@@ -275,7 +280,7 @@ int rgw_get_user_info_from_index(RGWRados *store, string& key, rgw_bucket& bucke
  * returns: 0 on success, -ERR# on failure (including nonexistence)
  */
 int rgw_get_user_info_by_uid(RGWRados *store,
-                             string& uid,
+                             rgw_user& uid,
                              RGWUserInfo& info,
                              RGWObjVersionTracker *objv_tracker,
                              time_t *pmtime,
@@ -286,7 +291,8 @@ int rgw_get_user_info_by_uid(RGWRados *store,
   RGWUID user_id;
 
   RGWObjectCtx obj_ctx(store);
-  int ret = rgw_get_system_obj(store, obj_ctx, store->zone.user_uid_pool, uid, bl, objv_tracker, pmtime, pattrs, cache_info);
+  string oid = uid.to_str();
+  int ret = rgw_get_system_obj(store, obj_ctx, store->zone.user_uid_pool, oid, bl, objv_tracker, pmtime, pattrs, cache_info);
   if (ret < 0) {
     return ret;
   }
@@ -340,12 +346,12 @@ extern int rgw_get_user_info_by_access_key(RGWRados *store, string& access_key,
 }
 
 int rgw_get_user_attrs_by_uid(RGWRados *store,
-                              const string& user_id,
+                              const rgw_user& user_id,
                               map<string, bufferlist>& attrs,
                               RGWObjVersionTracker *objv_tracker)
 {
   RGWObjectCtx obj_ctx(store);
-  rgw_obj obj(store->zone.user_uid_pool, user_id);
+  rgw_obj obj(store->zone.user_uid_pool, user_id.to_str());
   RGWRados::SystemObject src(store, obj_ctx, obj);
   RGWRados::SystemObject::Read rop(&src);
 
@@ -360,7 +366,7 @@ int rgw_remove_key_index(RGWRados *store, RGWAccessKey& access_key)
   return ret;
 }
 
-int rgw_remove_uid_index(RGWRados *store, string& uid)
+int rgw_remove_uid_index(RGWRados *store, rgw_user& uid)
 {
   RGWObjVersionTracker objv_tracker;
   RGWUserInfo info;
@@ -368,7 +374,8 @@ int rgw_remove_uid_index(RGWRados *store, string& uid)
   if (ret < 0)
     return ret;
 
-  ret = store->meta_mgr->remove_entry(user_meta_handler, uid, &objv_tracker);
+  string oid = uid.to_str();
+  ret = store->meta_mgr->remove_entry(user_meta_handler, oid, &objv_tracker);
   if (ret < 0)
     return ret;
 
@@ -462,10 +469,13 @@ int rgw_delete_user(RGWRados *store, RGWUserInfo& info, RGWObjVersionTracker& ob
     ldout(store->ctx(), 0) << "ERROR: could not remove " << info.user_id << ":" << uid_bucks << ", should be fixed (err=" << ret << ")" << dendl;
     return ret;
   }
+
+  string key;
+  info.user_id.to_str(key);
   
-  rgw_obj uid_obj(store->zone.user_uid_pool, info.user_id);
+  rgw_obj uid_obj(store->zone.user_uid_pool, key);
   ldout(store->ctx(), 10) << "removing user index: " << info.user_id << dendl;
-  ret = store->meta_mgr->remove_entry(user_meta_handler, info.user_id, &objv_tracker);
+  ret = store->meta_mgr->remove_entry(user_meta_handler, key, &objv_tracker);
   if (ret < 0 && ret != -ENOENT && ret  != -ECANCELED) {
     ldout(store->ctx(), 0) << "ERROR: could not remove " << info.user_id << ":" << uid_obj << ", should be fixed (err=" << ret << ")" << dendl;
     return ret;
@@ -571,10 +581,15 @@ static bool remove_old_indexes(RGWRados *store,
   int ret;
   bool success = true;
 
-  if (!old_info.user_id.empty() && old_info.user_id.compare(new_info.user_id) != 0) {
+  if (!old_info.user_id.empty() &&
+      old_info.user_id.compare(new_info.user_id) != 0) {
+    if (old_info.user_id.tenant != new_info.user_id.tenant) {
+      ldout(store->ctx(), 0) << "ERROR: tenant mismatch: " << old_info.user_id.tenant << " != " << new_info.user_id.tenant << dendl;
+      return false;
+    }
     ret = rgw_remove_uid_index(store, old_info.user_id);
     if (ret < 0 && ret != -ENOENT) {
-      set_err_msg(err_msg, "ERROR: could not remove index for uid " + old_info.user_id);
+      set_err_msg(err_msg, "ERROR: could not remove index for uid " + old_info.user_id.to_str());
       success = false;
     }
   }
@@ -619,7 +634,9 @@ static void dump_subusers_info(Formatter *f, RGWUserInfo &info)
   for (uiter = info.subusers.begin(); uiter != info.subusers.end(); ++uiter) {
     RGWSubUser& u = uiter->second;
     f->open_object_section("user");
-    f->dump_format("id", "%s:%s", info.user_id.c_str(), u.name.c_str());
+    string s;
+    info.user_id.to_str(s);
+    f->dump_format("id", "%s:%s", s.c_str(), u.name.c_str());
     char buf[256];
     rgw_perm_to_str(u.perm_mask, buf, sizeof(buf));
     f->dump_string("permissions", buf);
@@ -637,7 +654,9 @@ static void dump_access_keys_info(Formatter *f, RGWUserInfo &info)
     const char *sep = (k.subuser.empty() ? "" : ":");
     const char *subuser = (k.subuser.empty() ? "" : k.subuser.c_str());
     f->open_object_section("key");
-    f->dump_format("user", "%s%s%s", info.user_id.c_str(), sep, subuser);
+    string s;
+    info.user_id.to_str(s);
+    f->dump_format("user", "%s%s%s", s.c_str(), sep, subuser);
     f->dump_string("access_key", k.id);
     f->dump_string("secret_key", k.key);
     f->close_section();
@@ -654,7 +673,9 @@ static void dump_swift_keys_info(Formatter *f, RGWUserInfo &info)
     const char *sep = (k.subuser.empty() ? "" : ":");
     const char *subuser = (k.subuser.empty() ? "" : k.subuser.c_str());
     f->open_object_section("key");
-    f->dump_format("user", "%s%s%s", info.user_id.c_str(), sep, subuser);
+    string s;
+    info.user_id.to_str(s);
+    f->dump_format("user", "%s%s%s", s.c_str(), sep, subuser);
     f->dump_string("secret_key", k.key);
     f->close_section();
   }
@@ -666,7 +687,8 @@ static void dump_user_info(Formatter *f, RGWUserInfo &info,
 {
   f->open_object_section("user_info");
 
-  f->dump_string("user_id", info.user_id);
+  f->dump_string("tenant", info.user_id.tenant);
+  f->dump_string("user_id", info.user_id.id);
   f->dump_string("display_name", info.display_name);
   f->dump_string("email", info.user_email);
   f->dump_int("suspended", (int)info.suspended);
@@ -713,7 +735,7 @@ int RGWAccessKeyPool::init(RGWUserAdminOpState& op_state)
     return -EINVAL;
   }
 
-  std::string uid = op_state.get_user_id();
+  rgw_user& uid = op_state.get_user_id();
   if (uid.compare(RGW_USER_ANON_ID) == 0) {
     keys_allowed = false;
     return -EACCES;
@@ -1197,7 +1219,7 @@ int RGWSubUserPool::init(RGWUserAdminOpState& op_state)
     return -EINVAL;
   }
 
-  std::string uid = op_state.get_user_id();
+  rgw_user& uid = op_state.get_user_id();
   if (uid.compare(RGW_USER_ANON_ID) == 0) {
     subusers_allowed = false;
     return -EACCES;
@@ -1490,8 +1512,8 @@ int RGWUserCapPool::init(RGWUserAdminOpState& op_state)
     return -EINVAL;
   }
 
-  std::string uid = op_state.get_user_id();
-  if (uid == RGW_USER_ANON_ID) {
+  rgw_user& uid = op_state.get_user_id();
+  if (uid.compare(RGW_USER_ANON_ID) == 0) {
     caps_allowed = false;
     return -EACCES;
   }
@@ -1642,7 +1664,7 @@ int RGWUser::init(RGWUserAdminOpState& op_state)
 {
   bool found = false;
   std::string swift_user;
-  std::string uid = op_state.get_user_id();
+  rgw_user& uid = op_state.get_user_id();
   std::string user_email = op_state.get_user_email();
   std::string access_key = op_state.get_access_key();
   std::string subuser = op_state.get_subuser();
@@ -1763,7 +1785,7 @@ int RGWUser::check_op(RGWUserAdminOpState& op_state, std::string *err_msg)
   bool same_id;
   bool populated;
   //bool existing_email = false; // this check causes a fault
-  std::string op_id = op_state.get_user_id();
+  rgw_user& op_id = op_state.get_user_id();
   std::string op_email = op_state.get_user_email();
 
   RGWUserInfo user_info;
@@ -1777,8 +1799,8 @@ int RGWUser::check_op(RGWUserAdminOpState& op_state, std::string *err_msg)
   }
 
   if (populated && !same_id) {
-    set_err_msg(err_msg, "user id mismatch, operation id: " + op_id\
-            + " does not match: " + user_id);
+    set_err_msg(err_msg, "user id mismatch, operation id: " + op_id.to_str()
+            + " does not match: " + user_id.to_str());
 
     return -EINVAL;
   }
@@ -1800,7 +1822,7 @@ int RGWUser::execute_add(RGWUserAdminOpState& op_state, std::string *err_msg)
 
   RGWUserInfo user_info;
 
-  std::string uid = op_state.get_user_id();
+  rgw_user& uid = op_state.get_user_id();
   std::string user_email = op_state.get_user_email();
   std::string display_name = op_state.get_display_name();
 
@@ -1817,7 +1839,7 @@ int RGWUser::execute_add(RGWUserAdminOpState& op_state, std::string *err_msg)
     } else if (op_state.found_by_key) {
       set_err_msg(err_msg, "duplicate key provided");
     } else {
-      set_err_msg(err_msg, "user: " + op_state.user_id + " exists");
+      set_err_msg(err_msg, "user: " + op_state.user_id.to_str() + " exists");
     }
     return -EEXIST;
   }
@@ -1931,7 +1953,7 @@ int RGWUser::execute_remove(RGWUserAdminOpState& op_state, std::string *err_msg)
   int ret;
 
   bool purge_data = op_state.will_purge_data();
-  std::string uid = op_state.get_user_id();
+  rgw_user& uid = op_state.get_user_id();
   RGWUserInfo user_info = op_state.get_user_info();
 
   if (!op_state.has_existing_user()) {
@@ -1959,7 +1981,7 @@ int RGWUser::execute_remove(RGWUserAdminOpState& op_state, std::string *err_msg)
 
     std::map<std::string, RGWBucketEnt>::iterator it;
     for (it = m.begin(); it != m.end(); ++it) {
-      ret = rgw_remove_bucket(store, uid, ((*it).second).bucket, true);
+      ret = rgw_remove_bucket(store, ((*it).second).bucket, true);
       if (ret < 0) {
         set_err_msg(err_msg, "unable to delete user data");
         return ret;
@@ -2030,7 +2052,7 @@ int RGWUser::execute_modify(RGWUserAdminOpState& op_state, std::string *err_msg)
   }
 
   // ensure that we can modify the user's attributes
-  if (user_id == RGW_USER_ANON_ID) {
+  if (user_id.compare(RGW_USER_ANON_ID) == 0) {
     set_err_msg(err_msg, "unable to modify anonymous user's info");
     return -EACCES;
   }
@@ -2042,7 +2064,7 @@ int RGWUser::execute_modify(RGWUserAdminOpState& op_state, std::string *err_msg)
     // make sure we are not adding a duplicate email
     if (old_email.compare(op_email) != 0) {
       ret = rgw_get_user_info_by_email(store, op_email, duplicate_check);
-      if (ret >= 0 && duplicate_check.user_id != user_id) {
+      if (ret >= 0 && duplicate_check.user_id.compare(user_id) != 0) {
         set_err_msg(err_msg, "cannot add duplicate email");
         return -EEXIST;
       }
@@ -2096,7 +2118,7 @@ int RGWUser::execute_modify(RGWUserAdminOpState& op_state, std::string *err_msg)
     do {
       ret = rgw_read_user_buckets(store, user_id, buckets, marker, max_buckets, false);
       if (ret < 0) {
-        set_err_msg(err_msg, "could not get buckets for uid:  " + user_id);
+        set_err_msg(err_msg, "could not get buckets for uid:  " + user_id.to_str());
         return ret;
       }
 
@@ -2512,7 +2534,9 @@ public:
     RGWObjVersionTracker objv_tracker;
     time_t mtime;
 
-    int ret = rgw_get_user_info_by_uid(store, entry, uci.info, &objv_tracker,
+    rgw_user uid(entry);
+
+    int ret = rgw_get_user_info_by_uid(store, uid, uci.info, &objv_tracker,
                                        &mtime, NULL, &uci.attrs);
     if (ret < 0) {
       return ret;
@@ -2535,9 +2559,11 @@ public:
       pattrs = &uci.attrs;
     }
 
+    rgw_user uid(entry);
+
     RGWUserInfo old_info;
     time_t orig_mtime;
-    int ret = rgw_get_user_info_by_uid(store, entry, old_info, &objv_tracker, &orig_mtime);
+    int ret = rgw_get_user_info_by_uid(store, uid, old_info, &objv_tracker, &orig_mtime);
     if (ret < 0 && ret != -ENOENT)
       return ret;
 
@@ -2563,7 +2589,10 @@ public:
 
   int remove(RGWRados *store, string& entry, RGWObjVersionTracker& objv_tracker) {
     RGWUserInfo info;
-    int ret = rgw_get_user_info_by_uid(store, entry, info, &objv_tracker);
+
+    rgw_user uid(entry);
+
+    int ret = rgw_get_user_info_by_uid(store, uid, info, &objv_tracker);
     if (ret < 0)
       return ret;
 
diff --git a/src/rgw/rgw_user.h b/src/rgw/rgw_user.h
index 06890fc..6e877fe 100644
--- a/src/rgw/rgw_user.h
+++ b/src/rgw/rgw_user.h
@@ -31,17 +31,21 @@ using namespace std;
  */
 struct RGWUID
 {
-  string user_id;
+  rgw_user user_id;
   void encode(bufferlist& bl) const {
-    ::encode(user_id, bl);
+    string s;
+    user_id.to_str(s);
+    ::encode(s, bl);
   }
   void decode(bufferlist::iterator& bl) {
-    ::decode(user_id, bl);
+    string s;
+    ::decode(s, bl);
+    user_id.from_str(s);
   }
 };
 WRITE_CLASS_ENCODER(RGWUID)
 
-extern int rgw_user_sync_all_stats(RGWRados *store, const string& user_id);
+extern int rgw_user_sync_all_stats(RGWRados *store, const rgw_user& user_id);
 /**
  * Get the anonymous (ie, unauthenticated) user info.
  */
@@ -78,7 +82,7 @@ extern int rgw_store_user_attrs(RGWRados *store,
  * returns: 0 on success, -ERR# on failure (including nonexistence)
  */
 extern int rgw_get_user_info_by_uid(RGWRados *store,
-                                    string& user_id,
+                                    rgw_user& user_id,
                                     RGWUserInfo& info,
                                     RGWObjVersionTracker *objv_tracker = NULL,
                                     time_t *pmtime                     = NULL,
@@ -108,7 +112,7 @@ extern int rgw_get_user_info_by_access_key(RGWRados *store, string& access_key,
  * Returns: 0 on success, -ERR# on failure.
  */
 extern int rgw_get_user_attrs_by_uid(RGWRados *store,
-                                     const string& user_id,
+                                     const rgw_user& user_id,
                                      map<string, bufferlist>& attrs,
                                      RGWObjVersionTracker *objv_tracker = NULL);
 /**
@@ -123,7 +127,7 @@ extern int rgw_delete_user(RGWRados *store, RGWUserInfo& user, RGWObjVersionTrac
  * remove the different indexes
  */
 extern int rgw_remove_key_index(RGWRados *store, RGWAccessKey& access_key);
-extern int rgw_remove_uid_index(RGWRados *store, string& uid);
+extern int rgw_remove_uid_index(RGWRados *store, rgw_user& uid);
 extern int rgw_remove_email_index(RGWRados *store, string& email);
 extern int rgw_remove_swift_name_index(RGWRados *store, string& swift_name);
 
@@ -156,7 +160,7 @@ enum RGWUserId {
 struct RGWUserAdminOpState {
   // user attributes
   RGWUserInfo info;
-  std::string user_id;
+  rgw_user user_id;
   std::string user_email;
   std::string display_name;
   uint32_t max_buckets;
@@ -238,7 +242,7 @@ struct RGWUserAdminOpState {
     gen_secret = false;
     key_op = true;
   }
-  void set_user_id(std::string& id) {
+  void set_user_id(rgw_user& id) {
     if (id.empty())
       return;
 
@@ -265,7 +269,7 @@ struct RGWUserAdminOpState {
     size_t pos = _subuser.find(":");
 
     if (pos != string::npos) {
-      user_id = _subuser.substr(0, pos);
+      user_id.id = _subuser.substr(0, pos);
       subuser = _subuser.substr(pos+1);
     } else {
       subuser = _subuser;
@@ -389,7 +393,7 @@ struct RGWUserAdminOpState {
   RGWQuotaInfo& get_bucket_quota() { return bucket_quota; }
   RGWQuotaInfo& get_user_quota() { return user_quota; }
 
-  std::string get_user_id() { return user_id; }
+  rgw_user& get_user_id() { return user_id; }
   std::string get_subuser() { return subuser; }
   std::string get_access_key() { return id; }
   std::string get_secret_key() { return key; }
@@ -410,7 +414,8 @@ struct RGWUserAdminOpState {
     if (user_id.empty() || subuser.empty())
       return "";
 
-    std::string kid = user_id;
+    std::string kid;
+    user_id.to_str(kid);
     kid.append(":");
     kid.append(subuser);
 
@@ -421,7 +426,8 @@ struct RGWUserAdminOpState {
     if (user_id.empty())
       return "";
 
-    std::string generated_subuser = user_id;
+    std::string generated_subuser;
+    user_id.to_str(generated_subuser);
     std::string rand_suffix;
 
     int sub_buf_size = RAND_SUBUSER_LEN + 1;
@@ -440,7 +446,7 @@ struct RGWUserAdminOpState {
     return generated_subuser;
   }
 
-  RGWUserAdminOpState() : user_id(RGW_USER_ANON_ID), user_email(""), display_name(""), id(""), key ("")
+  RGWUserAdminOpState() : user_id(RGW_USER_ANON_ID)
   {
     max_buckets = RGW_DEFAULT_MAX_BUCKETS;
     key_type = -1;
@@ -495,7 +501,7 @@ class RGWAccessKeyPool
   RGWUser *user;
 
   std::map<std::string, int, ltstr_nocase> key_type_map;
-  std::string user_id;
+  rgw_user user_id;
   RGWRados *store;
 
   map<std::string, RGWAccessKey> *swift_keys;
@@ -537,7 +543,7 @@ class RGWSubUserPool
 {
   RGWUser *user;
 
-  string user_id;
+  rgw_user user_id;
   RGWRados *store;
   bool subusers_allowed;
 
@@ -599,7 +605,7 @@ private:
   RGWUserInfo old_info;
   RGWRados *store;
 
-  string user_id;
+  rgw_user user_id;
   bool info_stored;
 
   void set_populated() { info_stored = true; }
diff --git a/src/test/Makefile-client.am b/src/test/Makefile-client.am
index 8a9ae6e..d44a977 100644
--- a/src/test/Makefile-client.am
+++ b/src/test/Makefile-client.am
@@ -352,7 +352,23 @@ noinst_LTLIBRARIES += librbd_test.la
 
 unittest_librbd_SOURCES = \
         test/librbd/test_main.cc \
-	test/librbd/test_mock_fixture.cc
+	test/librbd/test_mock_fixture.cc \
+	test/librbd/test_mock_ExclusiveLock.cc \
+	test/librbd/exclusive_lock/test_mock_AcquireRequest.cc \
+	test/librbd/exclusive_lock/test_mock_ReleaseRequest.cc \
+	test/librbd/object_map/test_mock_InvalidateRequest.cc \
+	test/librbd/object_map/test_mock_LockRequest.cc \
+	test/librbd/object_map/test_mock_RefreshRequest.cc \
+	test/librbd/object_map/test_mock_ResizeRequest.cc \
+	test/librbd/object_map/test_mock_SnapshotCreateRequest.cc \
+	test/librbd/object_map/test_mock_SnapshotRemoveRequest.cc \
+	test/librbd/object_map/test_mock_SnapshotRollbackRequest.cc \
+	test/librbd/object_map/test_mock_UnlockRequest.cc \
+	test/librbd/object_map/test_mock_UpdateRequest.cc \
+	test/librbd/operation/test_mock_SnapshotCreateRequest.cc \
+	test/librbd/operation/test_mock_SnapshotProtectRequest.cc \
+	test/librbd/operation/test_mock_SnapshotRemoveRequest.cc \
+	test/librbd/operation/test_mock_SnapshotUnprotectRequest.cc
 unittest_librbd_CXXFLAGS = $(UNITTEST_CXXFLAGS) -DTEST_LIBRBD_INTERNALS
 unittest_librbd_LDADD = \
 	librbd_test.la librbd_api.la librbd_internal.la $(LIBRBD_TYPES) \
@@ -388,10 +404,15 @@ noinst_HEADERS += \
 	test/librbd/test_fixture.h \
 	test/librbd/test_mock_fixture.h \
 	test/librbd/test_support.h \
+	test/librbd/mock/MockAioImageRequestWQ.h \
 	test/librbd/mock/MockContextWQ.h \
+	test/librbd/mock/MockExclusiveLock.h \
 	test/librbd/mock/MockImageCtx.h \
 	test/librbd/mock/MockImageWatcher.h \
-	test/librbd/mock/MockObjectMap.h
+	test/librbd/mock/MockJournal.h \
+	test/librbd/mock/MockObjectMap.h \
+	test/librbd/mock/MockReadahead.h \
+	test/librbd/object_map/mock/MockInvalidateRequest.h
 
 if LINUX
 ceph_test_librbd_fsx_SOURCES = test/librbd/fsx.cc
diff --git a/src/test/Makefile.am b/src/test/Makefile.am
index 3d63535..ab66504 100644
--- a/src/test/Makefile.am
+++ b/src/test/Makefile.am
@@ -433,9 +433,11 @@ bin_DEBUGPROGRAMS += ceph_test_objectcacher_stress
 ceph_test_cfuse_cache_invalidate_SOURCES = test/test_cfuse_cache_invalidate.cc
 bin_DEBUGPROGRAMS += ceph_test_cfuse_cache_invalidate
 
+if LINUX
 ceph_test_get_blkdev_size_SOURCES = test/test_get_blkdev_size.cc
 ceph_test_get_blkdev_size_LDADD = $(LIBCOMMON)
 bin_DEBUGPROGRAMS += ceph_test_get_blkdev_size
+endif
 
 noinst_HEADERS += \
 	test/bench/backend.h \
diff --git a/src/test/bench/bencher.cc b/src/test/bench/bencher.cc
index c178146..aebe729a 100644
--- a/src/test/bench/bencher.cc
+++ b/src/test/bench/bencher.cc
@@ -195,7 +195,8 @@ void Bencher::run_bench()
       default: {
 	assert(0);
       }
-    }
+    } 
+    ops++;
   }
   drain_ops();
 }
diff --git a/src/test/bench/bencher.h b/src/test/bench/bencher.h
index a4f6321..28d7433 100644
--- a/src/test/bench/bencher.h
+++ b/src/test/bench/bencher.h
@@ -135,7 +135,7 @@ public:
     boost::tuple<string, uint64_t, uint64_t, Bencher::OpType> ret =
       boost::make_tuple(*object_pos, cur_pos, length, (*op_dist)());
     cur_pos += length;
-    if (cur_pos > size) {
+    if (cur_pos >= size) {
       cur_pos = 0;
       ++object_pos;
     }
diff --git a/src/test/bufferlist.cc b/src/test/bufferlist.cc
index b05a15a..d2df058 100644
--- a/src/test/bufferlist.cc
+++ b/src/test/bufferlist.cc
@@ -44,6 +44,8 @@ static char cmd[128];
 TEST(Buffer, constructors) {
   bool ceph_buffer_track = get_env_bool("CEPH_BUFFER_TRACK");
   unsigned len = 17;
+  uint64_t history_alloc_bytes = 0;
+  uint64_t history_alloc_num = 0;
   //
   // buffer::create
   //
@@ -51,9 +53,14 @@ TEST(Buffer, constructors) {
     EXPECT_EQ(0, buffer::get_total_alloc());
   {
     bufferptr ptr(buffer::create(len));
+    history_alloc_bytes += len;
+    history_alloc_num++;
     EXPECT_EQ(len, ptr.length());
-    if (ceph_buffer_track)
+    if (ceph_buffer_track) {
       EXPECT_EQ(len, (unsigned)buffer::get_total_alloc());
+      EXPECT_EQ(history_alloc_bytes, buffer::get_history_alloc_bytes());
+      EXPECT_EQ(history_alloc_num, buffer::get_history_alloc_num());
+    }
   }
   //
   // buffer::claim_char
@@ -64,11 +71,16 @@ TEST(Buffer, constructors) {
     char* str = new char[len];
     ::memset(str, 'X', len);
     bufferptr ptr(buffer::claim_char(len, str));
-    if (ceph_buffer_track)
+    if (ceph_buffer_track) {
       EXPECT_EQ(len, (unsigned)buffer::get_total_alloc());
+      EXPECT_EQ(history_alloc_bytes, buffer::get_history_alloc_bytes());
+      EXPECT_EQ(history_alloc_num, buffer::get_history_alloc_num());
+    }
     EXPECT_EQ(len, ptr.length());
     EXPECT_EQ(str, ptr.c_str());
     bufferptr clone = ptr.clone();
+    history_alloc_bytes += len;
+    history_alloc_num++;
     EXPECT_EQ(0, ::memcmp(clone.c_str(), ptr.c_str(), len));
   }
   //
@@ -79,8 +91,11 @@ TEST(Buffer, constructors) {
   {
     char* str = new char[len];
     bufferptr ptr(buffer::create_static(len, str));
-    if (ceph_buffer_track)
+    if (ceph_buffer_track) {
       EXPECT_EQ(0, buffer::get_total_alloc());
+      EXPECT_EQ(history_alloc_bytes, buffer::get_history_alloc_bytes());
+      EXPECT_EQ(history_alloc_num, buffer::get_history_alloc_num());
+    }
     EXPECT_EQ(len, ptr.length());
     EXPECT_EQ(str, ptr.c_str());
     delete [] str;
@@ -92,8 +107,13 @@ TEST(Buffer, constructors) {
     EXPECT_EQ(0, buffer::get_total_alloc());
   {
     bufferptr ptr(buffer::create_malloc(len));
-    if (ceph_buffer_track)
+    history_alloc_bytes += len;
+    history_alloc_num++;
+    if (ceph_buffer_track) {
       EXPECT_EQ(len, (unsigned)buffer::get_total_alloc());
+      EXPECT_EQ(history_alloc_bytes, buffer::get_history_alloc_bytes());
+      EXPECT_EQ(history_alloc_num, buffer::get_history_alloc_num());
+    }
     EXPECT_EQ(len, ptr.length());
     // this doesn't throw on my x86_64 wheezy box --sage
     //EXPECT_THROW(buffer::create_malloc((unsigned)ULLONG_MAX), buffer::bad_alloc);
@@ -107,11 +127,16 @@ TEST(Buffer, constructors) {
     char* str = (char*)malloc(len);
     ::memset(str, 'X', len);
     bufferptr ptr(buffer::claim_malloc(len, str));
-    if (ceph_buffer_track)
+    if (ceph_buffer_track) {
       EXPECT_EQ(len, (unsigned)buffer::get_total_alloc());
+      EXPECT_EQ(history_alloc_bytes, buffer::get_history_alloc_bytes());
+      EXPECT_EQ(history_alloc_num, buffer::get_history_alloc_num());
+    }
     EXPECT_EQ(len, ptr.length());
     EXPECT_EQ(str, ptr.c_str());
     bufferptr clone = ptr.clone();
+    history_alloc_bytes += len;
+    history_alloc_num++;
     EXPECT_EQ(0, ::memcmp(clone.c_str(), ptr.c_str(), len));
   }
   //
@@ -122,8 +147,13 @@ TEST(Buffer, constructors) {
   {
     const std::string expected(len, 'X');
     bufferptr ptr(buffer::copy(expected.c_str(), expected.size()));
-    if (ceph_buffer_track)
+    history_alloc_bytes += len;
+    history_alloc_num++;
+    if (ceph_buffer_track) {
       EXPECT_EQ(len, (unsigned)buffer::get_total_alloc());
+      EXPECT_EQ(history_alloc_bytes, buffer::get_history_alloc_bytes());
+      EXPECT_EQ(history_alloc_num, buffer::get_history_alloc_num());
+    }
     EXPECT_NE(expected.c_str(), ptr.c_str());
     EXPECT_EQ(0, ::memcmp(expected.c_str(), ptr.c_str(), len));
   }
@@ -134,16 +164,27 @@ TEST(Buffer, constructors) {
     EXPECT_EQ(0, buffer::get_total_alloc());
   {
     bufferptr ptr(buffer::create_page_aligned(len));
+    history_alloc_bytes += len;
+    history_alloc_num++;
     ::memset(ptr.c_str(), 'X', len);
-    if (ceph_buffer_track)
+    if (ceph_buffer_track) {
       EXPECT_EQ(len, (unsigned)buffer::get_total_alloc());
+      EXPECT_EQ(history_alloc_bytes, buffer::get_history_alloc_bytes());
+      EXPECT_EQ(history_alloc_num, buffer::get_history_alloc_num());
+    }
     // doesn't throw on my x86_64 wheezy box --sage
     //EXPECT_THROW(buffer::create_page_aligned((unsigned)ULLONG_MAX), buffer::bad_alloc);
 #ifndef DARWIN
     ASSERT_TRUE(ptr.is_page_aligned());
 #endif // DARWIN 
     bufferptr clone = ptr.clone();
+    history_alloc_bytes += len;
+    history_alloc_num++;
     EXPECT_EQ(0, ::memcmp(clone.c_str(), ptr.c_str(), len));
+    if (ceph_buffer_track) {
+      EXPECT_EQ(history_alloc_bytes, buffer::get_history_alloc_bytes());
+      EXPECT_EQ(history_alloc_num, buffer::get_history_alloc_num());
+    }
   }
 #ifdef CEPH_HAVE_SPLICE
   if (ceph_buffer_track)
@@ -151,6 +192,8 @@ TEST(Buffer, constructors) {
   {
     // no fd
     EXPECT_THROW(buffer::create_zero_copy(len, -1, NULL), buffer::error_code);
+    history_alloc_bytes += len;
+    history_alloc_num++;
 
     unsigned zc_len = 4;
     ::unlink(FILENAME);
@@ -158,9 +201,14 @@ TEST(Buffer, constructors) {
     EXPECT_EQ(0, ::system(cmd));
     int fd = ::open(FILENAME, O_RDONLY);
     bufferptr ptr(buffer::create_zero_copy(zc_len, fd, NULL));
+    history_alloc_bytes += zc_len;
+    history_alloc_num++;
     EXPECT_EQ(zc_len, ptr.length());
-    if (ceph_buffer_track)
+    if (ceph_buffer_track) {
       EXPECT_EQ(zc_len, (unsigned)buffer::get_total_alloc());
+      EXPECT_EQ(history_alloc_bytes, buffer::get_history_alloc_bytes());
+      EXPECT_EQ(history_alloc_num, buffer::get_history_alloc_num());
+    }
     ::close(fd);
     ::unlink(FILENAME);
   }
diff --git a/src/test/centos-6/ceph.spec.in b/src/test/centos-6/ceph.spec.in
index 2939fef..52c5c1d 100644
--- a/src/test/centos-6/ceph.spec.in
+++ b/src/test/centos-6/ceph.spec.in
@@ -43,6 +43,7 @@ restorecon -R /var/log/ceph > /dev/null 2>&1;
 # /var/run/ceph.
 %if 0%{?fedora} || 0%{?rhel} >= 7 || 0%{?suse_version} >= 1210
 %global _with_systemd 1
+%{!?tmpfiles_create: %global tmpfiles_create systemd-tmpfiles --create}
 %endif
 
 # LTTng-UST enabled on Fedora, RHEL 6, and SLES 12
@@ -106,6 +107,11 @@ BuildRequires:	boost-devel
 BuildRequires:  cmake
 BuildRequires:	cryptsetup
 BuildRequires:	fuse-devel
+%if 0%{?suse_version}
+BuildRequires:	python-Cython
+%else
+BuildRequires:	Cython
+%endif
 BuildRequires:	gdbm
 BuildRequires:	hdparm
 BuildRequires:	leveldb-devel > 1.2
@@ -121,6 +127,7 @@ BuildRequires:	parted
 BuildRequires:	perl
 BuildRequires:	pkgconfig
 BuildRequires:	python
+BuildRequires:	python-devel
 BuildRequires:	python-nose
 BuildRequires:	python-requests
 BuildRequires:	python-virtualenv
@@ -138,6 +145,7 @@ BuildRequires:	yasm
 %if 0%{?_with_systemd}
 BuildRequires:  pkgconfig(systemd)
 BuildRequires:	systemd-rpm-macros
+BuildRequires:	systemd
 %{?systemd_requires}
 %endif
 PreReq:		%fillup_prereq
@@ -253,6 +261,15 @@ Requires:	librbd1 = %{epoch}:%{version}-%{release}
 %description -n rbd-fuse
 FUSE based client to map Ceph rbd images to files
 
+%package -n rbd-nbd
+Summary:	Ceph RBD client base on NBD
+Group:		System Environment/Base
+Requires:	%{name}
+Requires:	librados2 = %{epoch}:%{version}-%{release}
+Requires:	librbd1 = %{epoch}:%{version}-%{release}
+%description -n rbd-nbd
+NBD based client to map Ceph rbd images to local device
+
 %package radosgw
 Summary:	Rados REST gateway
 Group:		Development/Libraries
@@ -628,6 +645,10 @@ install -m 0644 -D etc/sysconfig/ceph $RPM_BUILD_ROOT%{_localstatedir}/adm/fillu
   install -m 0644 -D systemd/ceph-mds@.service $RPM_BUILD_ROOT%{_unitdir}/ceph-mds@.service
   install -m 0644 -D systemd/ceph-radosgw@.service $RPM_BUILD_ROOT%{_unitdir}/ceph-radosgw@.service
   install -m 0644 -D systemd/ceph.target $RPM_BUILD_ROOT%{_unitdir}/ceph.target
+  install -m 0644 -D systemd/ceph-osd.target $RPM_BUILD_ROOT%{_unitdir}/ceph-osd.target
+  install -m 0644 -D systemd/ceph-mon.target $RPM_BUILD_ROOT%{_unitdir}/ceph-mon.target
+  install -m 0644 -D systemd/ceph-mds.target $RPM_BUILD_ROOT%{_unitdir}/ceph-mds.target
+  install -m 0644 -D systemd/ceph-radosgw.target $RPM_BUILD_ROOT%{_unitdir}/ceph-radosgw.target
   install -m 0644 -D systemd/ceph-disk@.service $RPM_BUILD_ROOT%{_unitdir}/ceph-disk@.service
   install -m 0755 -D systemd/ceph $RPM_BUILD_ROOT%{_sbindir}/rcceph
 %else
@@ -786,6 +807,10 @@ rm -rf $RPM_BUILD_ROOT
 %{_unitdir}/ceph-radosgw@.service
 %{_unitdir}/ceph-disk@.service
 %{_unitdir}/ceph.target
+%{_unitdir}/ceph-osd.target
+%{_unitdir}/ceph-mon.target
+%{_unitdir}/ceph-mds.target
+%{_unitdir}/ceph-radosgw.target
 %else
 %{_initrddir}/ceph
 %endif
@@ -939,7 +964,7 @@ exit 0
 
 %post -n ceph-common
 %if 0%{?_with_systemd}
-systemd-tmpfiles --create --prefix=/run/ceph
+%tmpfiles_create %{_tmpfilesdir}/ceph-common.conf
 %endif
 
 %postun -n ceph-common
@@ -967,6 +992,12 @@ fi
 %{_mandir}/man8/rbd-fuse.8*
 
 #################################################################################
+%files -n rbd-nbd
+%defattr(-,root,root,-)
+%{_bindir}/rbd-nbd
+%{_mandir}/man8/rbd-nbd.8*
+
+#################################################################################
 %files radosgw
 %defattr(-,root,root,-)
 %{_bindir}/radosgw
@@ -1057,6 +1088,7 @@ fi
 %{_includedir}/rados/librados.h
 %{_includedir}/rados/librados.hpp
 %{_includedir}/rados/buffer.h
+%{_includedir}/rados/buffer_fwd.h
 %{_includedir}/rados/page.h
 %{_includedir}/rados/crc32c.h
 %{_includedir}/rados/rados_types.h
@@ -1122,7 +1154,8 @@ ln -sf %{_libdir}/librbd.so.1 /usr/lib64/qemu/librbd.so.1
 #################################################################################
 %files -n python-rbd
 %defattr(-,root,root,-)
-%{python_sitelib}/rbd.py*
+%{python_sitearch}/rbd.so
+%{python_sitearch}/rbd-*.egg-info
 
 #################################################################################
 %files -n libcephfs1
diff --git a/src/test/centos-7/ceph.spec.in b/src/test/centos-7/ceph.spec.in
index 2939fef..52c5c1d 100644
--- a/src/test/centos-7/ceph.spec.in
+++ b/src/test/centos-7/ceph.spec.in
@@ -43,6 +43,7 @@ restorecon -R /var/log/ceph > /dev/null 2>&1;
 # /var/run/ceph.
 %if 0%{?fedora} || 0%{?rhel} >= 7 || 0%{?suse_version} >= 1210
 %global _with_systemd 1
+%{!?tmpfiles_create: %global tmpfiles_create systemd-tmpfiles --create}
 %endif
 
 # LTTng-UST enabled on Fedora, RHEL 6, and SLES 12
@@ -106,6 +107,11 @@ BuildRequires:	boost-devel
 BuildRequires:  cmake
 BuildRequires:	cryptsetup
 BuildRequires:	fuse-devel
+%if 0%{?suse_version}
+BuildRequires:	python-Cython
+%else
+BuildRequires:	Cython
+%endif
 BuildRequires:	gdbm
 BuildRequires:	hdparm
 BuildRequires:	leveldb-devel > 1.2
@@ -121,6 +127,7 @@ BuildRequires:	parted
 BuildRequires:	perl
 BuildRequires:	pkgconfig
 BuildRequires:	python
+BuildRequires:	python-devel
 BuildRequires:	python-nose
 BuildRequires:	python-requests
 BuildRequires:	python-virtualenv
@@ -138,6 +145,7 @@ BuildRequires:	yasm
 %if 0%{?_with_systemd}
 BuildRequires:  pkgconfig(systemd)
 BuildRequires:	systemd-rpm-macros
+BuildRequires:	systemd
 %{?systemd_requires}
 %endif
 PreReq:		%fillup_prereq
@@ -253,6 +261,15 @@ Requires:	librbd1 = %{epoch}:%{version}-%{release}
 %description -n rbd-fuse
 FUSE based client to map Ceph rbd images to files
 
+%package -n rbd-nbd
+Summary:	Ceph RBD client base on NBD
+Group:		System Environment/Base
+Requires:	%{name}
+Requires:	librados2 = %{epoch}:%{version}-%{release}
+Requires:	librbd1 = %{epoch}:%{version}-%{release}
+%description -n rbd-nbd
+NBD based client to map Ceph rbd images to local device
+
 %package radosgw
 Summary:	Rados REST gateway
 Group:		Development/Libraries
@@ -628,6 +645,10 @@ install -m 0644 -D etc/sysconfig/ceph $RPM_BUILD_ROOT%{_localstatedir}/adm/fillu
   install -m 0644 -D systemd/ceph-mds@.service $RPM_BUILD_ROOT%{_unitdir}/ceph-mds@.service
   install -m 0644 -D systemd/ceph-radosgw@.service $RPM_BUILD_ROOT%{_unitdir}/ceph-radosgw@.service
   install -m 0644 -D systemd/ceph.target $RPM_BUILD_ROOT%{_unitdir}/ceph.target
+  install -m 0644 -D systemd/ceph-osd.target $RPM_BUILD_ROOT%{_unitdir}/ceph-osd.target
+  install -m 0644 -D systemd/ceph-mon.target $RPM_BUILD_ROOT%{_unitdir}/ceph-mon.target
+  install -m 0644 -D systemd/ceph-mds.target $RPM_BUILD_ROOT%{_unitdir}/ceph-mds.target
+  install -m 0644 -D systemd/ceph-radosgw.target $RPM_BUILD_ROOT%{_unitdir}/ceph-radosgw.target
   install -m 0644 -D systemd/ceph-disk@.service $RPM_BUILD_ROOT%{_unitdir}/ceph-disk@.service
   install -m 0755 -D systemd/ceph $RPM_BUILD_ROOT%{_sbindir}/rcceph
 %else
@@ -786,6 +807,10 @@ rm -rf $RPM_BUILD_ROOT
 %{_unitdir}/ceph-radosgw@.service
 %{_unitdir}/ceph-disk@.service
 %{_unitdir}/ceph.target
+%{_unitdir}/ceph-osd.target
+%{_unitdir}/ceph-mon.target
+%{_unitdir}/ceph-mds.target
+%{_unitdir}/ceph-radosgw.target
 %else
 %{_initrddir}/ceph
 %endif
@@ -939,7 +964,7 @@ exit 0
 
 %post -n ceph-common
 %if 0%{?_with_systemd}
-systemd-tmpfiles --create --prefix=/run/ceph
+%tmpfiles_create %{_tmpfilesdir}/ceph-common.conf
 %endif
 
 %postun -n ceph-common
@@ -967,6 +992,12 @@ fi
 %{_mandir}/man8/rbd-fuse.8*
 
 #################################################################################
+%files -n rbd-nbd
+%defattr(-,root,root,-)
+%{_bindir}/rbd-nbd
+%{_mandir}/man8/rbd-nbd.8*
+
+#################################################################################
 %files radosgw
 %defattr(-,root,root,-)
 %{_bindir}/radosgw
@@ -1057,6 +1088,7 @@ fi
 %{_includedir}/rados/librados.h
 %{_includedir}/rados/librados.hpp
 %{_includedir}/rados/buffer.h
+%{_includedir}/rados/buffer_fwd.h
 %{_includedir}/rados/page.h
 %{_includedir}/rados/crc32c.h
 %{_includedir}/rados/rados_types.h
@@ -1122,7 +1154,8 @@ ln -sf %{_libdir}/librbd.so.1 /usr/lib64/qemu/librbd.so.1
 #################################################################################
 %files -n python-rbd
 %defattr(-,root,root,-)
-%{python_sitelib}/rbd.py*
+%{python_sitearch}/rbd.so
+%{python_sitearch}/rbd-*.egg-info
 
 #################################################################################
 %files -n libcephfs1
diff --git a/src/test/cli/rbd/help.t b/src/test/cli/rbd/help.t
index 7d6dde2..1654ff4 100644
--- a/src/test/cli/rbd/help.t
+++ b/src/test/cli/rbd/help.t
@@ -30,12 +30,28 @@
       import-diff                 Import an incremental diff.
       info                        Show information about image size, striping,
                                   etc.
+      journal export              Export image journal.
+      journal import              Import image journal.
+      journal info                Show information about image journal.
+      journal inspect             Inspect image journal for structural errors.
+      journal reset               Reset image journal.
+      journal status              Show status of image journal.
       list (ls)                   List rbd images.
       lock add                    Take a lock on an image.
       lock list (lock ls)         Show locks held on an image.
       lock remove (lock rm)       Release a lock on an image.
       map                         Map image to a block device using the kernel.
       merge-diff                  Merge two diff exports together.
+      mirror pool disable         Disable RBD mirroring by default within a pool.
+      mirror pool enable          Enable RBD mirroring by default within a pool.
+      mirror pool info            Show information about the pool mirroring
+                                  configuration.
+      mirror pool peer add        Add a mirroring peer to a pool.
+      mirror pool peer remove     Remove a mirroring peer from a pool.
+      mirror pool peer set        Update mirroring peer settings.
+      nbd list (nbd ls)           List the nbd devices already used.
+      nbd map                     Map image to a nbd device.
+      nbd unmap                   Unmap a nbd device.
       object-map rebuild          Rebuild an invalid object map.
       remove (rm)                 Delete an image.
       rename (mv)                 Rename image within pool.
@@ -107,31 +123,41 @@
   rbd help clone
   usage: rbd clone [--pool <pool>] [--image <image>] [--snap <snap>] 
                    [--dest-pool <dest-pool>] [--dest <dest>] [--order <order>] 
+                   [--object-size <object-size>] 
                    [--image-feature <image-feature>] [--image-shared] 
                    [--stripe-unit <stripe-unit>] [--stripe-count <stripe-count>] 
+                   [--journal-splay-width <journal-splay-width>] 
+                   [--journal-object-size <journal-object-size>] 
+                   [--journal-pool <journal-pool>] 
                    <source-snap-spec> <dest-image-spec> 
   
   Clone a snapshot into a COW child image.
   
   Positional arguments
-    <source-snap-spec>   source snapshot specification
-                         (example: [<pool-name>/]<image-name>@<snapshot-name>)
-    <dest-image-spec>    destination image specification
-                         (example: [<pool-name>/]<image-name>)
+    <source-snap-spec>        source snapshot specification
+                              (example:
+                              [<pool-name>/]<image-name>@<snapshot-name>)
+    <dest-image-spec>         destination image specification
+                              (example: [<pool-name>/]<image-name>)
   
   Optional arguments
-    -p [ --pool ] arg    source pool name
-    --image arg          source image name
-    --snap arg           source snapshot name
-    --dest-pool arg      destination pool name
-    --dest arg           destination image name
-    --order arg          object order [12 <= order <= 25]
-    --image-feature arg  image features
-                         [layering(+), striping(+), exclusive-lock(*),
-                         object-map(*), fast-diff(*), deep-flatten, journaling(*)]
-    --image-shared       shared image
-    --stripe-unit arg    stripe unit
-    --stripe-count arg   stripe count
+    -p [ --pool ] arg         source pool name
+    --image arg               source image name
+    --snap arg                source snapshot name
+    --dest-pool arg           destination pool name
+    --dest arg                destination image name
+    --order arg               object order [12 <= order <= 25]
+    --object-size arg         object size in B/K/M [4K <= object size <= 32M]
+    --image-feature arg       image features
+                              [layering(+), striping(+), exclusive-lock(*),
+                              object-map(*), fast-diff(*), deep-flatten,
+                              journaling(*)]
+    --image-shared            shared image
+    --stripe-unit arg         stripe unit
+    --stripe-count arg        stripe count
+    --journal-splay-width arg number of active journal objects
+    --journal-object-size arg size of journal objects
+    --journal-pool arg        pool for journal objects
   
   Image Features:
     (*) supports enabling/disabling on existing images
@@ -139,7 +165,13 @@
   
   rbd help copy
   usage: rbd copy [--pool <pool>] [--image <image>] [--snap <snap>] 
-                  [--dest-pool <dest-pool>] [--dest <dest>] [--no-progress] 
+                  [--dest-pool <dest-pool>] [--dest <dest>] [--order <order>] 
+                  [--object-size <object-size>] 
+                  [--image-feature <image-feature>] [--image-shared] 
+                  [--stripe-unit <stripe-unit>] [--stripe-count <stripe-count>] 
+                  [--journal-splay-width <journal-splay-width>] 
+                  [--journal-object-size <journal-object-size>] 
+                  [--journal-pool <journal-pool>] [--no-progress] 
                   <source-image-or-snap-spec> <dest-image-spec> 
   
   Copy src image to dest.
@@ -157,36 +189,61 @@
     --snap arg                   source snapshot name
     --dest-pool arg              destination pool name
     --dest arg                   destination image name
+    --order arg                  object order [12 <= order <= 25]
+    --object-size arg            object size in B/K/M [4K <= object size <= 32M]
+    --image-feature arg          image features
+                                 [layering(+), striping(+), exclusive-lock(*),
+                                 object-map(*), fast-diff(*), deep-flatten,
+                                 journaling(*)]
+    --image-shared               shared image
+    --stripe-unit arg            stripe unit
+    --stripe-count arg           stripe count
+    --journal-splay-width arg    number of active journal objects
+    --journal-object-size arg    size of journal objects
+    --journal-pool arg           pool for journal objects
     --no-progress                disable progress output
   
+  Image Features:
+    (*) supports enabling/disabling on existing images
+    (+) enabled by default for new images if features not specified
+  
   rbd help create
   usage: rbd create [--pool <pool>] [--image <image>] 
                     [--image-format <image-format>] [--new-format] 
-                    [--order <order>] [--image-feature <image-feature>] 
-                    [--image-shared] [--stripe-unit <stripe-unit>] 
-                    [--stripe-count <stripe-count>] --size <size> 
+                    [--order <order>] [--object-size <object-size>] 
+                    [--image-feature <image-feature>] [--image-shared] 
+                    [--stripe-unit <stripe-unit>] 
+                    [--stripe-count <stripe-count>] 
+                    [--journal-splay-width <journal-splay-width>] 
+                    [--journal-object-size <journal-object-size>] 
+                    [--journal-pool <journal-pool>] --size <size> 
                     <image-spec> 
   
   Create an empty image.
   
   Positional arguments
-    <image-spec>         image specification
-                         (example: [<pool-name>/]<image-name>)
+    <image-spec>              image specification
+                              (example: [<pool-name>/]<image-name>)
   
   Optional arguments
-    -p [ --pool ] arg    pool name
-    --image arg          image name
-    --image-format arg   image format [1 or 2]
-    --new-format         use image format 2
-                         (deprecated)
-    --order arg          object order [12 <= order <= 25]
-    --image-feature arg  image features
-                         [layering(+), striping(+), exclusive-lock(*),
-                         object-map(*), fast-diff(*), deep-flatten, journaling(*)]
-    --image-shared       shared image
-    --stripe-unit arg    stripe unit
-    --stripe-count arg   stripe count
-    -s [ --size ] arg    image size (in M/G/T)
+    -p [ --pool ] arg         pool name
+    --image arg               image name
+    --image-format arg        image format [1 or 2]
+    --new-format              use image format 2
+                              (deprecated)
+    --order arg               object order [12 <= order <= 25]
+    --object-size arg         object size in B/K/M [4K <= object size <= 32M]
+    --image-feature arg       image features
+                              [layering(+), striping(+), exclusive-lock(*),
+                              object-map(*), fast-diff(*), deep-flatten,
+                              journaling(*)]
+    --image-shared            shared image
+    --stripe-unit arg         stripe unit
+    --stripe-count arg        stripe count
+    --journal-splay-width arg number of active journal objects
+    --journal-object-size arg size of journal objects
+    --journal-pool arg        pool for journal objects
+    -s [ --size ] arg         image size (in M/G/T)
   
   Image Features:
     (*) supports enabling/disabling on existing images
@@ -293,20 +350,26 @@
   
   rbd help feature enable
   usage: rbd feature enable [--pool <pool>] [--image <image>] 
+                            [--journal-splay-width <journal-splay-width>] 
+                            [--journal-object-size <journal-object-size>] 
+                            [--journal-pool <journal-pool>] 
                             <image-spec> <features> [<features> ...]
   
   Enable the specified image feature.
   
   Positional arguments
-    <image-spec>         image specification
-                         (example: [<pool-name>/]<image-name>)
-    <features>           image features
-                         [layering, striping, exclusive-lock, object-map,
-                         fast-diff, deep-flatten, journaling]
+    <image-spec>              image specification
+                              (example: [<pool-name>/]<image-name>)
+    <features>                image features
+                              [layering, striping, exclusive-lock, object-map,
+                              fast-diff, deep-flatten, journaling]
   
   Optional arguments
-    -p [ --pool ] arg    pool name
-    --image arg          image name
+    -p [ --pool ] arg         pool name
+    --image arg               image name
+    --journal-splay-width arg number of active journal objects
+    --journal-object-size arg size of journal objects
+    --journal-pool arg        pool for journal objects
   
   rbd help flatten
   usage: rbd flatten [--pool <pool>] [--image <image>] [--no-progress] 
@@ -389,36 +452,45 @@
   rbd help import
   usage: rbd import [--path <path>] [--dest-pool <dest-pool>] [--dest <dest>] 
                     [--image-format <image-format>] [--new-format] 
-                    [--order <order>] [--image-feature <image-feature>] 
-                    [--image-shared] [--stripe-unit <stripe-unit>] 
-                    [--stripe-count <stripe-count>] [--no-progress] 
+                    [--order <order>] [--object-size <object-size>] 
+                    [--image-feature <image-feature>] [--image-shared] 
+                    [--stripe-unit <stripe-unit>] 
+                    [--stripe-count <stripe-count>] 
+                    [--journal-splay-width <journal-splay-width>] 
+                    [--journal-object-size <journal-object-size>] 
+                    [--journal-pool <journal-pool>] [--no-progress] 
                     [--pool <pool>] [--image <image>] 
                     <path-name> <dest-image-spec> 
   
   Import image from file.
   
   Positional arguments
-    <path-name>          import file (or '-' for stdin)
-    <dest-image-spec>    destination image specification
-                         (example: [<pool-name>/]<image-name>)
+    <path-name>               import file (or '-' for stdin)
+    <dest-image-spec>         destination image specification
+                              (example: [<pool-name>/]<image-name>)
   
   Optional arguments
-    --path arg           import file (or '-' for stdin)
-    --dest-pool arg      destination pool name
-    --dest arg           destination image name
-    --image-format arg   image format [1 or 2]
-    --new-format         use image format 2
-                         (deprecated)
-    --order arg          object order [12 <= order <= 25]
-    --image-feature arg  image features
-                         [layering(+), striping(+), exclusive-lock(*),
-                         object-map(*), fast-diff(*), deep-flatten, journaling(*)]
-    --image-shared       shared image
-    --stripe-unit arg    stripe unit
-    --stripe-count arg   stripe count
-    --no-progress        disable progress output
-    -p [ --pool ] arg    pool name (deprecated)
-    --image arg          image name (deprecated)
+    --path arg                import file (or '-' for stdin)
+    --dest-pool arg           destination pool name
+    --dest arg                destination image name
+    --image-format arg        image format [1 or 2]
+    --new-format              use image format 2
+                              (deprecated)
+    --order arg               object order [12 <= order <= 25]
+    --object-size arg         object size in B/K/M [4K <= object size <= 32M]
+    --image-feature arg       image features
+                              [layering(+), striping(+), exclusive-lock(*),
+                              object-map(*), fast-diff(*), deep-flatten,
+                              journaling(*)]
+    --image-shared            shared image
+    --stripe-unit arg         stripe unit
+    --stripe-count arg        stripe count
+    --journal-splay-width arg number of active journal objects
+    --journal-object-size arg size of journal objects
+    --journal-pool arg        pool for journal objects
+    --no-progress             disable progress output
+    -p [ --pool ] arg         pool name (deprecated)
+    --image arg               image name (deprecated)
   
   Image Features:
     (*) supports enabling/disabling on existing images
@@ -460,6 +532,119 @@
     --format arg          output format [plain, json, or xml]
     --pretty-format       pretty formatting (json and xml)
   
+  rbd help journal export
+  usage: rbd journal export [--pool <pool>] [--image <image>] 
+                            [--journal <journal>] [--path <path>] [--verbose] 
+                            [--no-error] 
+                            <source-journal-spec> <path-name> 
+  
+  Export image journal.
+  
+  Positional arguments
+    <source-journal-spec>  source journal specification
+                           (example: [<pool-name>/]<journal-name>)
+    <path-name>            export file (or '-' for stdout)
+  
+  Optional arguments
+    -p [ --pool ] arg      source pool name
+    --image arg            source image name
+    --journal arg          source journal name
+    --path arg             export file (or '-' for stdout)
+    --verbose              be verbose
+    --no-error             continue after error
+  
+  rbd help journal import
+  usage: rbd journal import [--path <path>] [--dest-pool <dest-pool>] 
+                            [--dest <dest>] [--dest-journal <dest-journal>] 
+                            [--verbose] [--no-error] 
+                            <path-name> <dest-journal-spec> 
+  
+  Import image journal.
+  
+  Positional arguments
+    <path-name>          import file (or '-' for stdin)
+    <dest-journal-spec>  destination journal specification
+                         (example: [<pool-name>/]<journal-name>)
+  
+  Optional arguments
+    --path arg           import file (or '-' for stdin)
+    --dest-pool arg      destination pool name
+    --dest arg           destination image name
+    --dest-journal arg   destination journal name
+    --verbose            be verbose
+    --no-error           continue after error
+  
+  rbd help journal info
+  usage: rbd journal info [--pool <pool>] [--image <image>] 
+                          [--journal <journal>] [--format <format>] 
+                          [--pretty-format] 
+                          <journal-spec> 
+  
+  Show information about image journal.
+  
+  Positional arguments
+    <journal-spec>       journal specification
+                         (example: [<pool-name>/]<journal-name>)
+  
+  Optional arguments
+    -p [ --pool ] arg    pool name
+    --image arg          image name
+    --journal arg        journal name
+    --format arg         output format [plain, json, or xml]
+    --pretty-format      pretty formatting (json and xml)
+  
+  rbd help journal inspect
+  usage: rbd journal inspect [--pool <pool>] [--image <image>] 
+                             [--journal <journal>] [--verbose] 
+                             <journal-spec> 
+  
+  Inspect image journal for structural errors.
+  
+  Positional arguments
+    <journal-spec>       journal specification
+                         (example: [<pool-name>/]<journal-name>)
+  
+  Optional arguments
+    -p [ --pool ] arg    pool name
+    --image arg          image name
+    --journal arg        journal name
+    --verbose            be verbose
+  
+  rbd help journal reset
+  usage: rbd journal reset [--pool <pool>] [--image <image>] 
+                           [--journal <journal>] 
+                           <journal-spec> 
+  
+  Reset image journal.
+  
+  Positional arguments
+    <journal-spec>       journal specification
+                         (example: [<pool-name>/]<journal-name>)
+  
+  Optional arguments
+    -p [ --pool ] arg    pool name
+    --image arg          image name
+    --journal arg        journal name
+  
+  rbd help journal status
+  usage: rbd journal status [--pool <pool>] [--image <image>] 
+                            [--journal <journal>] [--format <format>] 
+                            [--pretty-format] 
+                            <journal-spec> 
+  
+  Show status of image journal.
+  
+  Positional arguments
+    <journal-spec>       journal specification
+                         (example: [<pool-name>/]<journal-name>)
+  
+  Optional arguments
+    -p [ --pool ] arg    pool name
+    --image arg          image name
+    --journal arg        journal name
+    --format arg         output format [plain, json, or xml]
+    --pretty-format      pretty formatting (json and xml)
+  
   rbd help list
   usage: rbd list [--long] [--pool <pool>] [--format <format>] [--pretty-format] 
                   <pool-name> 
@@ -557,6 +742,125 @@
     --path arg           path to merged diff (or '-' for stdout)
     --no-progress        disable progress output
   
+  rbd help mirror pool disable
+  usage: rbd mirror pool disable [--pool <pool>] 
+                                 <pool-name> 
+  
+  Disable RBD mirroring by default within a pool.
+  
+  Positional arguments
+    <pool-name>          pool name
+  
+  Optional arguments
+    -p [ --pool ] arg    pool name
+  
+  rbd help mirror pool enable
+  usage: rbd mirror pool enable [--pool <pool>] 
+                                <pool-name> 
+  
+  Enable RBD mirroring by default within a pool.
+  
+  Positional arguments
+    <pool-name>          pool name
+  
+  Optional arguments
+    -p [ --pool ] arg    pool name
+  
+  rbd help mirror pool info
+  usage: rbd mirror pool info [--pool <pool>] [--format <format>] 
+                              [--pretty-format] 
+                              <pool-name> 
+  
+  Show information about the pool mirroring configuration.
+  
+  Positional arguments
+    <pool-name>          pool name
+  
+  Optional arguments
+    -p [ --pool ] arg    pool name
+    --format arg         output format [plain, json, or xml]
+    --pretty-format      pretty formatting (json and xml)
+  
+  rbd help mirror pool peer add
+  usage: rbd mirror pool peer add [--pool <pool>] 
+                                  [--remote-client-name <remote-client-name>] 
+                                  [--remote-cluster <remote-cluster>] 
+                                  [--remote-cluster-uuid <remote-cluster-uuid>] 
+                                  <pool-name> <remote-cluster-spec> 
+  
+  Add a mirroring peer to a pool.
+  
+  Positional arguments
+    <pool-name>               pool name
+    <remote-cluster-spec>     remote cluster spec
+                              (example: [<client name>@]<cluster name>
+  
+  Optional arguments
+    -p [ --pool ] arg         pool name
+    --remote-client-name arg  remote client name
+    --remote-cluster arg      remote cluster name
+    --remote-cluster-uuid arg remote cluster uuid
+  
+  rbd help mirror pool peer remove
+  usage: rbd mirror pool peer remove [--pool <pool>] 
+                                     <pool-name> <cluster-uuid> 
+  
+  Remove a mirroring peer from a pool.
+  
+  Positional arguments
+    <pool-name>          pool name
+    <cluster-uuid>       cluster UUID
+  
+  Optional arguments
+    -p [ --pool ] arg    pool name
+  
+  rbd help mirror pool peer set
+  usage: rbd mirror pool peer set [--pool <pool>] 
+                                  <pool-name> <cluster-uuid> <key> <value> 
+  
+  Update mirroring peer settings.
+  
+  Positional arguments
+    <pool-name>          pool name
+    <cluster-uuid>       cluster UUID
+    <key>                peer parameter [client or cluster]
+    <value>              new client or cluster name
+  
+  Optional arguments
+    -p [ --pool ] arg    pool name
+  
+  rbd help nbd list
+  usage: rbd nbd list 
+  
+  List the nbd devices already used.
+  
+  rbd help nbd map
+  usage: rbd nbd map [--pool <pool>] [--image <image>] [--snap <snap>] 
+                     [--read-only] [--device <device>] 
+                     <image-or-snap-spec> 
+  
+  Map image to a nbd device.
+  
+  Positional arguments
+    <image-or-snap-spec>  image or snapshot specification
+                          (example: [<pool-name>/]<image-name>[@<snap-name>])
+  
+  Optional arguments
+    -p [ --pool ] arg     pool name
+    --image arg           image name
+    --snap arg            snapshot name
+    --read-only           mount read-only
+    --device arg          specify nbd device
+  
+  rbd help nbd unmap
+  usage: rbd nbd unmap 
+                       <device-spec> 
+  
+  Unmap a nbd device.
+  
+  Positional arguments
+    <device-spec>        specify nbd device
+  
   rbd help object-map rebuild
   usage: rbd object-map rebuild [--pool <pool>] [--image <image>] 
                                 [--snap <snap>] [--no-progress] 
diff --git a/src/test/cls_rbd/test_cls_rbd.cc b/src/test/cls_rbd/test_cls_rbd.cc
index b53cfa8..218505d 100644
--- a/src/test/cls_rbd/test_cls_rbd.cc
+++ b/src/test/cls_rbd/test_cls_rbd.cc
@@ -9,6 +9,7 @@
 #include "include/stringify.h"
 #include "cls/rbd/cls_rbd.h"
 #include "cls/rbd/cls_rbd_client.h"
+#include "cls/rbd/cls_rbd_types.h"
 
 #include "gtest/gtest.h"
 #include "test/librados/test.h"
@@ -18,52 +19,37 @@
 #include <vector>
 
 using namespace std;
-using ::librbd::cls_client::create_image;
-using ::librbd::cls_client::get_features;
-using ::librbd::cls_client::set_features;
-using ::librbd::cls_client::get_size;
-using ::librbd::cls_client::get_object_prefix;
-using ::librbd::cls_client::set_size;
-using ::librbd::cls_client::get_parent;
-using ::librbd::cls_client::set_parent;
-using ::librbd::cls_client::remove_parent;
-using ::librbd::cls_client::snapshot_add;
-using ::librbd::cls_client::snapshot_remove;
-using ::librbd::cls_client::add_child;
-using ::librbd::cls_client::remove_child;
-using ::librbd::cls_client::get_children;
-using ::librbd::cls_client::get_snapcontext;
-using ::librbd::cls_client::snapshot_list;
-using ::librbd::cls_client::copyup;
-using ::librbd::cls_client::get_id;
-using ::librbd::cls_client::set_id;
-using ::librbd::cls_client::dir_get_id;
-using ::librbd::cls_client::dir_get_name;
-using ::librbd::cls_client::dir_list;
-using ::librbd::cls_client::dir_add_image;
-using ::librbd::cls_client::dir_remove_image;
-using ::librbd::cls_client::dir_rename_image;
+using namespace librbd::cls_client;
 using ::librbd::parent_info;
 using ::librbd::parent_spec;
-using ::librbd::cls_client::get_protection_status;
-using ::librbd::cls_client::set_protection_status;
-using ::librbd::cls_client::get_stripe_unit_count;
-using ::librbd::cls_client::set_stripe_unit_count;
-using ::librbd::cls_client::old_snapshot_add;
-using ::librbd::cls_client::get_mutable_metadata;
-using ::librbd::cls_client::object_map_load;
-using ::librbd::cls_client::object_map_save;
-using ::librbd::cls_client::object_map_resize;
-using ::librbd::cls_client::object_map_update;
-using ::librbd::cls_client::object_map_snap_add;
-using ::librbd::cls_client::object_map_snap_remove;
-using ::librbd::cls_client::get_flags;
-using ::librbd::cls_client::set_flags;
-using ::librbd::cls_client::metadata_set;
-using ::librbd::cls_client::metadata_remove;
-using ::librbd::cls_client::metadata_list;
-using ::librbd::cls_client::metadata_get;
-using ::librbd::cls_client::snapshot_rename;
+
+static int snapshot_add(librados::IoCtx *ioctx, const std::string &oid,
+                        uint64_t snap_id, const std::string &snap_name) {
+  librados::ObjectWriteOperation op;  // test-local wrapper: fill a write op, then execute it on oid
+  ::librbd::cls_client::snapshot_add(&op, snap_id, snap_name);  // fully qualified so the cls_client function is called, not this wrapper
+  return ioctx->operate(oid, &op);  // result of operate() is returned to the test
+}
+
+static int snapshot_remove(librados::IoCtx *ioctx, const std::string &oid,
+                           uint64_t snap_id) {
+  librados::ObjectWriteOperation op;  // test-local wrapper: fill a write op, then execute it on oid
+  ::librbd::cls_client::snapshot_remove(&op, snap_id);  // fully qualified so the cls_client function is called, not this wrapper
+  return ioctx->operate(oid, &op);  // result of operate() is returned to the test
+}
+
+static int snapshot_rename(librados::IoCtx *ioctx, const std::string &oid,
+                           uint64_t snap_id, const std::string &snap_name) {
+  librados::ObjectWriteOperation op;  // test-local wrapper: fill a write op, then execute it on oid
+  ::librbd::cls_client::snapshot_rename(&op, snap_id, snap_name);  // fully qualified so the cls_client function is called, not this wrapper
+  return ioctx->operate(oid, &op);  // result of operate() is returned to the test
+}
+
+static int old_snapshot_add(librados::IoCtx *ioctx, const std::string &oid,
+                            uint64_t snap_id, const std::string &snap_name) {
+  librados::ObjectWriteOperation op;  // test-local wrapper: fill a write op, then execute it on oid
+  ::librbd::cls_client::old_snapshot_add(&op, snap_id, snap_name);  // fully qualified so the cls_client function is called, not this wrapper
+  return ioctx->operate(oid, &op);  // result of operate() is returned to the test
+}
 
 static char *random_buf(size_t len)
 {
@@ -281,20 +267,28 @@ TEST_F(TestClsRbd, directory_methods)
   ASSERT_EQ(0, dir_get_id(&ioctx, oid, imgname2, &id));
   ASSERT_EQ(valid_id2, id);
 
-  ASSERT_EQ(-ESTALE, dir_rename_image(&ioctx, oid, imgname, imgname2, valid_id2));
+  librados::ObjectWriteOperation op1;
+  dir_rename_image(&op1, imgname, imgname2, valid_id2);
+  ASSERT_EQ(-ESTALE, ioctx.operate(oid, &op1));
   ASSERT_EQ(-ESTALE, dir_remove_image(&ioctx, oid, imgname, valid_id2));
-  ASSERT_EQ(-EEXIST, dir_rename_image(&ioctx, oid, imgname, imgname2, valid_id));
+  librados::ObjectWriteOperation op2;
+  dir_rename_image(&op2, imgname, imgname2, valid_id);
+  ASSERT_EQ(-EEXIST, ioctx.operate(oid, &op2));
   ASSERT_EQ(0, dir_get_id(&ioctx, oid, imgname, &id));
   ASSERT_EQ(valid_id, id);
   ASSERT_EQ(0, dir_get_name(&ioctx, oid, valid_id2, &name));
   ASSERT_EQ(imgname2, name);
 
-  ASSERT_EQ(0, dir_rename_image(&ioctx, oid, imgname, imgname3, valid_id));
+  librados::ObjectWriteOperation op3;
+  dir_rename_image(&op3, imgname, imgname3, valid_id);
+  ASSERT_EQ(0, ioctx.operate(oid, &op3));
   ASSERT_EQ(0, dir_get_id(&ioctx, oid, imgname3, &id));
   ASSERT_EQ(valid_id, id);
   ASSERT_EQ(0, dir_get_name(&ioctx, oid, valid_id, &name));
   ASSERT_EQ(imgname3, name);
-  ASSERT_EQ(0, dir_rename_image(&ioctx, oid, imgname3, imgname, valid_id));
+  librados::ObjectWriteOperation op4;
+  dir_rename_image(&op4, imgname3, imgname, valid_id);
+  ASSERT_EQ(0, ioctx.operate(oid, &op4));
 
   ASSERT_EQ(0, dir_remove_image(&ioctx, oid, imgname, valid_id));
   ASSERT_EQ(0, dir_list(&ioctx, oid, "", 30, &images));
@@ -1289,3 +1283,59 @@ TEST_F(TestClsRbd, set_features)
   mask = RBD_FEATURE_LAYERING;
   ASSERT_EQ(-EINVAL, set_features(&ioctx, oid, features, mask));
 }
+
+TEST_F(TestClsRbd, mirror) {  // walks the mirror_* calls: enable, peer add/list/set/remove, disable
+  librados::IoCtx ioctx;
+  ASSERT_EQ(0, _rados.ioctx_create(_pool_name.c_str(), ioctx));
+
+  std::vector<cls::rbd::MirrorPeer> peers;
+  ASSERT_EQ(-ENOENT, mirror_peer_list(&ioctx, &peers));  // listing is expected to fail before anything is enabled or added
+
+  ASSERT_EQ(-EINVAL, mirror_peer_add(&ioctx, "uuid1", "cluster1", "client"));  // same add succeeds below once mirroring is enabled
+
+  bool enabled;
+  ASSERT_EQ(0, mirror_is_enabled(&ioctx, &enabled));
+  ASSERT_FALSE(enabled);
+  ASSERT_EQ(0, mirror_set_enabled(&ioctx, true));
+  ASSERT_EQ(0, mirror_is_enabled(&ioctx, &enabled));
+  ASSERT_TRUE(enabled);
+
+  ASSERT_EQ(0, mirror_peer_add(&ioctx, "uuid1", "cluster1", "client"));
+  ASSERT_EQ(0, mirror_peer_add(&ioctx, "uuid2", "cluster2", "admin"));
+  ASSERT_EQ(-EEXIST, mirror_peer_add(&ioctx, "uuid2", "cluster3", "foo"));  // uuid2 was already added above
+  ASSERT_EQ(-EEXIST, mirror_peer_add(&ioctx, "uuid3", "cluster1", "foo"));  // cluster1 was already added above under uuid1
+  ASSERT_EQ(0, mirror_peer_add(&ioctx, "uuid3", "cluster3", "admin"));
+
+  ASSERT_EQ(0, mirror_peer_list(&ioctx, &peers));
+  std::vector<cls::rbd::MirrorPeer> expected_peers = {
+    {"uuid1", "cluster1", "client"},
+    {"uuid2", "cluster2", "admin"},
+    {"uuid3", "cluster3", "admin"}};
+  ASSERT_EQ(expected_peers, peers);
+
+  ASSERT_EQ(0, mirror_peer_remove(&ioctx, "uuid4"));  // uuid4 was never added; removal is still expected to return 0
+  ASSERT_EQ(0, mirror_peer_remove(&ioctx, "uuid2"));
+
+  ASSERT_EQ(-ENOENT, mirror_peer_set_client(&ioctx, "uuid4", "new client"));  // unlike remove, set on an unknown uuid is expected to fail
+  ASSERT_EQ(0, mirror_peer_set_client(&ioctx, "uuid1", "new client"));
+
+  ASSERT_EQ(-ENOENT, mirror_peer_set_cluster(&ioctx, "uuid4", "new cluster"));
+  ASSERT_EQ(0, mirror_peer_set_cluster(&ioctx, "uuid3", "new cluster"));
+
+  ASSERT_EQ(0, mirror_peer_list(&ioctx, &peers));
+  expected_peers = {
+    {"uuid1", "cluster1", "new client"},
+    {"uuid3", "new cluster", "admin"}};
+  ASSERT_EQ(expected_peers, peers);
+  ASSERT_EQ(-EBUSY, mirror_set_enabled(&ioctx, false));  // two peers are still registered at this point
+
+  ASSERT_EQ(0, mirror_peer_remove(&ioctx, "uuid3"));
+  ASSERT_EQ(0, mirror_peer_remove(&ioctx, "uuid1"));
+  ASSERT_EQ(0, mirror_peer_list(&ioctx, &peers));  // with mirroring enabled, an empty peer list is returned with 0, not -ENOENT
+  expected_peers = {};
+  ASSERT_EQ(expected_peers, peers);
+
+  ASSERT_EQ(0, mirror_set_enabled(&ioctx, false));  // disabling succeeds once no peers remain
+  ASSERT_EQ(0, mirror_is_enabled(&ioctx, &enabled));
+  ASSERT_FALSE(enabled);
+}
diff --git a/src/test/common/ObjectContents.h b/src/test/common/ObjectContents.h
index 8ca410b..0f467b1 100644
--- a/src/test/common/ObjectContents.h
+++ b/src/test/common/ObjectContents.h
@@ -1,6 +1,6 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 #include "include/interval_set.h"
-#include "include/buffer.h"
+#include "include/buffer_fwd.h"
 #include <map>
 
 #ifndef COMMON_OBJECT_H
diff --git a/src/test/common/test_tableformatter.cc b/src/test/common/test_tableformatter.cc
index 88648f0..ff99e84 100644
--- a/src/test/common/test_tableformatter.cc
+++ b/src/test/common/test_tableformatter.cc
@@ -5,6 +5,8 @@
 #include <sstream>
 #include <string>
 
+using namespace ceph;
+
 TEST(tableformatter, singleline)
 {
   std::stringstream sout;
diff --git a/src/test/encoding/readable.sh b/src/test/encoding/readable.sh
index f387bd1..2116f45 100755
--- a/src/test/encoding/readable.sh
+++ b/src/test/encoding/readable.sh
@@ -12,85 +12,120 @@ numtests=0
 
 myversion=`./ceph-dencoder version`
 
-for arversion in `ls -v $dir/archive`
-do
-    vdir="$dir/archive/$arversion"
-#    echo $vdir
+for arversion in `ls -v $dir/archive`; do
+  vdir="$dir/archive/$arversion"
+  #echo $vdir
 
-    if [ ! -d "$vdir/objects" ]; then
-	continue;
-    fi
+  if [ ! -d "$vdir/objects" ]; then
+    continue;
+  fi
+
+  for type in `ls $vdir/objects`; do
+    if ./ceph-dencoder type $type 2>/dev/null; then
+      #echo "type $type";
+      echo "        $vdir/objects/$type"
+
+      # is there a fwd incompat change between $arversion and $version?
+      incompat=""
+      incompat_paths=""
+      sawarversion=0
+      for iv in `ls -v $dir/archive`; do
+        if [ "$iv" = "$arversion" ]; then
+          sawarversion=1
+        fi
+
+        if [ $sawarversion -eq 1 ] && [ -e "$dir/archive/$iv/forward_incompat/$type" ]; then
+          incompat="$iv"
+
+          # Check whether only specific objects are to be ignored rather than the whole type.
+          # If so, remember every such path for this type in $incompat_paths. This assumes the
+          # paths contain no whitespace (the enclosing for loop over `ls` output already does).
+          if [ -d "$dir/archive/$iv/forward_incompat/$type" ]; then
+            if [ -n "`ls -v $dir/archive/$iv/forward_incompat/$type/`" ]; then
+              incompat_paths="$incompat_paths $dir/archive/$iv/forward_incompat/$type"
+            else
+              echo "type $type directory empty, ignoring whole type instead of single objects"
+            fi;
+          fi
+        fi
+
+        if [ "$iv" = "$version" ]; then
+          break
+        fi
+      done
+
+      if [ -n "$incompat" ]; then
+        if [ -z "$incompat_paths" ]; then
+          echo "skipping incompat $type version $arversion, changed at $incompat < code $myversion"
+          continue
+        else
+          # If only the objects listed under $incompat_paths are ignored (not the whole
+          # type), don't skip here; just report it. The per-object skip happens below.
+          echo "postponed skip one of incompact $type version $arversion, changed at $incompat < code $myversion"
+        fi;
+      fi
 
-    for type in `ls $vdir/objects`
-    do
-	if ./ceph-dencoder type $type 2>/dev/null; then
-#	    echo "type $type";
-	    echo "        $vdir/objects/$type"
-
-	    # is there a fwd incompat change between $arversion and $version?
-	    incompat=""
-	    sawarversion=0
-	    for iv in `ls -v $dir/archive`
-	    do
-		if [ "$iv" = "$arversion" ]; then
-		    sawarversion=1
-		fi
-		if [ $sawarversion -eq 1 ] && [ -e "$dir/archive/$iv/forward_incompat/$type" ]; then
-		    incompat="$iv"
-		fi
-		if [ "$iv" = "$version" ]; then
-		    break
-		fi
-	    done
-	    if [ -n "$incompat" ]; then
-		echo "skipping incompat $type version $arversion, changed at $iv < code $myversion"
-		continue
-	    fi
-
-	    for f in `ls $vdir/objects/$type`; do
-#		echo "\t$vdir/$type/$f"
-		if ! ./ceph-dencoder type $type import $vdir/objects/$type/$f decode dump_json > $tmp1; then
-		    echo "**** failed to decode $vdir/objects/$type/$f ****"
-		    failed=$(($failed + 1))
-		    continue	    
-		fi
-		if ! ./ceph-dencoder type $type import $vdir/objects/$type/$f decode encode decode dump_json > $tmp2; then
-		    echo "**** failed to decode+encode+decode $vdir/objects/$type/$f ****"
-		    failed=$(($failed + 1))
-		    continue
-		fi
-
-		# nondeterministic classes may dump
-		# nondeterministically.  compare the sorted json
-		# output.  this is a weaker test, but is better than
-		# nothing.
-		if ! ./ceph-dencoder type $type is_deterministic
-		then
-		    echo "  sorting json output for nondeterministic object"
-		    for f in $tmp1 $tmp2; do
-			sort $f | sed 's/,$//' > $f.new
-			mv $f.new $f
-		    done
-		fi
-
-		if ! cmp $tmp1 $tmp2; then
-		    echo "**** reencode of $vdir/objects/$type/$f resulted in a different dump ****"
-		    diff $tmp1 $tmp2
-		    failed=$(($failed + 1))
-	    	fi
-		numtests=$(($numtests + 1))
-	    done
-	else
-            echo "skipping unrecognized type $type"
-	fi
-    done
+      for f in `ls $vdir/objects/$type`; do
+
+        skip=0;
+        # Check if processed object $f of $type should be skipped (postponed skip)
+        if [ -n "$incompat_paths" ]; then
+            for i_path in $incompat_paths; do
+              # Skip $f if a symbolic link of the same name exists here (-L does not check the link target)
+              if [ -L "$i_path/$f" ]; then
+                echo "skipping object $f of type $type"
+                skip=1
+                break
+              fi;
+            done;
+        fi;
+
+        if [ $skip -ne 0 ]; then
+          continue
+        fi;
+
+        #echo "\t$vdir/$type/$f"
+        if ! ./ceph-dencoder type $type import $vdir/objects/$type/$f decode dump_json > $tmp1; then
+          echo "**** failed to decode $vdir/objects/$type/$f ****"
+          failed=$(($failed + 1))
+          continue      
+        fi
+        if ! ./ceph-dencoder type $type import $vdir/objects/$type/$f decode encode decode dump_json > $tmp2; then
+          echo "**** failed to decode+encode+decode $vdir/objects/$type/$f ****"
+          failed=$(($failed + 1))
+          continue
+        fi
+
+        # Nondeterministic classes may dump nondeterministically, so
+        # compare the sorted json output instead.  This is a weaker
+        # test, but is better than nothing.  The loop variable must not
+        # be $f: the failure message below still reports the object name.
+        if ! ./ceph-dencoder type $type is_deterministic; then
+          echo "  sorting json output for nondeterministic object"
+          for tmpf in $tmp1 $tmp2; do
+            sort $tmpf | sed 's/,$//' > $tmpf.new
+            mv $tmpf.new $tmpf
+          done
+        fi
+
+        if ! cmp $tmp1 $tmp2; then
+          echo "**** reencode of $vdir/objects/$type/$f resulted in a different dump ****"
+          diff $tmp1 $tmp2
+          failed=$(($failed + 1))
+        fi
+        numtests=$(($numtests + 1))
+      done
+    else
+      echo "skipping unrecognized type $type"
+    fi
+  done
 done
 
 rm -f $tmp1 $tmp2
 
 if [ $failed -gt 0 ]; then
-    echo "FAILED $failed / $numtests tests."
-    exit 1
+  echo "FAILED $failed / $numtests tests."
+  exit 1
 fi
 echo "passed $numtests tests."
 
diff --git a/src/test/encoding/types.h b/src/test/encoding/types.h
index a9dbc9b..26e672c 100644
--- a/src/test/encoding/types.h
+++ b/src/test/encoding/types.h
@@ -226,15 +226,17 @@ TYPE(ETableServer)
 #include "mds/events/EUpdate.h"
 TYPE(EUpdate)
 
+#ifdef WITH_RBD
 #include "librbd/JournalTypes.h"
 TYPE(librbd::journal::EventEntry)
 #include "librbd/WatchNotifyTypes.h"
-TYPE(librbd::WatchNotify::NotifyMessage)
-TYPE(librbd::WatchNotify::ResponseMessage)
+TYPE(librbd::watch_notify::NotifyMessage)
+TYPE(librbd::watch_notify::ResponseMessage)
 
 #include "rbd_replay/ActionTypes.h"
 TYPE(rbd_replay::action::Dependency)
 TYPE(rbd_replay::action::ActionEntry);
+#endif
 
 #ifdef WITH_RADOSGW
 
@@ -331,10 +333,15 @@ TYPE(rgw_obj)
 #include "rgw/rgw_log.h"
 TYPE(rgw_log_entry)
 
+#ifdef WITH_RBD
 #include "cls/rbd/cls_rbd.h"
 TYPE(cls_rbd_parent)
 TYPE(cls_rbd_snap)
 
+#include "cls/rbd/cls_rbd_types.h"
+TYPE(cls::rbd::MirrorPeer)
+#endif
+
 #endif
 
 #include "cls/lock/cls_lock_types.h"
diff --git a/src/test/erasure-code/ceph_erasure_code.cc b/src/test/erasure-code/ceph_erasure_code.cc
index 00d4496..a488366 100644
--- a/src/test/erasure-code/ceph_erasure_code.cc
+++ b/src/test/erasure-code/ceph_erasure_code.cc
@@ -188,7 +188,6 @@ int main(int argc, char** argv) {
  *   libtool --mode=execute valgrind --tool=memcheck --leak-check=full \
  *      ./ceph_erasure_code \
  *      --parameter plugin=jerasure \
- *      --parameter directory=.libs \
  *      --parameter technique=reed_sol_van \
  *      --parameter k=2 \
  *      --parameter m=2 \
diff --git a/src/test/erasure-code/ceph_erasure_code_benchmark.cc b/src/test/erasure-code/ceph_erasure_code_benchmark.cc
index 052d8fe..31a73d4 100644
--- a/src/test/erasure-code/ceph_erasure_code_benchmark.cc
+++ b/src/test/erasure-code/ceph_erasure_code_benchmark.cc
@@ -87,7 +87,6 @@ int ErasureCodeBench::setup(int argc, char** argv) {
     CINIT_FLAG_NO_DEFAULT_CONFIG_FILE);
   common_init_finish(g_ceph_context);
   g_ceph_context->_conf->apply_changes(NULL);
-  g_conf->set_val("erasure_code_dir", ".libs", false, false);
 
   if (vm.count("help")) {
     cout << desc << std::endl;
diff --git a/src/test/erasure-code/ceph_erasure_code_non_regression.cc b/src/test/erasure-code/ceph_erasure_code_non_regression.cc
index 91e251f..bc65123 100644
--- a/src/test/erasure-code/ceph_erasure_code_non_regression.cc
+++ b/src/test/erasure-code/ceph_erasure_code_non_regression.cc
@@ -330,7 +330,6 @@ int main(int argc, char** argv) {
  *   libtool --mode=execute valgrind --tool=memcheck --leak-check=full \
  *      ./ceph_erasure_code_non_regression \
  *      --plugin jerasure \
- *      --parameter directory=.libs \
  *      --parameter technique=reed_sol_van \
  *      --parameter k=2 \
  *      --parameter m=2 \
diff --git a/src/test/fedora-21/ceph.spec.in b/src/test/fedora-21/ceph.spec.in
index 2939fef..52c5c1d 100644
--- a/src/test/fedora-21/ceph.spec.in
+++ b/src/test/fedora-21/ceph.spec.in
@@ -43,6 +43,7 @@ restorecon -R /var/log/ceph > /dev/null 2>&1;
 # /var/run/ceph.
 %if 0%{?fedora} || 0%{?rhel} >= 7 || 0%{?suse_version} >= 1210
 %global _with_systemd 1
+%{!?tmpfiles_create: %global tmpfiles_create systemd-tmpfiles --create}
 %endif
 
 # LTTng-UST enabled on Fedora, RHEL 6, and SLES 12
@@ -106,6 +107,11 @@ BuildRequires:	boost-devel
 BuildRequires:  cmake
 BuildRequires:	cryptsetup
 BuildRequires:	fuse-devel
+%if 0%{?suse_version}
+BuildRequires:	python-Cython
+%else
+BuildRequires:	Cython
+%endif
 BuildRequires:	gdbm
 BuildRequires:	hdparm
 BuildRequires:	leveldb-devel > 1.2
@@ -121,6 +127,7 @@ BuildRequires:	parted
 BuildRequires:	perl
 BuildRequires:	pkgconfig
 BuildRequires:	python
+BuildRequires:	python-devel
 BuildRequires:	python-nose
 BuildRequires:	python-requests
 BuildRequires:	python-virtualenv
@@ -138,6 +145,7 @@ BuildRequires:	yasm
 %if 0%{?_with_systemd}
 BuildRequires:  pkgconfig(systemd)
 BuildRequires:	systemd-rpm-macros
+BuildRequires:	systemd
 %{?systemd_requires}
 %endif
 PreReq:		%fillup_prereq
@@ -253,6 +261,15 @@ Requires:	librbd1 = %{epoch}:%{version}-%{release}
 %description -n rbd-fuse
 FUSE based client to map Ceph rbd images to files
 
+%package -n rbd-nbd
+Summary:	Ceph RBD client base on NBD
+Group:		System Environment/Base
+Requires:	%{name}
+Requires:	librados2 = %{epoch}:%{version}-%{release}
+Requires:	librbd1 = %{epoch}:%{version}-%{release}
+%description -n rbd-nbd
+NBD based client to map Ceph rbd images to local device
+
 %package radosgw
 Summary:	Rados REST gateway
 Group:		Development/Libraries
@@ -628,6 +645,10 @@ install -m 0644 -D etc/sysconfig/ceph $RPM_BUILD_ROOT%{_localstatedir}/adm/fillu
   install -m 0644 -D systemd/ceph-mds at .service $RPM_BUILD_ROOT%{_unitdir}/ceph-mds at .service
   install -m 0644 -D systemd/ceph-radosgw at .service $RPM_BUILD_ROOT%{_unitdir}/ceph-radosgw at .service
   install -m 0644 -D systemd/ceph.target $RPM_BUILD_ROOT%{_unitdir}/ceph.target
+  install -m 0644 -D systemd/ceph-osd.target $RPM_BUILD_ROOT%{_unitdir}/ceph-osd.target
+  install -m 0644 -D systemd/ceph-mon.target $RPM_BUILD_ROOT%{_unitdir}/ceph-mon.target
+  install -m 0644 -D systemd/ceph-mds.target $RPM_BUILD_ROOT%{_unitdir}/ceph-mds.target
+  install -m 0644 -D systemd/ceph-radosgw.target $RPM_BUILD_ROOT%{_unitdir}/ceph-radosgw.target
   install -m 0644 -D systemd/ceph-disk at .service $RPM_BUILD_ROOT%{_unitdir}/ceph-disk at .service
   install -m 0755 -D systemd/ceph $RPM_BUILD_ROOT%{_sbindir}/rcceph
 %else
@@ -786,6 +807,10 @@ rm -rf $RPM_BUILD_ROOT
 %{_unitdir}/ceph-radosgw at .service
 %{_unitdir}/ceph-disk at .service
 %{_unitdir}/ceph.target
+%{_unitdir}/ceph-osd.target
+%{_unitdir}/ceph-mon.target
+%{_unitdir}/ceph-mds.target
+%{_unitdir}/ceph-radosgw.target
 %else
 %{_initrddir}/ceph
 %endif
@@ -939,7 +964,7 @@ exit 0
 
 %post -n ceph-common
 %if 0%{?_with_systemd}
-systemd-tmpfiles --create --prefix=/run/ceph
+%tmpfiles_create %{_tmpfilesdir}/ceph-common.conf
 %endif
 
 %postun -n ceph-common
@@ -967,6 +992,12 @@ fi
 %{_mandir}/man8/rbd-fuse.8*
 
 #################################################################################
+%files -n rbd-nbd
+%defattr(-,root,root,-)
+%{_bindir}/rbd-nbd
+%{_mandir}/man8/rbd-nbd.8*
+
+#################################################################################
 %files radosgw
 %defattr(-,root,root,-)
 %{_bindir}/radosgw
@@ -1057,6 +1088,7 @@ fi
 %{_includedir}/rados/librados.h
 %{_includedir}/rados/librados.hpp
 %{_includedir}/rados/buffer.h
+%{_includedir}/rados/buffer_fwd.h
 %{_includedir}/rados/page.h
 %{_includedir}/rados/crc32c.h
 %{_includedir}/rados/rados_types.h
@@ -1122,7 +1154,8 @@ ln -sf %{_libdir}/librbd.so.1 /usr/lib64/qemu/librbd.so.1
 #################################################################################
 %files -n python-rbd
 %defattr(-,root,root,-)
-%{python_sitelib}/rbd.py*
+%{python_sitearch}/rbd.so
+%{python_sitearch}/rbd-*.egg-info
 
 #################################################################################
 %files -n libcephfs1
diff --git a/src/test/journal/test_JournalPlayer.cc b/src/test/journal/test_JournalPlayer.cc
index adc445a..c4c2f92 100644
--- a/src/test/journal/test_JournalPlayer.cc
+++ b/src/test/journal/test_JournalPlayer.cc
@@ -57,8 +57,8 @@ public:
     RadosTestFixture::TearDown();
   }
 
-  int create(const std::string &oid) {
-    return RadosTestFixture::create(oid, 14, 2);
+  int create(const std::string &oid, uint8_t splay_width = 2) {
+    return RadosTestFixture::create(oid, 14, splay_width);
   }
 
   int client_register(const std::string &oid) {
@@ -352,3 +352,44 @@ TEST_F(TestJournalPlayer, PrefetchAndWatch) {
   expected_entries = {create_entry("tag1", 124)};
   ASSERT_EQ(expected_entries, entries);
 }
+
+TEST_F(TestJournalPlayer, PrefetchSkippedObject) {  // prefetch is expected to return all five entries in write order
+  std::string oid = get_temp_oid();
+
+  cls::journal::ObjectSetPosition commit_position;
+
+  ASSERT_EQ(0, create(oid, 3));  // splay width 3 rather than the fixture's default of 2
+  ASSERT_EQ(0, client_register(oid));
+  ASSERT_EQ(0, client_commit(oid, commit_position));  // commit position is default-constructed
+
+  journal::JournalMetadataPtr metadata = create_metadata(oid);
+  ASSERT_EQ(0, init_metadata(metadata));
+  metadata->set_active_set(2);
+
+  journal::JournalPlayer *player = create_player(oid, metadata);
+
+  ASSERT_EQ(0, write_entry(oid, 0, "tag1", 122));
+  ASSERT_EQ(0, write_entry(oid, 1, "tag1", 123));
+  ASSERT_EQ(0, write_entry(oid, 5, "tag1", 124));  // NOTE(review): 2nd arg looks like the object number, so 2-4 are skipped -- confirm
+  ASSERT_EQ(0, write_entry(oid, 6, "tag1", 125));
+  ASSERT_EQ(0, write_entry(oid, 7, "tag1", 126));
+
+  player->prefetch();
+
+  Entries entries;
+  ASSERT_TRUE(wait_for_entries(player, 5, &entries));  // 5 == number of write_entry calls above
+  ASSERT_TRUE(wait_for_complete(player));
+
+  Entries expected_entries;
+  expected_entries = {
+    create_entry("tag1", 122),
+    create_entry("tag1", 123),
+    create_entry("tag1", 124),
+    create_entry("tag1", 125),
+    create_entry("tag1", 126)};
+  ASSERT_EQ(expected_entries, entries);
+
+  uint64_t last_tid;
+  ASSERT_TRUE(metadata->get_last_allocated_tid("tag1", &last_tid));
+  ASSERT_EQ(126U, last_tid);  // 126 is the highest value written for tag1 above
+}
diff --git a/src/test/journal/test_JournalTrimmer.cc b/src/test/journal/test_JournalTrimmer.cc
index ff1f6b8..18572aa 100644
--- a/src/test/journal/test_JournalTrimmer.cc
+++ b/src/test/journal/test_JournalTrimmer.cc
@@ -163,7 +163,7 @@ TEST_F(TestJournalTrimmer, RemoveObjects) {
 
   journal::JournalTrimmer *trimmer = create_trimmer(oid, metadata);
 
-  ASSERT_EQ(0, trimmer->remove_objects());
+  ASSERT_EQ(0, trimmer->remove_objects(false));
   ASSERT_TRUE(wait_for_update(metadata));
 
   ASSERT_EQ(-ENOENT, assert_exists(oid + ".0"));
@@ -183,6 +183,7 @@ TEST_F(TestJournalTrimmer, RemoveObjectsWithOtherClient) {
   ASSERT_TRUE(wait_for_update(metadata));
 
   journal::JournalTrimmer *trimmer = create_trimmer(oid, metadata);
-  ASSERT_EQ(-EBUSY, trimmer->remove_objects());
+  ASSERT_EQ(-EBUSY, trimmer->remove_objects(false));
+  ASSERT_EQ(0, trimmer->remove_objects(true));
 }
 
diff --git a/src/test/libcephfs/test.cc b/src/test/libcephfs/test.cc
index 1c203ec..6b088df 100644
--- a/src/test/libcephfs/test.cc
+++ b/src/test/libcephfs/test.cc
@@ -270,17 +270,8 @@ TEST(LibCephFS, DirLs) {
   ASSERT_TRUE(result != NULL);
   ASSERT_STREQ(result->d_name, "..");
 
-  std::vector<std::pair<char *, int> > entries;
-  // check readdir and capture stream order for future tests
-  for(i = 0; i < r; ++i) {
-
-    result = ceph_readdir(cmount, ls_dir);
-    ASSERT_TRUE(result != NULL);
-
-    int size;
-    sscanf(result->d_name, "dirf%d", &size);
-    entries.push_back(std::pair<char*,int>(strdup(result->d_name), size));
-  }
+  for(i = 0; i < r; ++i)
+    ASSERT_TRUE(ceph_readdir(cmount, ls_dir) != NULL);
 
   ASSERT_TRUE(ceph_readdir(cmount, ls_dir) == NULL);
 
@@ -294,16 +285,6 @@ TEST(LibCephFS, DirLs) {
   ASSERT_TRUE(result != NULL);
   ASSERT_STREQ(result->d_name, "..");
 
-  // check telldir
-  for(i = 0; i < r-1; ++i) {
-    int r = ceph_telldir(cmount, ls_dir);
-    ASSERT_GT(r, -1);
-    ceph_seekdir(cmount, ls_dir, r);
-    result = ceph_readdir(cmount, ls_dir);
-    ASSERT_TRUE(result != NULL);
-    ASSERT_STREQ(result->d_name, entries[i].first);
-  }
-
   ceph_rewinddir(cmount, ls_dir);
 
   int t = ceph_telldir(cmount, ls_dir);
@@ -319,23 +300,30 @@ TEST(LibCephFS, DirLs) {
   getdents_entries = (struct dirent *)malloc(r * sizeof(*getdents_entries));
 
   int count = 0;
+  std::set<std::string> found;
   while (count < r) {
     int len = ceph_getdents(cmount, ls_dir, (char *)getdents_entries, r * sizeof(*getdents_entries));
     ASSERT_GT(len, 0);
     ASSERT_TRUE((len % sizeof(*getdents_entries)) == 0);
     int n = len / sizeof(*getdents_entries);
+    int j;
     if (count == 0) {
       ASSERT_STREQ(getdents_entries[0].d_name, ".");
       ASSERT_STREQ(getdents_entries[1].d_name, "..");
+      j = 2;
+      count += n - 2;
+    } else {
+      j = 0;
+      count += n;
     }
-    int j;
-    i = count;
-    for(j = 2; j < n; ++i, ++j) {
-      ASSERT_STREQ(getdents_entries[j].d_name, entries[i].first);
+    for(; j < n; ++i, ++j) {
+      const char *name = getdents_entries[j].d_name;
+      ASSERT_TRUE(found.count(name) == 0);
+      found.insert(name);
     }
-    count += n;
   }
 
+  ASSERT_EQ(count, r);
   free(getdents_entries);
 
   // test readdir_r
@@ -348,10 +336,12 @@ TEST(LibCephFS, DirLs) {
   ASSERT_TRUE(result != NULL);
   ASSERT_STREQ(result->d_name, "..");
 
+  found.clear();
   for(i = 0; i < r; ++i) {
     struct dirent rdent;
     ASSERT_EQ(ceph_readdir_r(cmount, ls_dir, &rdent), 1);
-    ASSERT_STREQ(rdent.d_name, entries[i].first);
+    ASSERT_TRUE(found.count(rdent.d_name) ==  0);
+    found.insert(rdent.d_name);
   }
 
   // test readdirplus
@@ -364,13 +354,18 @@ TEST(LibCephFS, DirLs) {
   ASSERT_TRUE(result != NULL);
   ASSERT_STREQ(result->d_name, "..");
 
+  found.clear();
   for(i = 0; i < r; ++i) {
     struct dirent rdent;
     struct stat st;
     int stmask;
     ASSERT_EQ(ceph_readdirplus_r(cmount, ls_dir, &rdent, &st, &stmask), 1);
-    ASSERT_STREQ(rdent.d_name, entries[i].first);
-    ASSERT_EQ(st.st_size, entries[i].second);
+    const char *name = rdent.d_name;
+    ASSERT_TRUE(found.count(name) == 0);
+    found.insert(name);
+    int size;
+    sscanf(name, "dirf%d", &size);
+    ASSERT_EQ(st.st_size, size);
     ASSERT_EQ(st.st_ino, rdent.d_ino);
     //ASSERT_EQ(st.st_mode, (mode_t)0666);
   }
diff --git a/src/test/librados/c_read_operations.cc b/src/test/librados/c_read_operations.cc
index d80fcdf..7fab5cf 100644
--- a/src/test/librados/c_read_operations.cc
+++ b/src/test/librados/c_read_operations.cc
@@ -101,10 +101,11 @@ protected:
     while (i < len) {
       ASSERT_EQ(0, rados_getxattrs_next(iter, (const char**) &key,
 					(const char**) &val, &val_len));
-      if (key == NULL || (val_len == 0 && val == NULL))
+      if (key == NULL)
 	break;
       EXPECT_EQ(std::string(keys[i]), std::string(key));
-      EXPECT_EQ(0, memcmp(vals[i], val, val_len));
+      if (val != NULL)
+        EXPECT_EQ(0, memcmp(vals[i], val, val_len));
       EXPECT_EQ(lens[i], val_len);
       ++i;
     }
diff --git a/src/test/librados/cls.cc b/src/test/librados/cls.cc
index 1f61664..10744b2 100644
--- a/src/test/librados/cls.cc
+++ b/src/test/librados/cls.cc
@@ -9,7 +9,6 @@
 #include <string>
 
 using namespace librados;
-using ceph::buffer;
 using std::map;
 using std::ostringstream;
 using std::string;
diff --git a/src/test/librados/cmd.cc b/src/test/librados/cmd.cc
index 4f327a0..d5e9d71 100644
--- a/src/test/librados/cmd.cc
+++ b/src/test/librados/cmd.cc
@@ -18,7 +18,6 @@
 #include <string>
 
 using namespace librados;
-using ceph::buffer;
 using std::map;
 using std::ostringstream;
 using std::string;
diff --git a/src/test/librados/io.cc b/src/test/librados/io.cc
index cb37c45..e4af869 100644
--- a/src/test/librados/io.cc
+++ b/src/test/librados/io.cc
@@ -559,11 +559,11 @@ TEST_F(LibRadosIo, XattrIter) {
       break;
     }
     ASSERT_LT(num_seen, 2);
-    if ((strcmp(name, attr1) == 0) && (memcmp(val, attr1_buf, len) == 0)) {
+    if ((strcmp(name, attr1) == 0) && (val != NULL) && (memcmp(val, attr1_buf, len) == 0)) {
       num_seen++;
       continue;
     }
-    else if ((strcmp(name, attr2) == 0) && (memcmp(val, attr2_buf, len) == 0)) {
+    else if ((strcmp(name, attr2) == 0) && (val != NULL) && (memcmp(val, attr2_buf, len) == 0)) {
       num_seen++;
       continue;
     }
@@ -1109,11 +1109,11 @@ TEST_F(LibRadosIoEC, XattrIter) {
       break;
     }
     ASSERT_LT(num_seen, 2);
-    if ((strcmp(name, attr1) == 0) && (memcmp(val, attr1_buf, len) == 0)) {
+    if ((strcmp(name, attr1) == 0) && (val != NULL) && (memcmp(val, attr1_buf, len) == 0)) {
       num_seen++;
       continue;
     }
-    else if ((strcmp(name, attr2) == 0) && (memcmp(val, attr2_buf, len) == 0)) {
+    else if ((strcmp(name, attr2) == 0) && (val != NULL) && (memcmp(val, attr2_buf, len) == 0)) {
       num_seen++;
       continue;
     }
diff --git a/src/test/librados/misc.cc b/src/test/librados/misc.cc
index 4adaa6b..0fb67e0 100644
--- a/src/test/librados/misc.cc
+++ b/src/test/librados/misc.cc
@@ -21,7 +21,6 @@
 #include <string>
 
 using namespace librados;
-using ceph::buffer;
 using std::map;
 using std::ostringstream;
 using std::string;
diff --git a/src/test/librados/test.cc b/src/test/librados/test.cc
index 48fbf96..b82936a 100644
--- a/src/test/librados/test.cc
+++ b/src/test/librados/test.cc
@@ -45,12 +45,41 @@ std::string create_one_pool(const std::string &pool_name, rados_t *cluster)
   return "";
 }
 
-int destroy_ec_profile(rados_t *cluster)
+int destroy_ec_profile(rados_t *cluster, std::ostream &oss)  // Removes erasure-code profile "testprofile" via a mon command; returns the command result and, when it is non-zero, appends one newline-terminated message to oss.
 {
-    char *cmd[2];
-    cmd[0] = (char *)"{\"prefix\": \"osd erasure-code-profile rm\", \"name\": \"testprofile\"}";
-    cmd[1] = NULL;
-    return rados_mon_command(*cluster, (const char **)cmd, 1, "", 0, NULL, 0, NULL, 0);
+  char *cmd[2];
+  cmd[0] = (char *)"{\"prefix\": \"osd erasure-code-profile rm\", \"name\": \"testprofile\"}";
+  cmd[1] = NULL;
+  int ret = rados_mon_command(*cluster, (const char **)cmd, 1, "", 0, NULL, 0, NULL, 0);  // only cmd[0] is sent (count of 1); no input buffer, no output buffers
+  if (ret)
+    oss << "rados_mon_command: erasure-code-profile rm testprofile failed with error " << ret << std::endl;  // endl keeps this apart from a message the caller may append to the same stream, and matches destroy_ec_profile_pp
+  return ret;
+}
+
+int destroy_ruleset(rados_t *cluster,  // Removes the crush rule named `ruleset` via a mon command; returns the command result and, when it is non-zero, appends one newline-terminated message to oss.
+                    std::string ruleset,
+                    std::ostream &oss)
+{
+  char *cmd[2];
+  std::string tmp = ("{\"prefix\": \"osd crush rule rm\", \"name\":\"" +
+                     ruleset + "\"}");  // ruleset is spliced into the JSON verbatim, so it must not contain quotes
+  cmd[0] = (char*)tmp.c_str();  // tmp stays alive until after the call below, so the pointer remains valid
+  cmd[1] = NULL;
+  int ret = rados_mon_command(*cluster, (const char **)cmd, 1, "", 0, NULL, 0, NULL, 0);
+  if (ret)
+    oss << "rados_mon_command: osd crush rule rm " + ruleset + " failed with error " << ret << std::endl;  // newline-terminated like destroy_ruleset_pp
+  return ret;
+}
+
+int destroy_ec_profile_and_ruleset(rados_t *cluster,  // Runs destroy_ec_profile and then destroy_ruleset; returns the first non-zero result, or 0 when both succeed.
+                                   std::string ruleset,
+                                   std::ostream &oss)
+{
+  int ret;
+  ret = destroy_ec_profile(cluster, oss);
+  if (ret)
+    return ret;  // profile removal failed: the ruleset is left untouched
+  return destroy_ruleset(cluster, ruleset, oss);
 }
 
 std::string create_one_ec_pool(const std::string &pool_name, rados_t *cluster)
@@ -59,11 +88,10 @@ std::string create_one_ec_pool(const std::string &pool_name, rados_t *cluster)
   if (err.length())
     return err;
 
-  int ret = destroy_ec_profile(cluster);
+  std::ostringstream oss;
+  int ret = destroy_ec_profile_and_ruleset(cluster, pool_name, oss);
   if (ret) {
     rados_shutdown(*cluster);
-    std::ostringstream oss;
-    oss << "rados_mon_command erasure-code-profile rm testprofile failed with error " << ret;
     return oss.str();
   }
     
@@ -74,8 +102,6 @@ std::string create_one_ec_pool(const std::string &pool_name, rados_t *cluster)
   cmd[0] = (char *)profile_create.c_str();
   ret = rados_mon_command(*cluster, (const char **)cmd, 1, "", 0, NULL, 0, NULL, 0);
   if (ret) {
-    std::ostringstream oss;
-
     rados_shutdown(*cluster);
     oss << "rados_mon_command erasure-code-profile set name:testprofile failed with error " << ret;
     return oss.str();
@@ -86,12 +112,7 @@ std::string create_one_ec_pool(const std::string &pool_name, rados_t *cluster)
   cmd[0] = (char *)cmdstr.c_str();
   ret = rados_mon_command(*cluster, (const char **)cmd, 1, "", 0, NULL, 0, NULL, 0);
   if (ret) {
-    std::ostringstream oss;
-
-    int ret2 = destroy_ec_profile(cluster);
-    if (ret2)
-      oss << "rados_mon_command osd erasure-code-profile rm name:testprofile failed with error " << ret2 << std::endl;
-
+    destroy_ec_profile(cluster, oss);
     rados_shutdown(*cluster);
     oss << "rados_mon_command osd pool create failed with error " << ret;
     return oss.str();
@@ -116,11 +137,37 @@ std::string create_one_pool_pp(const std::string &pool_name, Rados &cluster)
   return "";
 }
 
-int destroy_ec_profile_pp(Rados &cluster)
+int destroy_ruleset_pp(Rados &cluster,  // C++-API variant: removes the crush rule named `ruleset` via Rados::mon_command; returns its result and writes a message to oss when that result is non-zero.
+                       std::string ruleset,
+                       std::ostream &oss)
+{
+  bufferlist inbl;  // empty input payload for the command
+  int ret = cluster.mon_command("{\"prefix\": \"osd crush rule rm\", \"name\":\"" +
+                                ruleset + "\"}", inbl, NULL, NULL);  // NULL, NULL: no output buffers are supplied
+  if (ret)
+    oss << "mon_command: osd crush rule rm " + ruleset + " failed with error " << ret << std::endl;
+  return ret;
+}
+
+int destroy_ec_profile_pp(Rados &cluster, std::ostream &oss)  // C++-API variant: removes erasure-code profile "testprofile"; returns the mon_command result and writes a message to oss when it is non-zero.
 {
   bufferlist inbl;
-  return cluster.mon_command("{\"prefix\": \"osd erasure-code-profile rm\", \"name\": \"testprofile\"}",
-                             inbl, NULL, NULL);
+  int ret = cluster.mon_command("{\"prefix\": \"osd erasure-code-profile rm\", \"name\": \"testprofile\"}",
+                                inbl, NULL, NULL);  // inbl is empty; no output buffers are supplied
+  if (ret)
+    oss << "mon_command: osd erasure-code-profile rm testprofile failed with error " << ret << std::endl;  // newline-terminated so a caller can append further messages
+  return ret;
+}
+
+int destroy_ec_profile_and_ruleset_pp(Rados &cluster,  // Runs destroy_ec_profile_pp and then destroy_ruleset_pp; returns the first non-zero result, or 0 when both succeed.
+                                      std::string ruleset,
+                                      std::ostream &oss)
+{
+  int ret;
+  ret = destroy_ec_profile_pp(cluster, oss);
+  if (ret)
+    return ret;  // profile removal failed: the ruleset is left untouched
+  return destroy_ruleset_pp(cluster, ruleset, oss);
 }
 
 std::string create_one_ec_pool_pp(const std::string &pool_name, Rados &cluster)
@@ -129,11 +176,10 @@ std::string create_one_ec_pool_pp(const std::string &pool_name, Rados &cluster)
   if (err.length())
     return err;
 
-  int ret = destroy_ec_profile_pp(cluster);
+  std::ostringstream oss;
+  int ret = destroy_ec_profile_and_ruleset_pp(cluster, pool_name, oss);
   if (ret) {
     cluster.shutdown();
-    std::ostringstream oss;
-    oss << "rados_mon_command erasure-code-profile rm testprofile failed with error " << ret;
     return oss.str();
   }
 
@@ -143,7 +189,6 @@ std::string create_one_ec_pool_pp(const std::string &pool_name, Rados &cluster)
     inbl, NULL, NULL);
   if (ret) {
     cluster.shutdown();
-    std::ostringstream oss;
     oss << "mon_command erasure-code-profile set name:testprofile failed with error " << ret;
     return oss.str();
   }
@@ -152,12 +197,8 @@ std::string create_one_ec_pool_pp(const std::string &pool_name, Rados &cluster)
     "{\"prefix\": \"osd pool create\", \"pool\": \"" + pool_name + "\", \"pool_type\":\"erasure\", \"pg_num\":8, \"pgp_num\":8, \"erasure_code_profile\":\"testprofile\"}",
     inbl, NULL, NULL);
   if (ret) {
-    std::ostringstream oss;
     bufferlist inbl;
-    int ret2 = destroy_ec_profile_pp(cluster);
-    if (ret2)
-      oss << "mon_command osd erasure-code-profile rm name:testprofile failed with error " << ret2 << std::endl;
-
+    destroy_ec_profile_pp(cluster, oss);
     cluster.shutdown();
     oss << "mon_command osd pool create pool:" << pool_name << " pool_type:erasure failed with error " << ret;
     return oss.str();
@@ -241,14 +282,19 @@ int destroy_one_pool(const std::string &pool_name, rados_t *cluster)
 int destroy_one_ec_pool(const std::string &pool_name, rados_t *cluster)
 {
   int ret = rados_pool_delete(*cluster, pool_name.c_str());
-  if (ret == 0) {
-    int ret2 = destroy_ec_profile(cluster);
-    if (ret2) {
-      rados_shutdown(*cluster);
-      return ret2;
-    }
-    rados_wait_for_latest_osdmap(*cluster);
+  if (ret) {  // pool deletion failed: shut the cluster handle down and report the error
+    rados_shutdown(*cluster);
+    return ret;
+  }
+
+  std::ostringstream oss;  // receives the helper's failure text; never read here, only the return code is propagated
+  ret = destroy_ec_profile_and_ruleset(cluster, pool_name, oss);  // the pool name is passed as the ruleset name to remove
+  if (ret) {
+    rados_shutdown(*cluster);
+    return ret;
   }
+
+  rados_wait_for_latest_osdmap(*cluster);  // reached only when both the pool and the profile/ruleset were removed
   rados_shutdown(*cluster);
   return ret;
 }
@@ -267,15 +313,19 @@ int destroy_one_pool_pp(const std::string &pool_name, Rados &cluster)
 int destroy_one_ec_pool_pp(const std::string &pool_name, Rados &cluster)
 {
   int ret = cluster.pool_delete(pool_name.c_str());
-  bufferlist inbl;
-  if (ret == 0) {
-    int ret2 = destroy_ec_profile_pp(cluster);
-    if (ret2) {
-      cluster.shutdown();
-      return ret2;
-    }
-    cluster.wait_for_latest_osdmap();
+  if (ret) {  // pool deletion failed: shut the cluster down and report the error
+    cluster.shutdown();
+    return ret;
   }
+
+  std::ostringstream oss;  // receives the helper's failure text; never read here, only the return code is propagated
+  ret = destroy_ec_profile_and_ruleset_pp(cluster, pool_name, oss);  // the pool name is passed as the ruleset name to remove
+  if (ret) {
+    cluster.shutdown();
+    return ret;
+  }
+
+  cluster.wait_for_latest_osdmap();  // reached only when both the pool and the profile/ruleset were removed
   cluster.shutdown();
   return ret;
 }
diff --git a/src/test/librados/tier.cc b/src/test/librados/tier.cc
index 12ccfc2..6517f82 100644
--- a/src/test/librados/tier.cc
+++ b/src/test/librados/tier.cc
@@ -26,7 +26,6 @@
 #include <string>
 
 using namespace librados;
-using ceph::buffer;
 using std::map;
 using std::ostringstream;
 using std::string;
diff --git a/src/test/librados_test_stub/LibradosTestStub.cc b/src/test/librados_test_stub/LibradosTestStub.cc
index 75f7fde..99a8f44 100644
--- a/src/test/librados_test_stub/LibradosTestStub.cc
+++ b/src/test/librados_test_stub/LibradosTestStub.cc
@@ -661,6 +661,19 @@ void ObjectReadOperation::list_snaps(snap_set_t *out_snaps, int *prval) {
   o->ops.push_back(op);
 }
 
+void ObjectReadOperation::list_watchers(std::list<obj_watch_t> *out_watchers,  // Test-stub implementation: queues a deferred TestIoCtxImpl::list_watchers call on this operation instead of talking to a cluster.
+                                        int *prval) {
+  TestObjectOperationImpl *o = reinterpret_cast<TestObjectOperationImpl*>(impl);
+
+  ObjectOperationTestImpl op = boost::bind(&TestIoCtxImpl::list_watchers, _1,
+                                           _2, out_watchers);  // _1/_2 are filled in when the queued op is invoked; out_watchers is captured now
+  if (prval != NULL) {
+    op = boost::bind(save_operation_result,
+                     boost::bind(op, _1, _2, _3, _4), prval);  // wraps the op so its result is handed to save_operation_result along with prval (presumably stored there -- confirm)
+  }
+  o->ops.push_back(op);  // appended to the operation's op list; nothing runs here
+}
+
 void ObjectReadOperation::read(size_t off, uint64_t len, bufferlist *pbl,
                                int *prval) {
   TestObjectOperationImpl *o = reinterpret_cast<TestObjectOperationImpl*>(impl);
@@ -698,6 +711,19 @@ void ObjectReadOperation::sparse_read(uint64_t off, uint64_t len,
   o->ops.push_back(op);
 }
 
+void ObjectReadOperation::stat(uint64_t *psize, time_t *pmtime, int *prval) {  // Test-stub implementation: queues a deferred TestIoCtxImpl::stat call that receives psize and pmtime.
+  TestObjectOperationImpl *o = reinterpret_cast<TestObjectOperationImpl*>(impl);
+
+  ObjectOperationTestImpl op = boost::bind(&TestIoCtxImpl::stat, _1, _2,
+                                           psize, pmtime);  // _1/_2 are filled in when the queued op is invoked; the output pointers are captured now
+
+  if (prval != NULL) {
+    op = boost::bind(save_operation_result,
+                     boost::bind(op, _1, _2, _3, _4), prval);  // wraps the op so its result is handed to save_operation_result along with prval (presumably stored there -- confirm)
+  }
+  o->ops.push_back(op);  // appended to the operation's op list; nothing runs here
+}
+
 void ObjectWriteOperation::append(const bufferlist &bl) {
   TestObjectOperationImpl *o = reinterpret_cast<TestObjectOperationImpl*>(impl);
   o->ops.push_back(boost::bind(&TestIoCtxImpl::append, _1, _2, bl, _4));
@@ -731,6 +757,13 @@ void ObjectWriteOperation::set_alloc_hint(uint64_t expected_object_size,
 			       expected_object_size, expected_write_size));
 }
 
+
+void ObjectWriteOperation::tmap_update(const bufferlist& cmdbl) {  // Test-stub implementation: queues a deferred TestIoCtxImpl::tmap_update call carrying a copy of cmdbl.
+  TestObjectOperationImpl *o = reinterpret_cast<TestObjectOperationImpl*>(impl);
+  o->ops.push_back(boost::bind(&TestIoCtxImpl::tmap_update, _1, _2,
+                               cmdbl));  // boost::bind stores cmdbl by value, so the caller's buffer need not outlive this call
+}
+
 void ObjectWriteOperation::truncate(uint64_t off) {
   TestObjectOperationImpl *o = reinterpret_cast<TestObjectOperationImpl*>(impl);
   o->ops.push_back(boost::bind(&TestIoCtxImpl::truncate, _1, _2, off, _4));
@@ -789,6 +822,11 @@ config_t Rados::cct() {
   return reinterpret_cast<config_t>(impl->cct());
 }
 
+int Rados::cluster_fsid(std::string* fsid) {  // Test-stub implementation: always reports the same hard-coded fsid and returns 0.
+  *fsid = "00000000-1111-2222-3333-444444444444";  // fsid is dereferenced unconditionally, so callers must pass a non-null pointer
+  return 0;
+}
+
 int Rados::conf_set(const char *option, const char *value) {
   return rados_conf_set(reinterpret_cast<rados_t>(client), option, value);
 }
diff --git a/src/test/librados_test_stub/MockTestMemIoCtxImpl.h b/src/test/librados_test_stub/MockTestMemIoCtxImpl.h
index 198db6c..b0a481b 100644
--- a/src/test/librados_test_stub/MockTestMemIoCtxImpl.h
+++ b/src/test/librados_test_stub/MockTestMemIoCtxImpl.h
@@ -48,6 +48,13 @@ public:
                                   snapc);
   }
 
+  MOCK_METHOD2(list_watchers, int(const std::string& o,  // gmock hook so tests can set expectations on list_watchers
+                                  std::list<obj_watch_t> *out_watchers));
+  int do_list_watchers(const std::string& o,  // pass-through to the real in-memory implementation; installed as the default action in default_to_dispatch()
+                       std::list<obj_watch_t> *out_watchers) {
+    return TestMemIoCtxImpl::list_watchers(o, out_watchers);
+  }
+
   MOCK_METHOD4(read, int(const std::string& oid,
                          size_t len,
                          uint64_t off,
@@ -72,6 +79,14 @@ public:
     return TestMemIoCtxImpl::selfmanaged_snap_remove(snap_id);
   }
 
+  MOCK_METHOD3(truncate, int(const std::string& oid,  // gmock hook so tests can set expectations on truncate
+                             uint64_t size,
+                             const SnapContext &snapc));
+  int do_truncate(const std::string& oid, uint64_t size,  // pass-through to the real in-memory implementation; installed as the default action in default_to_dispatch()
+                  const SnapContext &snapc) {
+    return TestMemIoCtxImpl::truncate(oid, size, snapc);
+  }
+
   MOCK_METHOD3(write_full, int(const std::string& oid,
                                bufferlist& bl,
                                const SnapContext &snapc));
@@ -84,10 +99,12 @@ public:
     using namespace ::testing;
 
     ON_CALL(*this, exec(_, _, _, _, _, _, _)).WillByDefault(Invoke(this, &MockTestMemIoCtxImpl::do_exec));
+    ON_CALL(*this, list_watchers(_, _)).WillByDefault(Invoke(this, &MockTestMemIoCtxImpl::do_list_watchers));
     ON_CALL(*this, read(_, _, _, _)).WillByDefault(Invoke(this, &MockTestMemIoCtxImpl::do_read));
     ON_CALL(*this, remove(_)).WillByDefault(Invoke(this, &MockTestMemIoCtxImpl::do_remove));
     ON_CALL(*this, selfmanaged_snap_create(_)).WillByDefault(Invoke(this, &MockTestMemIoCtxImpl::do_selfmanaged_snap_create));
     ON_CALL(*this, selfmanaged_snap_remove(_)).WillByDefault(Invoke(this, &MockTestMemIoCtxImpl::do_selfmanaged_snap_remove));
+    ON_CALL(*this, truncate(_,_,_)).WillByDefault(Invoke(this, &MockTestMemIoCtxImpl::do_truncate));
     ON_CALL(*this, write_full(_, _, _)).WillByDefault(Invoke(this, &MockTestMemIoCtxImpl::do_write_full));
   }
 
diff --git a/src/test/librados_test_stub/MockTestMemRadosClient.h b/src/test/librados_test_stub/MockTestMemRadosClient.h
index 1d0b994..9afde33 100644
--- a/src/test/librados_test_stub/MockTestMemRadosClient.h
+++ b/src/test/librados_test_stub/MockTestMemRadosClient.h
@@ -24,10 +24,18 @@ public:
       this, this, pool_id, pool_name, get_pool(pool_name));
   }
 
+  MOCK_METHOD2(blacklist_add, int(const std::string& client_address,  // gmock hook so tests can set expectations on blacklist_add
+                                  uint32_t expire_seconds));
+  int do_blacklist_add(const std::string& client_address,  // pass-through to TestMemRadosClient::blacklist_add; installed as the default action in default_to_dispatch()
+                       uint32_t expire_seconds) {
+    return TestMemRadosClient::blacklist_add(client_address, expire_seconds);
+  }
+
   void default_to_dispatch() {
     using namespace ::testing;
 
     ON_CALL(*this, create_ioctx(_, _)).WillByDefault(Invoke(this, &MockTestMemRadosClient::do_create_ioctx));
+    ON_CALL(*this, blacklist_add(_, _)).WillByDefault(Invoke(this, &MockTestMemRadosClient::do_blacklist_add));  // without an explicit expectation, blacklist_add falls through to the real TestMemRadosClient behaviour
   }
 };
 
diff --git a/src/test/librados_test_stub/TestRadosClient.h b/src/test/librados_test_stub/TestRadosClient.h
index d3c2034..b1aa75d 100644
--- a/src/test/librados_test_stub/TestRadosClient.h
+++ b/src/test/librados_test_stub/TestRadosClient.h
@@ -9,7 +9,7 @@
 #include "common/Cond.h"
 #include "common/Mutex.h"
 #include "include/atomic.h"
-#include "include/buffer.h"
+#include "include/buffer_fwd.h"
 #include "test/librados_test_stub/TestWatchNotify.h"
 #include <boost/function.hpp>
 #include <boost/functional/hash.hpp>
diff --git a/src/test/libradosstriper/io.cc b/src/test/libradosstriper/io.cc
index 9e56fae..2f931e5 100644
--- a/src/test/libradosstriper/io.cc
+++ b/src/test/libradosstriper/io.cc
@@ -357,11 +357,11 @@ TEST_F(StriperTest, XattrIter) {
       break;
     }
     ASSERT_LT(num_seen, 2) << "Extra attribute : " << name;
-    if ((strcmp(name, "attr1") == 0) && (memcmp(val, attr1_buf, len) == 0)) {
+    if ((strcmp(name, "attr1") == 0) && (val != NULL) && (memcmp(val, attr1_buf, len) == 0)) {
       num_seen++;
       continue;
     }
-    else if ((strcmp(name, "attr2") == 0) && (memcmp(val, attr2_buf, len) == 0)) {
+    else if ((strcmp(name, "attr2") == 0) && (val != NULL) && (memcmp(val, attr2_buf, len) == 0)) {
       num_seen++;
       continue;
     }
diff --git a/src/test/librbd/exclusive_lock/test_mock_AcquireRequest.cc b/src/test/librbd/exclusive_lock/test_mock_AcquireRequest.cc
new file mode 100644
index 0000000..2732bd8
--- /dev/null
+++ b/src/test/librbd/exclusive_lock/test_mock_AcquireRequest.cc
@@ -0,0 +1,571 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "test/librbd/test_mock_fixture.h"
+#include "test/librbd/test_support.h"
+#include "test/librbd/mock/MockImageCtx.h"
+#include "test/librbd/mock/MockJournal.h"
+#include "test/librbd/mock/MockObjectMap.h"
+#include "test/librados_test_stub/MockTestMemIoCtxImpl.h"
+#include "test/librados_test_stub/MockTestMemRadosClient.h"
+#include "cls/lock/cls_lock_ops.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/exclusive_lock/AcquireRequest.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include <arpa/inet.h>
+#include <list>
+
+// template definitions
+#include "librbd/exclusive_lock/AcquireRequest.cc"
+template class librbd::exclusive_lock::AcquireRequest<librbd::MockImageCtx>;
+
+namespace librbd {
+namespace exclusive_lock {
+
+using ::testing::_;
+using ::testing::DoAll;
+using ::testing::InSequence;
+using ::testing::Return;
+using ::testing::SetArgPointee;
+using ::testing::WithArg;
+
+static const std::string TEST_COOKIE("auto 123");
+
+class TestMockExclusiveLockAcquireRequest : public TestMockFixture {  // Fixture for AcquireRequest<MockImageCtx>: each expect_* helper registers exactly one gmock expectation (WillOnce) for one step of the acquire sequence.
+public:
+  typedef AcquireRequest<MockImageCtx> MockAcquireRequest;
+  typedef ExclusiveLock<MockImageCtx> MockExclusiveLock;
+
+  void expect_test_features(MockImageCtx &mock_image_ctx, uint64_t features,  // one test_features(features) query, answered with `enabled`
+                            bool enabled) {
+    EXPECT_CALL(mock_image_ctx, test_features(features))
+                  .WillOnce(Return(enabled));
+  }
+
+  void expect_lock(MockImageCtx &mock_image_ctx, int r) {  // one "lock"/"lock" exec on the image header object, returning r
+    EXPECT_CALL(get_mock_io_ctx(mock_image_ctx.md_ctx),
+                exec(mock_image_ctx.header_oid, _, "lock", "lock", _, _, _))
+                  .WillOnce(Return(r));
+  }
+
+  void expect_create_object_map(MockImageCtx &mock_image_ctx,  // one create_object_map call, handing back the supplied mock
+                                MockObjectMap *mock_object_map) {
+    EXPECT_CALL(mock_image_ctx, create_object_map(_))
+                  .WillOnce(Return(mock_object_map));
+  }
+
+  void expect_open_object_map(MockImageCtx &mock_image_ctx,  // one open() on the object map; CompleteContext(0, ...) presumably completes the passed context with 0 on the work queue -- confirm in the fixture
+                              MockObjectMap &mock_object_map) {
+    EXPECT_CALL(mock_object_map, open(_))
+                  .WillOnce(CompleteContext(0, mock_image_ctx.image_ctx->op_work_queue));
+  }
+
+  void expect_lock_object_map(MockImageCtx &mock_image_ctx,  // one lock() on the object map, completed with 0
+                              MockObjectMap &mock_object_map) {
+    EXPECT_CALL(mock_object_map, lock(_))
+                  .WillOnce(CompleteContext(0, mock_image_ctx.image_ctx->op_work_queue));
+  }
+
+  void expect_unlock_object_map(MockImageCtx &mock_image_ctx,  // one unlock() on the object map, completed with 0
+                              MockObjectMap &mock_object_map) {
+    EXPECT_CALL(mock_object_map, unlock(_))
+                  .WillOnce(CompleteContext(0, mock_image_ctx.image_ctx->op_work_queue));
+  }
+
+  void expect_create_journal(MockImageCtx &mock_image_ctx,  // one create_journal call, handing back the supplied mock
+                             MockJournal *mock_journal) {
+    EXPECT_CALL(mock_image_ctx, create_journal())
+                  .WillOnce(Return(mock_journal));
+  }
+
+  void expect_open_journal(MockImageCtx &mock_image_ctx,  // one open() on the journal, completed with r
+                           MockJournal &mock_journal, int r) {
+    EXPECT_CALL(mock_journal, open(_))
+                  .WillOnce(CompleteContext(r, mock_image_ctx.image_ctx->op_work_queue));
+  }
+
+  void expect_get_lock_info(MockImageCtx &mock_image_ctx, int r,  // one "lock"/"get_info" exec: hard errors are returned as-is, otherwise an encoded cls_lock_get_info_reply is delivered with return code 0
+                            const entity_name_t &locker_entity,
+                            const std::string &locker_address,
+                            const std::string &locker_cookie,
+                            const std::string &lock_tag,
+                            ClsLockType lock_type) {
+    auto &expect = EXPECT_CALL(get_mock_io_ctx(mock_image_ctx.md_ctx),
+                               exec(mock_image_ctx.header_oid, _, "lock",
+                               "get_info", _, _, _));
+    if (r < 0 && r != -ENOENT) {  // any error other than -ENOENT is surfaced directly by the mocked exec
+      expect.WillOnce(Return(r));
+    } else {
+      entity_name_t entity(locker_entity);
+      entity_addr_t entity_addr;
+      entity_addr.addr.ss_family = AF_INET;
+      inet_pton(AF_INET, locker_address.c_str(), &entity_addr.addr4.sin_addr);  // return value is ignored, so a malformed locker_address is not reported here
+
+      cls_lock_get_info_reply reply;
+      if (r != -ENOENT) {  // -ENOENT is modelled as a successful call whose reply is left default-constructed
+        reply.lockers = decltype(reply.lockers){
+          {rados::cls::lock::locker_id_t(entity, locker_cookie),
+           rados::cls::lock::locker_info_t(utime_t(), entity_addr, "")}};
+        reply.tag = lock_tag;
+        reply.lock_type = lock_type;
+      }
+
+      bufferlist bl;
+      ::encode(reply, bl);
+
+      std::string str(bl.c_str(), bl.length());
+      expect.WillOnce(DoAll(WithArg<5>(CopyInBufferlist(str)), Return(0)));  // encoded reply is copied into exec argument index 5 (presumably the output bufferlist -- confirm)
+    }
+  }
+
+  void expect_list_watchers(MockImageCtx &mock_image_ctx, int r,  // one list_watchers on the header object: returns r when negative, otherwise yields a single watcher built from address and watch_handle
+                            const std::string &address, uint64_t watch_handle) {
+    auto &expect = EXPECT_CALL(get_mock_io_ctx(mock_image_ctx.md_ctx),
+                               list_watchers(mock_image_ctx.header_oid, _));
+    if (r < 0) {
+      expect.WillOnce(Return(r));
+    } else {
+      obj_watch_t watcher;
+      strcpy(watcher.addr, (address + ":0/0").c_str());  // NOTE(review): unbounded strcpy; safe only while test addresses fit watcher.addr -- consider a bounded copy
+      watcher.cookie = watch_handle;
+
+      std::list<obj_watch_t> watchers;
+      watchers.push_back(watcher);
+
+      expect.WillOnce(DoAll(SetArgPointee<1>(watchers), Return(0)));  // the one-element list is written through the out_watchers pointer
+    }
+  }
+
+  void expect_blacklist_add(MockImageCtx &mock_image_ctx, int r) {  // one blacklist_add on the mock rados client with any arguments, returning r; mock_image_ctx is unused
+    EXPECT_CALL(get_mock_rados_client(), blacklist_add(_, _))
+                  .WillOnce(Return(r));
+  }
+
+  void expect_break_lock(MockImageCtx &mock_image_ctx, int r) {  // one "lock"/"break_lock" exec on the image header object, returning r
+    EXPECT_CALL(get_mock_io_ctx(mock_image_ctx.md_ctx),
+                exec(mock_image_ctx.header_oid, _, "lock", "break_lock", _, _, _))
+                  .WillOnce(Return(r));
+  }
+};
+
+TEST_F(TestMockExclusiveLockAcquireRequest, Success) {  // Happy path with object map and journaling both enabled: both completion contexts must finish with 0.
+  REQUIRE_FEATURE(RBD_FEATURE_EXCLUSIVE_LOCK);
+
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+  MockImageCtx mock_image_ctx(*ictx);
+  expect_op_work_queue(mock_image_ctx);
+
+  InSequence seq;  // every expectation registered below must be satisfied in exactly this order
+  expect_lock(mock_image_ctx, 0);
+
+  MockObjectMap mock_object_map;
+  expect_test_features(mock_image_ctx, RBD_FEATURE_OBJECT_MAP, true);
+  expect_create_object_map(mock_image_ctx, &mock_object_map);
+  expect_open_object_map(mock_image_ctx, mock_object_map);
+  expect_lock_object_map(mock_image_ctx, mock_object_map);
+
+  MockJournal mock_journal;
+  expect_test_features(mock_image_ctx, RBD_FEATURE_JOURNALING, true);
+  expect_create_journal(mock_image_ctx, &mock_journal);
+  expect_open_journal(mock_image_ctx, mock_journal, 0);
+
+  C_SaferCond acquire_ctx;
+  C_SaferCond ctx;
+  MockAcquireRequest *req = MockAcquireRequest::create(mock_image_ctx,
+                                                       TEST_COOKIE,
+                                                       &acquire_ctx, &ctx);
+  req->send();  // req is never deleted here; presumably the request frees itself on completion -- confirm
+  ASSERT_EQ(0, acquire_ctx.wait());
+  ASSERT_EQ(0, ctx.wait());
+}
+
+TEST_F(TestMockExclusiveLockAcquireRequest, SuccessJournalDisabled) {  // Happy path with journaling reported as disabled: no journal is created or opened, and both contexts finish with 0.
+  REQUIRE_FEATURE(RBD_FEATURE_EXCLUSIVE_LOCK);
+
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+  MockImageCtx mock_image_ctx(*ictx);
+  expect_op_work_queue(mock_image_ctx);
+
+  InSequence seq;  // every expectation registered below must be satisfied in exactly this order
+  expect_lock(mock_image_ctx, 0);
+
+  MockObjectMap mock_object_map;
+  expect_test_features(mock_image_ctx, RBD_FEATURE_OBJECT_MAP, true);
+  expect_create_object_map(mock_image_ctx, &mock_object_map);
+  expect_open_object_map(mock_image_ctx, mock_object_map);
+  expect_lock_object_map(mock_image_ctx, mock_object_map);
+
+  expect_test_features(mock_image_ctx, RBD_FEATURE_JOURNALING, false);  // feature check says "off", so no journal expectations follow
+
+  C_SaferCond acquire_ctx;
+  C_SaferCond ctx;
+  MockAcquireRequest *req = MockAcquireRequest::create(mock_image_ctx,
+                                                       TEST_COOKIE,
+                                                       &acquire_ctx, &ctx);
+  req->send();
+  ASSERT_EQ(0, acquire_ctx.wait());
+  ASSERT_EQ(0, ctx.wait());
+}
+
+TEST_F(TestMockExclusiveLockAcquireRequest, SuccessObjectMapDisabled) {  // Happy path with the object map reported as disabled: only the journal is created and opened, and both contexts finish with 0.
+  REQUIRE_FEATURE(RBD_FEATURE_EXCLUSIVE_LOCK);
+
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+  MockImageCtx mock_image_ctx(*ictx);
+  expect_op_work_queue(mock_image_ctx);
+
+  InSequence seq;  // every expectation registered below must be satisfied in exactly this order
+  expect_lock(mock_image_ctx, 0);
+
+  expect_test_features(mock_image_ctx, RBD_FEATURE_OBJECT_MAP, false);  // feature check says "off", so no object-map expectations follow
+
+  MockJournal mock_journal;
+  expect_test_features(mock_image_ctx, RBD_FEATURE_JOURNALING, true);
+  expect_create_journal(mock_image_ctx, &mock_journal);
+  expect_open_journal(mock_image_ctx, mock_journal, 0);
+
+  C_SaferCond acquire_ctx;
+  C_SaferCond ctx;
+  MockAcquireRequest *req = MockAcquireRequest::create(mock_image_ctx,
+                                                       TEST_COOKIE,
+                                                       &acquire_ctx, &ctx);
+  req->send();
+  ASSERT_EQ(0, acquire_ctx.wait());
+  ASSERT_EQ(0, ctx.wait());
+}
+
+TEST_F(TestMockExclusiveLockAcquireRequest, JournalError) {  // Journal open fails with -EINVAL after the object map was locked: the object map must be unlocked and ctx must finish with -EINVAL.
+  REQUIRE_FEATURE(RBD_FEATURE_EXCLUSIVE_LOCK);
+
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+  MockImageCtx mock_image_ctx(*ictx);
+  expect_op_work_queue(mock_image_ctx);
+
+  InSequence seq;  // every expectation registered below must be satisfied in exactly this order
+  expect_lock(mock_image_ctx, 0);
+
+  MockObjectMap *mock_object_map = new MockObjectMap();  // NOTE(review): heap-allocated, unlike the stack mocks in the success tests; presumably the request deletes it on the failure path -- confirm
+  expect_test_features(mock_image_ctx, RBD_FEATURE_OBJECT_MAP, true);
+  expect_create_object_map(mock_image_ctx, mock_object_map);
+  expect_open_object_map(mock_image_ctx, *mock_object_map);
+  expect_lock_object_map(mock_image_ctx, *mock_object_map);
+
+  MockJournal *mock_journal = new MockJournal();  // NOTE(review): same ownership assumption as the object map above -- confirm
+  expect_test_features(mock_image_ctx, RBD_FEATURE_JOURNALING, true);
+  expect_create_journal(mock_image_ctx, mock_journal);
+  expect_open_journal(mock_image_ctx, *mock_journal, -EINVAL);  // the injected failure
+  expect_unlock_object_map(mock_image_ctx, *mock_object_map);  // rollback step expected after the journal error
+
+  C_SaferCond acquire_ctx;
+  C_SaferCond ctx;
+  MockAcquireRequest *req = MockAcquireRequest::create(mock_image_ctx,
+                                                       TEST_COOKIE,
+                                                       &acquire_ctx, &ctx);
+  req->send();
+  ASSERT_EQ(-EINVAL, ctx.wait());  // only ctx is waited on; acquire_ctx is not checked in this test
+}
+
+TEST_F(TestMockExclusiveLockAcquireRequest, LockBusy) {
+  REQUIRE_FEATURE(RBD_FEATURE_EXCLUSIVE_LOCK);
+  // Order: lock(-EBUSY), lock info, watchers, blacklist, break, lock(-ENOENT).
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+  MockImageCtx mock_image_ctx(*ictx);
+  expect_op_work_queue(mock_image_ctx);
+
+  InSequence seq;
+  expect_lock(mock_image_ctx, -EBUSY);
+  expect_get_lock_info(mock_image_ctx, 0, entity_name_t::CLIENT(1), "1.2.3.4",
+                       "auto 123", MockExclusiveLock::WATCHER_LOCK_TAG,
+                       LOCK_EXCLUSIVE);
+  expect_list_watchers(mock_image_ctx, 0, "dead client", 123);
+  expect_blacklist_add(mock_image_ctx, 0);
+  expect_break_lock(mock_image_ctx, 0);
+  expect_lock(mock_image_ctx, -ENOENT);
+
+  C_SaferCond ctx;
+  MockAcquireRequest *req = MockAcquireRequest::create(mock_image_ctx,
+                                                       TEST_COOKIE,
+                                                       nullptr, &ctx);
+  req->send();
+  ASSERT_EQ(-ENOENT, ctx.wait());  // value given to the second expect_lock
+}
+
+TEST_F(TestMockExclusiveLockAcquireRequest, GetLockInfoError) {
+  REQUIRE_FEATURE(RBD_FEATURE_EXCLUSIVE_LOCK);
+  // expect_get_lock_info is given -EINVAL; that value is asserted on ctx.
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+  MockImageCtx mock_image_ctx(*ictx);
+  expect_op_work_queue(mock_image_ctx);
+
+  InSequence seq;
+  expect_lock(mock_image_ctx, -EBUSY);
+  expect_get_lock_info(mock_image_ctx, -EINVAL, entity_name_t::CLIENT(1), "",
+                       "", "", LOCK_EXCLUSIVE);
+
+  C_SaferCond ctx;
+  MockAcquireRequest *req = MockAcquireRequest::create(mock_image_ctx,
+                                                       TEST_COOKIE,
+                                                       nullptr, &ctx);
+  req->send();
+  ASSERT_EQ(-EINVAL, ctx.wait());
+}
+
+TEST_F(TestMockExclusiveLockAcquireRequest, GetLockInfoEmpty) {
+  REQUIRE_FEATURE(RBD_FEATURE_EXCLUSIVE_LOCK);
+  // expect_get_lock_info is given -ENOENT; a second lock call is then expected.
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+  MockImageCtx mock_image_ctx(*ictx);
+  expect_op_work_queue(mock_image_ctx);
+
+  InSequence seq;
+  expect_lock(mock_image_ctx, -EBUSY);
+  expect_get_lock_info(mock_image_ctx, -ENOENT, entity_name_t::CLIENT(1), "",
+                       "", "", LOCK_EXCLUSIVE);
+  expect_lock(mock_image_ctx, -EINVAL);
+
+  C_SaferCond ctx;
+  MockAcquireRequest *req = MockAcquireRequest::create(mock_image_ctx,
+                                                       TEST_COOKIE,
+                                                       nullptr, &ctx);
+  req->send();
+  ASSERT_EQ(-EINVAL, ctx.wait());  // value given to the second expect_lock
+}
+
+TEST_F(TestMockExclusiveLockAcquireRequest, GetLockInfoExternalTag) {
+  REQUIRE_FEATURE(RBD_FEATURE_EXCLUSIVE_LOCK);
+  // Lock info uses "external tag" instead of WATCHER_LOCK_TAG; expects -EBUSY.
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+  MockImageCtx mock_image_ctx(*ictx);
+  expect_op_work_queue(mock_image_ctx);
+
+  InSequence seq;
+  expect_lock(mock_image_ctx, -EBUSY);
+  expect_get_lock_info(mock_image_ctx, 0, entity_name_t::CLIENT(1), "1.2.3.4",
+                       "auto 123", "external tag", LOCK_EXCLUSIVE);
+
+  C_SaferCond ctx;
+  MockAcquireRequest *req = MockAcquireRequest::create(mock_image_ctx,
+                                                       TEST_COOKIE,
+                                                       nullptr, &ctx);
+  req->send();
+  ASSERT_EQ(-EBUSY, ctx.wait());
+}
+
+TEST_F(TestMockExclusiveLockAcquireRequest, GetLockInfoShared) {
+  REQUIRE_FEATURE(RBD_FEATURE_EXCLUSIVE_LOCK);
+  // Lock info uses LOCK_SHARED rather than LOCK_EXCLUSIVE; expects -EBUSY.
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+  MockImageCtx mock_image_ctx(*ictx);
+  expect_op_work_queue(mock_image_ctx);
+
+  InSequence seq;
+  expect_lock(mock_image_ctx, -EBUSY);
+  expect_get_lock_info(mock_image_ctx, 0, entity_name_t::CLIENT(1), "1.2.3.4",
+                       "auto 123", MockExclusiveLock::WATCHER_LOCK_TAG,
+                       LOCK_SHARED);
+
+  C_SaferCond ctx;
+  MockAcquireRequest *req = MockAcquireRequest::create(mock_image_ctx,
+                                                       TEST_COOKIE,
+                                                       nullptr, &ctx);
+  req->send();
+  ASSERT_EQ(-EBUSY, ctx.wait());
+}
+
+TEST_F(TestMockExclusiveLockAcquireRequest, GetLockInfoExternalCookie) {
+  REQUIRE_FEATURE(RBD_FEATURE_EXCLUSIVE_LOCK);
+  // Lock info uses "external cookie" where siblings pass "auto 123": -EBUSY.
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+  MockImageCtx mock_image_ctx(*ictx);
+  expect_op_work_queue(mock_image_ctx);
+
+  InSequence seq;
+  expect_lock(mock_image_ctx, -EBUSY);
+  expect_get_lock_info(mock_image_ctx, 0, entity_name_t::CLIENT(1), "1.2.3.4",
+                       "external cookie", MockExclusiveLock::WATCHER_LOCK_TAG,
+                       LOCK_EXCLUSIVE);
+
+  C_SaferCond ctx;
+  MockAcquireRequest *req = MockAcquireRequest::create(mock_image_ctx,
+                                                       TEST_COOKIE,
+                                                       nullptr, &ctx);
+  req->send();
+  ASSERT_EQ(-EBUSY, ctx.wait());
+}
+
+TEST_F(TestMockExclusiveLockAcquireRequest, GetWatchersError) {
+  REQUIRE_FEATURE(RBD_FEATURE_EXCLUSIVE_LOCK);
+  // expect_list_watchers is given -EINVAL; that value is asserted on ctx.
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+  MockImageCtx mock_image_ctx(*ictx);
+  expect_op_work_queue(mock_image_ctx);
+
+  InSequence seq;
+  expect_lock(mock_image_ctx, -EBUSY);
+  expect_get_lock_info(mock_image_ctx, 0, entity_name_t::CLIENT(1), "1.2.3.4",
+                       "auto 123", MockExclusiveLock::WATCHER_LOCK_TAG,
+                       LOCK_EXCLUSIVE);
+  expect_list_watchers(mock_image_ctx, -EINVAL, "dead client", 123);
+
+  C_SaferCond ctx;
+  MockAcquireRequest *req = MockAcquireRequest::create(mock_image_ctx,
+                                                       TEST_COOKIE,
+                                                       nullptr, &ctx);
+  req->send();
+  ASSERT_EQ(-EINVAL, ctx.wait());
+}
+
+TEST_F(TestMockExclusiveLockAcquireRequest, GetWatchersAlive) {
+  REQUIRE_FEATURE(RBD_FEATURE_EXCLUSIVE_LOCK);
+  // list_watchers gets "1.2.3.4", the same string given to get_lock_info.
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+  MockImageCtx mock_image_ctx(*ictx);
+  expect_op_work_queue(mock_image_ctx);
+
+  InSequence seq;
+  expect_lock(mock_image_ctx, -EBUSY);
+  expect_get_lock_info(mock_image_ctx, 0, entity_name_t::CLIENT(1), "1.2.3.4",
+                       "auto 123", MockExclusiveLock::WATCHER_LOCK_TAG,
+                       LOCK_EXCLUSIVE);
+  expect_list_watchers(mock_image_ctx, 0, "1.2.3.4", 123);
+
+  C_SaferCond ctx;
+  MockAcquireRequest *req = MockAcquireRequest::create(mock_image_ctx,
+                                                       TEST_COOKIE,
+                                                       nullptr, &ctx);
+  req->send();
+  ASSERT_EQ(-EAGAIN, ctx.wait());  // no blacklist/break expectations are set
+}
+
+TEST_F(TestMockExclusiveLockAcquireRequest, BlacklistDisabled) {
+  REQUIRE_FEATURE(RBD_FEATURE_EXCLUSIVE_LOCK);
+  // blacklist_on_break_lock is false: no blacklist_add expectation is set.
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+  MockImageCtx mock_image_ctx(*ictx);
+  expect_op_work_queue(mock_image_ctx);
+  mock_image_ctx.blacklist_on_break_lock = false;
+
+  InSequence seq;
+  expect_lock(mock_image_ctx, -EBUSY);
+  expect_get_lock_info(mock_image_ctx, 0, entity_name_t::CLIENT(1), "1.2.3.4",
+                       "auto 123", MockExclusiveLock::WATCHER_LOCK_TAG,
+                       LOCK_EXCLUSIVE);
+  expect_list_watchers(mock_image_ctx, 0, "dead client", 123);
+  expect_break_lock(mock_image_ctx, 0);
+  expect_lock(mock_image_ctx, -ENOENT);
+
+  C_SaferCond ctx;
+  MockAcquireRequest *req = MockAcquireRequest::create(mock_image_ctx,
+                                                       TEST_COOKIE,
+                                                       nullptr, &ctx);
+  req->send();
+  ASSERT_EQ(-ENOENT, ctx.wait());  // value given to the second expect_lock
+}
+
+TEST_F(TestMockExclusiveLockAcquireRequest, BlacklistError) {
+  REQUIRE_FEATURE(RBD_FEATURE_EXCLUSIVE_LOCK);
+  // expect_blacklist_add is given -EINVAL; no break_lock expectation follows.
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+  MockImageCtx mock_image_ctx(*ictx);
+  expect_op_work_queue(mock_image_ctx);
+
+  InSequence seq;
+  expect_lock(mock_image_ctx, -EBUSY);
+  expect_get_lock_info(mock_image_ctx, 0, entity_name_t::CLIENT(1), "1.2.3.4",
+                       "auto 123", MockExclusiveLock::WATCHER_LOCK_TAG,
+                       LOCK_EXCLUSIVE);
+  expect_list_watchers(mock_image_ctx, 0, "dead client", 123);
+  expect_blacklist_add(mock_image_ctx, -EINVAL);
+
+  C_SaferCond ctx;
+  MockAcquireRequest *req = MockAcquireRequest::create(mock_image_ctx,
+                                                       TEST_COOKIE,
+                                                       nullptr, &ctx);
+  req->send();
+  ASSERT_EQ(-EINVAL, ctx.wait());
+}
+
+TEST_F(TestMockExclusiveLockAcquireRequest, BreakLockMissing) {
+  REQUIRE_FEATURE(RBD_FEATURE_EXCLUSIVE_LOCK);
+  // expect_break_lock is given -ENOENT; a second lock call is still expected.
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+  MockImageCtx mock_image_ctx(*ictx);
+  expect_op_work_queue(mock_image_ctx);
+
+  InSequence seq;
+  expect_lock(mock_image_ctx, -EBUSY);
+  expect_get_lock_info(mock_image_ctx, 0, entity_name_t::CLIENT(1), "1.2.3.4",
+                       "auto 123", MockExclusiveLock::WATCHER_LOCK_TAG,
+                       LOCK_EXCLUSIVE);
+  expect_list_watchers(mock_image_ctx, 0, "dead client", 123);
+  expect_blacklist_add(mock_image_ctx, 0);
+  expect_break_lock(mock_image_ctx, -ENOENT);
+  expect_lock(mock_image_ctx, -EINVAL);
+
+  C_SaferCond ctx;
+  MockAcquireRequest *req = MockAcquireRequest::create(mock_image_ctx,
+                                                       TEST_COOKIE,
+                                                       nullptr, &ctx);
+  req->send();
+  ASSERT_EQ(-EINVAL, ctx.wait());  // value given to the second expect_lock
+}
+
+TEST_F(TestMockExclusiveLockAcquireRequest, BreakLockError) {
+  REQUIRE_FEATURE(RBD_FEATURE_EXCLUSIVE_LOCK);
+  // expect_break_lock is given -EINVAL; no second lock expectation follows.
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+  MockImageCtx mock_image_ctx(*ictx);
+  expect_op_work_queue(mock_image_ctx);
+
+  InSequence seq;
+  expect_lock(mock_image_ctx, -EBUSY);
+  expect_get_lock_info(mock_image_ctx, 0, entity_name_t::CLIENT(1), "1.2.3.4",
+                       "auto 123", MockExclusiveLock::WATCHER_LOCK_TAG,
+                       LOCK_EXCLUSIVE);
+  expect_list_watchers(mock_image_ctx, 0, "dead client", 123);
+  expect_blacklist_add(mock_image_ctx, 0);
+  expect_break_lock(mock_image_ctx, -EINVAL);
+
+  C_SaferCond ctx;
+  MockAcquireRequest *req = MockAcquireRequest::create(mock_image_ctx,
+                                                       TEST_COOKIE,
+                                                       nullptr, &ctx);
+  req->send();
+  ASSERT_EQ(-EINVAL, ctx.wait());
+}
+
+} // namespace exclusive_lock
+} // namespace librbd
diff --git a/src/test/librbd/exclusive_lock/test_mock_ReleaseRequest.cc b/src/test/librbd/exclusive_lock/test_mock_ReleaseRequest.cc
new file mode 100644
index 0000000..c99b361
--- /dev/null
+++ b/src/test/librbd/exclusive_lock/test_mock_ReleaseRequest.cc
@@ -0,0 +1,197 @@
+// -*- mode:C; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "test/librbd/test_mock_fixture.h"
+#include "test/librbd/test_support.h"
+#include "test/librbd/mock/MockImageCtx.h"
+#include "test/librbd/mock/MockJournal.h"
+#include "test/librbd/mock/MockObjectMap.h"
+#include "test/librados_test_stub/MockTestMemIoCtxImpl.h"
+#include "librbd/exclusive_lock/ReleaseRequest.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include <list>
+
+// template definitions
+#include "librbd/exclusive_lock/ReleaseRequest.cc"
+template class librbd::exclusive_lock::ReleaseRequest<librbd::MockImageCtx>;
+
+namespace librbd {
+namespace exclusive_lock {
+
+using ::testing::_;
+using ::testing::InSequence;
+using ::testing::Return;
+
+static const std::string TEST_COOKIE("auto 123");
+
+class TestMockExclusiveLockReleaseRequest : public TestMockFixture {
+public:
+  typedef ReleaseRequest<MockImageCtx> MockReleaseRequest;  // request under test
+  // Expect one block_writes() on the mock AIO queue; action: CompleteContext(r).
+  void expect_block_writes(MockImageCtx &mock_image_ctx, int r) {
+    EXPECT_CALL(*mock_image_ctx.aio_work_queue, block_writes(_))
+                  .WillOnce(CompleteContext(r, mock_image_ctx.image_ctx->op_work_queue));
+  }
+  // Expect an unblock_writes() call on the mock AIO queue (no action attached).
+  void expect_unblock_writes(MockImageCtx &mock_image_ctx) {
+    EXPECT_CALL(*mock_image_ctx.aio_work_queue, unblock_writes());
+  }
+  // Expect one cancel_async_requests() on the image ctx; CompleteContext(r).
+  void expect_cancel_op_requests(MockImageCtx &mock_image_ctx, int r) {
+    EXPECT_CALL(mock_image_ctx, cancel_async_requests(_))
+                  .WillOnce(CompleteContext(r, mock_image_ctx.image_ctx->op_work_queue));
+  }
+  // Expect one exec of class "lock", method "unlock" on header_oid returning r.
+  void expect_unlock(MockImageCtx &mock_image_ctx, int r) {
+    EXPECT_CALL(get_mock_io_ctx(mock_image_ctx.md_ctx),
+                exec(mock_image_ctx.header_oid, _, "lock", "unlock", _, _, _))
+                  .WillOnce(Return(r));
+  }
+  // Expect one close() on the given mock journal; action: CompleteContext(r).
+  void expect_close_journal(MockImageCtx &mock_image_ctx,
+                           MockJournal &mock_journal, int r) {
+    EXPECT_CALL(mock_journal, close(_))
+                  .WillOnce(CompleteContext(r, mock_image_ctx.image_ctx->op_work_queue));
+  }
+  // Expect one unlock() on the given mock object map; always CompleteContext(0).
+  void expect_unlock_object_map(MockImageCtx &mock_image_ctx,
+                                MockObjectMap &mock_object_map) {
+    EXPECT_CALL(mock_object_map, unlock(_))
+                  .WillOnce(CompleteContext(0, mock_image_ctx.image_ctx->op_work_queue));
+  }
+};
+
+TEST_F(TestMockExclusiveLockReleaseRequest, Success) {
+  REQUIRE_FEATURE(RBD_FEATURE_EXCLUSIVE_LOCK);
+  // Journal close is given -EINVAL, yet both contexts are asserted to return 0.
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+  MockImageCtx mock_image_ctx(*ictx);
+  expect_op_work_queue(mock_image_ctx);
+
+  InSequence seq;
+  expect_block_writes(mock_image_ctx, 0);
+  expect_cancel_op_requests(mock_image_ctx, 0);
+
+  MockJournal *mock_journal = new MockJournal();
+  mock_image_ctx.journal = mock_journal;
+  expect_close_journal(mock_image_ctx, *mock_journal, -EINVAL);
+
+  MockObjectMap *mock_object_map = new MockObjectMap();
+  mock_image_ctx.object_map = mock_object_map;
+  expect_unlock_object_map(mock_image_ctx, *mock_object_map);
+
+  expect_unlock(mock_image_ctx, 0);
+
+  C_SaferCond release_ctx;
+  C_SaferCond ctx;
+  MockReleaseRequest *req = MockReleaseRequest::create(mock_image_ctx,
+                                                       TEST_COOKIE,
+                                                       &release_ctx, &ctx);
+  req->send();
+  ASSERT_EQ(0, release_ctx.wait());
+  ASSERT_EQ(0, ctx.wait());
+}
+
+TEST_F(TestMockExclusiveLockReleaseRequest, SuccessJournalDisabled) {
+  REQUIRE_FEATURE(RBD_FEATURE_EXCLUSIVE_LOCK);
+  // mock_image_ctx.journal is never assigned here; only an object map is set.
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+  MockImageCtx mock_image_ctx(*ictx);
+  expect_block_writes(mock_image_ctx, 0);
+  expect_op_work_queue(mock_image_ctx);
+  // The block_writes expectation above is registered before the InSequence.
+  InSequence seq;
+  expect_cancel_op_requests(mock_image_ctx, 0);
+
+  MockObjectMap *mock_object_map = new MockObjectMap();
+  mock_image_ctx.object_map = mock_object_map;
+  expect_unlock_object_map(mock_image_ctx, *mock_object_map);
+
+  expect_unlock(mock_image_ctx, 0);
+
+  C_SaferCond release_ctx;
+  C_SaferCond ctx;
+  MockReleaseRequest *req = MockReleaseRequest::create(mock_image_ctx,
+                                                       TEST_COOKIE,
+                                                       &release_ctx, &ctx);
+  req->send();
+  ASSERT_EQ(0, release_ctx.wait());
+  ASSERT_EQ(0, ctx.wait());
+}
+
+TEST_F(TestMockExclusiveLockReleaseRequest, SuccessObjectMapDisabled) {
+  REQUIRE_FEATURE(RBD_FEATURE_EXCLUSIVE_LOCK);
+  // Neither journal nor object_map is assigned on mock_image_ctx in this test.
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+  MockImageCtx mock_image_ctx(*ictx);
+  expect_block_writes(mock_image_ctx, 0);
+  expect_op_work_queue(mock_image_ctx);
+  // The block_writes expectation above is registered before the InSequence.
+  InSequence seq;
+  expect_cancel_op_requests(mock_image_ctx, 0);
+
+  expect_unlock(mock_image_ctx, 0);
+
+  C_SaferCond release_ctx;
+  C_SaferCond ctx;
+  MockReleaseRequest *req = MockReleaseRequest::create(mock_image_ctx,
+                                                       TEST_COOKIE,
+                                                       &release_ctx, &ctx);
+  req->send();
+  ASSERT_EQ(0, release_ctx.wait());
+  ASSERT_EQ(0, ctx.wait());
+}
+
+TEST_F(TestMockExclusiveLockReleaseRequest, BlockWritesError) {
+  REQUIRE_FEATURE(RBD_FEATURE_EXCLUSIVE_LOCK);
+  // block_writes completes with -EINVAL; unblock_writes is expected afterwards.
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+  MockImageCtx mock_image_ctx(*ictx);
+  expect_op_work_queue(mock_image_ctx);
+
+  InSequence seq;
+  expect_block_writes(mock_image_ctx, -EINVAL);
+  expect_unblock_writes(mock_image_ctx);
+
+  C_SaferCond ctx;
+  MockReleaseRequest *req = MockReleaseRequest::create(mock_image_ctx,
+                                                       TEST_COOKIE,
+                                                       nullptr, &ctx);
+  req->send();
+  ASSERT_EQ(-EINVAL, ctx.wait());
+}
+
+TEST_F(TestMockExclusiveLockReleaseRequest, UnlockError) {
+  REQUIRE_FEATURE(RBD_FEATURE_EXCLUSIVE_LOCK);
+  // The "unlock" exec returns -EINVAL, but ctx is still asserted to return 0.
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+  MockImageCtx mock_image_ctx(*ictx);
+  expect_op_work_queue(mock_image_ctx);
+
+  InSequence seq;
+  expect_block_writes(mock_image_ctx, 0);
+  expect_cancel_op_requests(mock_image_ctx, 0);
+
+  expect_unlock(mock_image_ctx, -EINVAL);
+
+  C_SaferCond ctx;
+  MockReleaseRequest *req = MockReleaseRequest::create(mock_image_ctx,
+                                                       TEST_COOKIE,
+                                                       nullptr, &ctx);
+  req->send();
+  ASSERT_EQ(0, ctx.wait());
+}
+
+} // namespace exclusive_lock
+} // namespace librbd
diff --git a/src/test/librbd/fsx.cc b/src/test/librbd/fsx.cc
index e7bfcf1..3c26d5e 100644
--- a/src/test/librbd/fsx.cc
+++ b/src/test/librbd/fsx.cc
@@ -43,6 +43,9 @@
 #include "include/rados/librados.h"
 #include "include/rbd/librbd.h"
 
+#include "common/SubProcess.h"
+#include "common/safe_io.h"
+
 #define NUMPRINTCOLUMNS 32	/* # columns of data to print on each line */
 
 /*
@@ -255,8 +258,8 @@ get_random(void)
 struct rbd_ctx {
 	const char *name;	/* image name */
 	rbd_image_t image;	/* image handle */
-	const char *krbd_name;	/* image /dev/rbd<id> name */
-	int krbd_fd;		/* image /dev/rbd<id> fd */
+	const char *krbd_name;	/* image /dev/rbd<id> name */ /* reused for nbd test */
+	int krbd_fd;		/* image /dev/rbd<id> fd */ /* reused for nbd test */
 };
 
 #define RBD_CTX_INIT	(struct rbd_ctx) { NULL, NULL, NULL, -1 }
@@ -483,7 +486,8 @@ __librbd_clone(struct rbd_ctx *ctx, const char *src_snapname,
 		features &= ~(RBD_FEATURE_EXCLUSIVE_LOCK |
 		              RBD_FEATURE_OBJECT_MAP     |
                               RBD_FEATURE_FAST_DIFF      |
-                              RBD_FEATURE_DEEP_FLATTEN);
+                              RBD_FEATURE_DEEP_FLATTEN   |
+                              RBD_FEATURE_JOURNALING);
 	}
 	ret = rbd_clone2(ioctx, ctx->name, src_snapname, ioctx,
 			 dst_imagename, features, order,
@@ -791,6 +795,127 @@ const struct rbd_operations krbd_operations = {
 	krbd_flatten,
 };
 
+int
+nbd_open(const char *name, struct rbd_ctx *ctx)
+{
+	int r;
+	int fd;
+	char dev[4096];
+	char *devnode;
+	/* Run "rbd-nbd map <pool>/<name>"; the device path is read from its stdout. */
+	SubProcess process("rbd-nbd", SubProcess::KEEP, SubProcess::PIPE);
+	process.add_cmd_arg("map");
+	std::string img(pool);
+	img.append("/").append(name);
+	process.add_cmd_arg(img.c_str());
+
+	r = __librbd_open(name, ctx);
+	if (r < 0)
+		return r;
+
+	r = process.spawn();
+	if (r < 0) {
+		prt("nbd_open failed to run rbd-nbd error: %s\n", process.err());
+		return r;
+	}
+	/* reserve one byte: dev[r] is NUL-terminated below, so r must stay < sizeof(dev) */
+	r = safe_read(process.get_stdout(), dev, sizeof(dev) - 1);
+	if (r < 0) {
+		prt("nbd_open failed to get nbd device path\n");
+		return r;
+	}
+	for (int i = 0; i < r; ++i)
+		if (dev[i] == '\n' || dev[i] == '\r')
+			dev[i] = 0;	/* cut the path at the first line break */
+	dev[r] = 0;
+	r = process.join();
+	if (r) {
+		prt("rbd-nbd failed with error: %s\n", process.err());
+		return -EINVAL;
+	}
+
+	devnode = strdup(dev);
+	if (!devnode)
+		return -ENOMEM;
+
+	fd = open(devnode, O_RDWR | o_direct);
+	if (fd < 0) {
+		r = -errno;
+		prt("open(%s) failed\n", devnode);
+		free(devnode);	/* not handed to ctx yet, so release it here */
+		return r;
+	}
+
+	ctx->krbd_name = devnode;
+	ctx->krbd_fd = fd;
+
+	return 0;
+}
+
+int
+nbd_close(struct rbd_ctx *ctx)
+{
+	int r;
+	/* Close the device fd, run "rbd-nbd unmap <dev>", then close the image. */
+	assert(ctx->krbd_name && ctx->krbd_fd >= 0);
+
+	if (close(ctx->krbd_fd) < 0) {
+		r = -errno;
+		prt("close(%s) failed\n", ctx->krbd_name);
+		return r;
+	}
+
+	SubProcess process("rbd-nbd");
+	process.add_cmd_arg("unmap");
+	process.add_cmd_arg(ctx->krbd_name);
+
+	r = process.spawn();
+	if (r < 0) {
+		prt("nbd_close failed to run rbd-nbd error: %s\n", process.err());
+		return r;
+	}
+	r = process.join();
+	if (r) {
+		prt("rbd-nbd failed with error: %s\n", process.err());	/* was %d */
+		return -EINVAL;
+	}
+	/* krbd_name was strdup()ed in nbd_open(); the cast drops its const. */
+	free((void *)ctx->krbd_name);
+
+	ctx->krbd_name = NULL;
+	ctx->krbd_fd = -1;
+
+	return __librbd_close(ctx);
+}
+
+int
+nbd_clone(struct rbd_ctx *ctx, const char *src_snapname,
+	  const char *dst_imagename, int *order, int stripe_unit,
+	  int stripe_count)
+{
+	/* Call __krbd_flush() first; a negative result aborts the clone. */
+	const int flush_rc = __krbd_flush(ctx, false);
+
+	if (flush_rc < 0)
+		return flush_rc;
+	/* The clone itself is delegated to __librbd_clone(). */
+	return __librbd_clone(ctx, src_snapname, dst_imagename, order,
+			      stripe_unit, stripe_count, false);
+}
+
+const struct rbd_operations nbd_operations = {	/* selected by the -M option */
+	nbd_open,	/* nbd-specific */
+	nbd_close,	/* nbd-specific */
+	krbd_read,	/* krbd_* handlers are reused unchanged */
+	krbd_write,
+	krbd_flush,
+	krbd_discard,
+	krbd_get_size,
+	krbd_resize,
+	nbd_clone,	/* nbd-specific */
+	krbd_flatten,
+};
+
 struct rbd_ctx ctx = RBD_CTX_INIT;
 const struct rbd_operations *ops = &librbd_operations;
 
@@ -1824,6 +1949,7 @@ usage(void)
 #endif
 "	-H: do not use punch hole calls\n\
 	-K: enable krbd mode (use -t and -h too)\n\
+	-M: enable rbd-nbd mode (use -t and -h too)\n\
 	-L: fsxLite - no file creations & no file size changes\n\
 	-N numops: total # operations to do (default infinity)\n\
 	-O: use oplen (see -o flag) for every op (default random)\n\
@@ -2010,7 +2136,7 @@ main(int argc, char **argv)
 
 	setvbuf(stdout, (char *)0, _IOLBF, 0); /* line buffered stdout */
 
-	while ((ch = getopt(argc, argv, "b:c:dfh:l:m:no:p:qr:s:t:w:xyACD:FHKLN:OP:RS:UWZ"))
+	while ((ch = getopt(argc, argv, "b:c:dfh:l:m:no:p:qr:s:t:w:xyACD:FHKMLN:OP:RS:UWZ"))
 	       != EOF)
 		switch (ch) {
 		case 'b':
@@ -2126,6 +2252,10 @@ main(int argc, char **argv)
 			prt("krbd mode enabled\n");
 			ops = &krbd_operations;
 			break;
+		case 'M':
+			prt("rbd-nbd mode enabled\n");
+			ops = &nbd_operations;
+			break;
 		case 'L':
 			prt("lite mode not supported for rbd\n");
 			exit(1);
diff --git a/src/test/librbd/mock/MockAioImageRequestWQ.h b/src/test/librbd/mock/MockAioImageRequestWQ.h
new file mode 100644
index 0000000..c4dbd3b
--- /dev/null
+++ b/src/test/librbd/mock/MockAioImageRequestWQ.h
@@ -0,0 +1,20 @@
+// -*- mode:C; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_TEST_LIBRBD_MOCK_AIO_IMAGE_REQUEST_WQ_H
+#define CEPH_TEST_LIBRBD_MOCK_AIO_IMAGE_REQUEST_WQ_H
+
+#include "gmock/gmock.h"
+
+namespace librbd {
+
+struct MockAioImageRequestWQ {  // gmock type of MockImageCtx::aio_work_queue
+  MOCK_METHOD1(block_writes, void(Context *));  // NOTE(review): Context is not declared in this header
+  MOCK_METHOD0(unblock_writes, void());
+
+  MOCK_CONST_METHOD0(writes_empty, bool());
+};
+
+} // namespace librbd
+
+#endif // CEPH_TEST_LIBRBD_MOCK_AIO_IMAGE_REQUEST_WQ_H
diff --git a/src/test/librbd/mock/MockExclusiveLock.h b/src/test/librbd/mock/MockExclusiveLock.h
new file mode 100644
index 0000000..8227d3e
--- /dev/null
+++ b/src/test/librbd/mock/MockExclusiveLock.h
@@ -0,0 +1,25 @@
+// -*- mode:C; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_TEST_LIBRBD_MOCK_EXCLUSIVE_LOCK_H
+#define CEPH_TEST_LIBRBD_MOCK_EXCLUSIVE_LOCK_H
+
+#include "include/int_types.h"
+#include "include/rados/librados.hpp"
+#include "gmock/gmock.h"
+
+class Context;
+
+namespace librbd {
+
+struct MockExclusiveLock {  // gmock type of MockImageCtx::exclusive_lock
+  MOCK_CONST_METHOD0(is_lock_owner, bool());
+
+  MOCK_METHOD1(assert_header_locked, void(librados::ObjectWriteOperation *));
+
+  MOCK_METHOD1(shut_down, void(Context*));  // takes a completion Context
+};
+
+} // namespace librbd
+
+#endif // CEPH_TEST_LIBRBD_MOCK_EXCLUSIVE_LOCK_H
diff --git a/src/test/librbd/mock/MockImageCtx.h b/src/test/librbd/mock/MockImageCtx.h
index 53d6fd0..c196920 100644
--- a/src/test/librbd/mock/MockImageCtx.h
+++ b/src/test/librbd/mock/MockImageCtx.h
@@ -4,10 +4,15 @@
 #ifndef CEPH_TEST_LIBRBD_MOCK_IMAGE_CTX_H
 #define CEPH_TEST_LIBRBD_MOCK_IMAGE_CTX_H
 
+#include "test/librbd/mock/MockAioImageRequestWQ.h"
 #include "test/librbd/mock/MockContextWQ.h"
+#include "test/librbd/mock/MockExclusiveLock.h"
 #include "test/librbd/mock/MockImageWatcher.h"
+#include "test/librbd/mock/MockJournal.h"
 #include "test/librbd/mock/MockObjectMap.h"
+#include "test/librbd/mock/MockReadahead.h"
 #include "common/RWLock.h"
+#include "common/WorkQueue.h"
 #include "librbd/ImageCtx.h"
 #include "gmock/gmock.h"
 
@@ -33,10 +38,14 @@ struct MockImageCtx {
       header_oid(image_ctx.header_oid),
       id(image_ctx.id),
       parent_md(image_ctx.parent_md),
-      aio_work_queue(new MockContextWQ()),
+      layout(image_ctx.layout),
+      aio_work_queue(new MockAioImageRequestWQ()),
       op_work_queue(new MockContextWQ()),
-      image_watcher(NULL),
-      concurrent_management_ops(image_ctx.concurrent_management_ops)
+      parent(NULL), image_watcher(NULL), object_map(NULL),
+      exclusive_lock(NULL), journal(NULL),
+      concurrent_management_ops(image_ctx.concurrent_management_ops),
+      blacklist_on_break_lock(image_ctx.blacklist_on_break_lock),
+      blacklist_expire_seconds(image_ctx.blacklist_expire_seconds)
   {
     md_ctx.dup(image_ctx.md_ctx);
     data_ctx.dup(image_ctx.data_ctx);
@@ -47,11 +56,30 @@ struct MockImageCtx {
   }
 
   ~MockImageCtx() {
+    wait_for_async_requests();
+    image_ctx->md_ctx.aio_flush();
+    image_ctx->data_ctx.aio_flush();
+    image_ctx->op_work_queue->drain();
     delete image_watcher;
     delete op_work_queue;
     delete aio_work_queue;
   }
 
+  void wait_for_async_requests() {  // returns at once if async_requests is empty
+    async_ops_lock.Lock();
+    if (async_requests.empty()) {
+      async_ops_lock.Unlock();
+      return;
+    }
+    // Otherwise register a waiter under the lock, release it, then block.
+    C_SaferCond ctx;
+    async_requests_waiters.push_back(&ctx);
+    async_ops_lock.Unlock();
+    // presumably completed by whoever drains async_requests_waiters -- confirm
+    ctx.wait();
+  }
+
+  MOCK_CONST_METHOD1(get_image_size, uint64_t(librados::snap_t));
   MOCK_CONST_METHOD1(get_snap_id, librados::snap_t(std::string in_snap_name));
   MOCK_CONST_METHOD1(get_snap_info, const SnapInfo*(librados::snap_t));
   MOCK_CONST_METHOD2(get_parent_spec, int(librados::snap_t in_snap_id,
@@ -66,7 +94,18 @@ struct MockImageCtx {
                               uint64_t in_size, parent_info parent,
                               uint8_t protection_status, uint64_t flags));
   MOCK_METHOD2(rm_snap, void(std::string in_snap_name, librados::snap_t id));
+
   MOCK_METHOD1(flush, void(Context *));
+  MOCK_METHOD1(flush_copyup, void(Context *));
+
+  MOCK_METHOD1(shut_down_cache, void(Context *));
+
+  MOCK_CONST_METHOD1(test_features, bool(uint64_t test_features));
+
+  MOCK_METHOD1(cancel_async_requests, void(Context*));
+
+  MOCK_METHOD1(create_object_map, MockObjectMap*(uint64_t));
+  MOCK_METHOD0(create_journal, MockJournal*());
 
   ImageCtx *image_ctx;
   CephContext *cct;
@@ -95,16 +134,26 @@ struct MockImageCtx {
   std::string id;
   parent_info parent_md;
 
+  ceph_file_layout layout;
+
   xlist<AsyncRequest<MockImageCtx>*> async_requests;
-  Cond async_requests_cond;
+  std::list<Context*> async_requests_waiters;
 
-  MockContextWQ *aio_work_queue;
+  MockAioImageRequestWQ *aio_work_queue;
   MockContextWQ *op_work_queue;
 
+  MockReadahead readahead;
+
+  MockImageCtx *parent;
+
   MockImageWatcher *image_watcher;
-  MockObjectMap object_map;
+  MockObjectMap *object_map;
+  MockExclusiveLock *exclusive_lock;
+  MockJournal *journal;
 
   int concurrent_management_ops;
+  bool blacklist_on_break_lock;
+  uint32_t blacklist_expire_seconds;
 };
 
 } // namespace librbd
diff --git a/src/test/librbd/mock/MockImageWatcher.h b/src/test/librbd/mock/MockImageWatcher.h
index 1c339bc..20164ea 100644
--- a/src/test/librbd/mock/MockImageWatcher.h
+++ b/src/test/librbd/mock/MockImageWatcher.h
@@ -9,9 +9,13 @@
 namespace librbd {
 
 struct MockImageWatcher {
-  MOCK_CONST_METHOD0(is_lock_owner, bool());
-  MOCK_CONST_METHOD1(is_lock_supported, bool(const RWLock &));
-  MOCK_METHOD1(assert_header_locked, void (librados::ObjectWriteOperation *));
+  MOCK_METHOD0(unregister_watch, void());
+
+  MOCK_CONST_METHOD0(get_watch_handle, uint64_t());
+
+  MOCK_METHOD0(notify_acquired_lock, void());
+  MOCK_METHOD0(notify_released_lock, void());
+  MOCK_METHOD0(notify_request_lock, void());
 };
 
 } // namespace librbd
diff --git a/src/test/librbd/mock/MockJournal.h b/src/test/librbd/mock/MockJournal.h
new file mode 100644
index 0000000..e0c8d1f
--- /dev/null
+++ b/src/test/librbd/mock/MockJournal.h
@@ -0,0 +1,28 @@
+// -*- mode:C; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_TEST_LIBRBD_MOCK_JOURNAL_H
+#define CEPH_TEST_LIBRBD_MOCK_JOURNAL_H
+
+#include "gmock/gmock.h"
+#include "librbd/JournalTypes.h"
+#include "librbd/Journal.h"
+
+namespace librbd {
+
+struct MockJournal {  // gmock type of MockImageCtx::journal
+  MOCK_CONST_METHOD0(is_journal_ready, bool());
+  MOCK_CONST_METHOD0(is_journal_replaying, bool());
+
+  MOCK_METHOD1(wait_for_journal_ready, void(Context *));
+  // open/close each take a Context; the tests complete it via CompleteContext.
+  MOCK_METHOD1(open, void(Context *));
+  MOCK_METHOD1(close, void(Context *));
+
+  MOCK_METHOD1(append_op_event, uint64_t(journal::EventEntry&));
+  MOCK_METHOD2(commit_op_event, void(uint64_t, int));
+};
+
+} // namespace librbd
+
+#endif // CEPH_TEST_LIBRBD_MOCK_JOURNAL_H
diff --git a/src/test/librbd/mock/MockObjectMap.h b/src/test/librbd/mock/MockObjectMap.h
index 7f2f84b..78c036c 100644
--- a/src/test/librbd/mock/MockObjectMap.h
+++ b/src/test/librbd/mock/MockObjectMap.h
@@ -11,6 +11,11 @@ namespace librbd {
 struct MockObjectMap {
   MOCK_CONST_METHOD1(enabled, bool(const RWLock &object_map_lock));
 
+  MOCK_METHOD1(open, void(Context *on_finish));
+
+  MOCK_METHOD1(lock, void(Context *on_finish));
+  MOCK_METHOD1(unlock, void(Context *on_finish));
+
   MOCK_METHOD2(snapshot_add, void(uint64_t snap_id, Context *on_finish));
   MOCK_METHOD2(snapshot_remove, void(uint64_t snap_id, Context *on_finish));
 };
diff --git a/src/test/librbd/mock/MockReadahead.h b/src/test/librbd/mock/MockReadahead.h
new file mode 100644
index 0000000..b73b462
--- /dev/null
+++ b/src/test/librbd/mock/MockReadahead.h
@@ -0,0 +1,21 @@
+// -*- mode:C; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_TEST_LIBRBD_MOCK_READAHEAD_H
+#define CEPH_TEST_LIBRBD_MOCK_READAHEAD_H
+
+#include "include/int_types.h"
+#include "gmock/gmock.h"
+
+class Context;
+
+namespace librbd {
+
+struct MockReadahead {  // gmock-based mock declaring two readahead methods
+  MOCK_METHOD1(set_max_readahead_size, void(uint64_t));  // single uint64_t argument, no return value
+  MOCK_METHOD1(wait_for_pending, void(Context *));  // single Context* argument, no return value
+};
+
+} // namespace librbd
+
+#endif // CEPH_TEST_LIBRBD_MOCK_READAHEAD_H
diff --git a/src/test/librbd/object_map/mock/MockInvalidateRequest.h b/src/test/librbd/object_map/mock/MockInvalidateRequest.h
new file mode 100644
index 0000000..b7d02c6
--- /dev/null
+++ b/src/test/librbd/object_map/mock/MockInvalidateRequest.h
@@ -0,0 +1,42 @@
+// -*- mode:C; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/object_map/InvalidateRequest.h"
+
+// template definitions
+#include "librbd/object_map/InvalidateRequest.cc"
+
+namespace librbd {
+namespace object_map {
+
+template <>  // explicit specialization selected when the image context type is MockImageCtx
+struct InvalidateRequest<MockImageCtx> {
+  static std::list<InvalidateRequest *> s_requests;  // constructed mocks waiting to be handed out by create(), oldest first
+  uint64_t snap_id;  // the three fields below are filled in by create() from its arguments
+  bool force;
+  Context *on_finish;
+  // Returns the oldest registered mock after recording the arguments on it; asserts if none is registered. image_ctx is unused.
+  static InvalidateRequest* create(MockImageCtx &image_ctx, uint64_t snap_id,
+                                   bool force, Context *on_finish) {
+    assert(!s_requests.empty());
+    InvalidateRequest* req = s_requests.front();
+    req->snap_id = snap_id;
+    req->force = force;
+    req->on_finish = on_finish;
+    s_requests.pop_front();
+    return req;
+  }
+  // Each constructed mock registers itself. NOTE(review): nothing visible here unregisters it if create() is never called.
+  InvalidateRequest() {
+    s_requests.push_back(this);
+  }
+
+  MOCK_METHOD0(send, void());  // tests attach their expectations to send()
+};
+
+typedef InvalidateRequest<MockImageCtx> MockInvalidateRequest;  // short alias for the specialization
+// Storage for the static list. NOTE(review): this is a definition in a header with no include guard visible; presumably it must be included from a single translation unit -- confirm.
+std::list<InvalidateRequest<MockImageCtx>*> InvalidateRequest<MockImageCtx>::s_requests;
+
+} // namespace object_map
+} // namespace librbd
diff --git a/src/test/librbd/object_map/test_mock_InvalidateRequest.cc b/src/test/librbd/object_map/test_mock_InvalidateRequest.cc
new file mode 100644
index 0000000..9c0f4ea
--- /dev/null
+++ b/src/test/librbd/object_map/test_mock_InvalidateRequest.cc
@@ -0,0 +1,153 @@
+// -*- mode:C; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "test/librbd/test_mock_fixture.h"
+#include "test/librbd/test_support.h"
+#include "test/librados_test_stub/MockTestMemIoCtxImpl.h"
+#include "librbd/internal.h"
+#include "librbd/object_map/InvalidateRequest.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+namespace librbd {
+namespace object_map {
+
+using ::testing::_;
+using ::testing::DoDefault;
+using ::testing::Return;
+
+class TestMockObjectMapInvalidateRequest : public TestMockFixture {  // fixture for the tests below; adds no members of its own
+public:
+};
+
+TEST_F(TestMockObjectMapInvalidateRequest, UpdatesInMemoryFlag) {  // the invalid flag must appear on the ImageCtx
+  REQUIRE_FEATURE(RBD_FEATURE_OBJECT_MAP);
+
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+  ASSERT_FALSE(ictx->test_flags(RBD_FLAG_OBJECT_MAP_INVALID));  // precondition: flag starts cleared
+
+  C_SaferCond cond_ctx;
+  AsyncRequest<> *request = new InvalidateRequest<>(*ictx, CEPH_NOSNAP, false, &cond_ctx);
+  // This test never acquires the exclusive lock; no "set_flags" exec on the header object is expected.
+  EXPECT_CALL(get_mock_io_ctx(ictx->md_ctx),
+              exec(ictx->header_oid, _, "rbd", "set_flags", _, _, _))
+                .Times(0);
+  // send() is issued while holding owner_lock (read) and snap_lock (write).
+  {
+    RWLock::RLocker owner_locker(ictx->owner_lock);
+    RWLock::WLocker snap_locker(ictx->snap_lock);
+    request->send();
+  }
+  ASSERT_EQ(0, cond_ctx.wait());
+
+  ASSERT_TRUE(ictx->test_flags(RBD_FLAG_OBJECT_MAP_INVALID));
+}
+
+TEST_F(TestMockObjectMapInvalidateRequest, UpdatesHeadOnDiskFlag) {  // CEPH_NOSNAP request after acquiring the exclusive lock
+  REQUIRE_FEATURE(RBD_FEATURE_OBJECT_MAP);
+
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+  ASSERT_EQ(0, acquire_exclusive_lock(*ictx));
+
+  C_SaferCond cond_ctx;
+  AsyncRequest<> *request = new InvalidateRequest<>(*ictx, CEPH_NOSNAP, false, &cond_ctx);
+  // Expect one "assert_locked" and one "set_flags" exec on the header object, each running the mock's default action.
+  EXPECT_CALL(get_mock_io_ctx(ictx->md_ctx),
+              exec(ictx->header_oid, _, "lock", "assert_locked", _, _, _))
+                .WillOnce(DoDefault());
+  EXPECT_CALL(get_mock_io_ctx(ictx->md_ctx),
+              exec(ictx->header_oid, _, "rbd", "set_flags", _, _, _))
+                .WillOnce(DoDefault());
+  // send() is issued while holding owner_lock (read) and snap_lock (write).
+  {
+    RWLock::RLocker owner_locker(ictx->owner_lock);
+    RWLock::WLocker snap_locker(ictx->snap_lock);
+    request->send();
+  }
+  ASSERT_EQ(0, cond_ctx.wait());
+
+  expect_unlock_exclusive_lock(*ictx);
+}
+
+TEST_F(TestMockObjectMapInvalidateRequest, UpdatesSnapOnDiskFlag) {  // request built with a snapshot id instead of CEPH_NOSNAP
+  REQUIRE_FEATURE(RBD_FEATURE_OBJECT_MAP);
+
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+  // Create "snap1" and select it; the request below is built with ictx->snap_id.
+  ASSERT_EQ(0, librbd::snap_create(ictx, "snap1"));
+  ASSERT_EQ(0, librbd::snap_set(ictx, "snap1"));
+
+  C_SaferCond cond_ctx;
+  AsyncRequest<> *request = new InvalidateRequest<>(*ictx, ictx->snap_id, false,
+                                                &cond_ctx);
+  // No "assert_locked" exec is expected here, but "set_flags" must be issued exactly once.
+  EXPECT_CALL(get_mock_io_ctx(ictx->md_ctx),
+              exec(ictx->header_oid, _, "lock", "assert_locked", _, _, _))
+                .Times(0);
+  EXPECT_CALL(get_mock_io_ctx(ictx->md_ctx),
+              exec(ictx->header_oid, _, "rbd", "set_flags", _, _, _))
+                .WillOnce(DoDefault());
+
+  {
+    RWLock::RLocker owner_locker(ictx->owner_lock);
+    RWLock::WLocker snap_locker(ictx->snap_lock);
+    request->send();
+  }
+  ASSERT_EQ(0, cond_ctx.wait());
+}
+
+TEST_F(TestMockObjectMapInvalidateRequest, SkipOnDiskUpdateWithoutLock) {  // no exclusive lock acquired => no on-disk update expected
+  REQUIRE_FEATURE(RBD_FEATURE_OBJECT_MAP);
+
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+  C_SaferCond cond_ctx;
+  AsyncRequest<> *request = new InvalidateRequest<>(*ictx, CEPH_NOSNAP, false, &cond_ctx);
+  // The "set_flags" exec on the header object must not be issued at all.
+  EXPECT_CALL(get_mock_io_ctx(ictx->md_ctx),
+              exec(ictx->header_oid, _, "rbd", "set_flags", _, _, _))
+                .Times(0);
+
+  {
+    RWLock::RLocker owner_locker(ictx->owner_lock);
+    RWLock::WLocker snap_locker(ictx->snap_lock);
+    request->send();
+  }
+  ASSERT_EQ(0, cond_ctx.wait());
+  // NOTE(review): this test never calls acquire_exclusive_lock(); confirm the unlock expectation below is intended.
+  expect_unlock_exclusive_lock(*ictx);
+}
+
+TEST_F(TestMockObjectMapInvalidateRequest, IgnoresOnDiskUpdateFailure) {  // a failing "set_flags" must still complete with 0
+  REQUIRE_FEATURE(RBD_FEATURE_OBJECT_MAP);
+
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+  ASSERT_EQ(0, acquire_exclusive_lock(*ictx));
+
+  C_SaferCond cond_ctx;
+  AsyncRequest<> *request = new InvalidateRequest<>(*ictx, CEPH_NOSNAP, false, &cond_ctx);
+  // "assert_locked" runs the default action; "set_flags" is forced to return -EINVAL.
+  EXPECT_CALL(get_mock_io_ctx(ictx->md_ctx),
+              exec(ictx->header_oid, _, "lock", "assert_locked", _, _, _))
+                .WillOnce(DoDefault());
+  EXPECT_CALL(get_mock_io_ctx(ictx->md_ctx),
+              exec(ictx->header_oid, _, "rbd", "set_flags", _, _, _))
+                .WillOnce(Return(-EINVAL));
+
+  {
+    RWLock::RLocker owner_locker(ictx->owner_lock);
+    RWLock::WLocker snap_locker(ictx->snap_lock);
+    request->send();
+  }
+  ASSERT_EQ(0, cond_ctx.wait());  // the injected -EINVAL is not propagated to the completion
+
+  expect_unlock_exclusive_lock(*ictx);
+}
+
+} // namespace object_map
+} // namespace librbd
diff --git a/src/test/librbd/object_map/test_mock_LockRequest.cc b/src/test/librbd/object_map/test_mock_LockRequest.cc
new file mode 100644
index 0000000..e7a62fd
--- /dev/null
+++ b/src/test/librbd/object_map/test_mock_LockRequest.cc
@@ -0,0 +1,215 @@
+// -*- mode:C; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "test/librbd/test_mock_fixture.h"
+#include "test/librbd/test_support.h"
+#include "test/librbd/mock/MockImageCtx.h"
+#include "test/librados_test_stub/MockTestMemIoCtxImpl.h"
+#include "cls/lock/cls_lock_ops.h"
+#include "librbd/ObjectMap.h"
+#include "librbd/object_map/LockRequest.h"
+
+// template definitions
+#include "librbd/object_map/LockRequest.cc"
+
+namespace librbd {
+namespace object_map {
+
+using ::testing::_;
+using ::testing::DoAll;
+using ::testing::InSequence;
+using ::testing::Return;
+using ::testing::WithArg;
+
+class TestMockObjectMapLockRequest : public TestMockFixture {  // fixture with helpers that script "lock" class exec calls
+public:
+  typedef LockRequest<MockImageCtx> MockLockRequest;
+  // Expects one "lock"/"lock" exec on the CEPH_NOSNAP object map object, returning r.
+  void expect_lock(MockImageCtx &mock_image_ctx, int r) {
+    std::string oid(ObjectMap::object_map_name(mock_image_ctx.id, CEPH_NOSNAP));
+    EXPECT_CALL(get_mock_io_ctx(mock_image_ctx.md_ctx),
+                exec(oid, _, "lock", "lock", _, _, _))
+                  .WillOnce(Return(r));
+  }
+  // Expects one "lock"/"get_info" exec: returns r when r < 0, otherwise supplies an encoded reply listing two lockers.
+  void expect_get_lock_info(MockImageCtx &mock_image_ctx, int r) {
+    std::string oid(ObjectMap::object_map_name(mock_image_ctx.id, CEPH_NOSNAP));
+    auto &expect = EXPECT_CALL(get_mock_io_ctx(mock_image_ctx.md_ctx),
+                               exec(oid, _, "lock", "get_info", _, _, _));
+    if (r < 0) {
+      expect.WillOnce(Return(r));
+    } else {
+      entity_name_t entity1(entity_name_t::CLIENT(1));
+      entity_name_t entity2(entity_name_t::CLIENT(2));
+
+      cls_lock_get_info_reply reply;
+      reply.lockers = decltype(reply.lockers){
+        {rados::cls::lock::locker_id_t(entity1, "cookie1"),
+         rados::cls::lock::locker_info_t()},
+        {rados::cls::lock::locker_id_t(entity2, "cookie2"),
+         rados::cls::lock::locker_info_t()}};
+
+      bufferlist bl;
+      ::encode(reply, bl);
+      // Argument index 5 of exec() is handed the encoded reply (presumably the output bufferlist -- confirm against CopyInBufferlist).
+      std::string str(bl.c_str(), bl.length());
+      expect.WillOnce(DoAll(WithArg<5>(CopyInBufferlist(str)), Return(r)));
+    }
+  }
+  // Expects "lock"/"break_lock": one call returning r when r < 0, otherwise exactly two calls returning 0 (the get_info reply above lists two lockers).
+  void expect_break_lock(MockImageCtx &mock_image_ctx, int r) {
+    std::string oid(ObjectMap::object_map_name(mock_image_ctx.id, CEPH_NOSNAP));
+    auto &expect = EXPECT_CALL(get_mock_io_ctx(mock_image_ctx.md_ctx),
+                               exec(oid, _, "lock", "break_lock", _, _, _));
+    if (r < 0) {
+      expect.WillOnce(Return(r));
+    } else {
+      expect.Times(2).WillRepeatedly(Return(0));
+    }
+  }
+};
+
+TEST_F(TestMockObjectMapLockRequest, Success) {  // first lock attempt succeeds
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+  MockImageCtx mock_image_ctx(*ictx);
+
+  C_SaferCond ctx;
+  MockLockRequest *req = new MockLockRequest(mock_image_ctx, &ctx);
+  // Scripted sequence: a single lock call returning 0.
+  InSequence seq;
+  expect_lock(mock_image_ctx, 0);
+  req->send();
+
+  ASSERT_EQ(0, ctx.wait());
+}
+
+TEST_F(TestMockObjectMapLockRequest, LockBusy) {  // -EBUSY, then lockers are broken and the lock is retried
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+  MockImageCtx mock_image_ctx(*ictx);
+
+  C_SaferCond ctx;
+  MockLockRequest *req = new MockLockRequest(mock_image_ctx, &ctx);
+  // Scripted sequence: lock -> -EBUSY, get_info succeeds, break_lock succeeds (twice), lock -> 0.
+  InSequence seq;
+  expect_lock(mock_image_ctx, -EBUSY);
+  expect_get_lock_info(mock_image_ctx, 0);
+  expect_break_lock(mock_image_ctx, 0);
+  expect_lock(mock_image_ctx, 0);
+  req->send();
+
+  ASSERT_EQ(0, ctx.wait());
+}
+
+TEST_F(TestMockObjectMapLockRequest, LockError) {  // lock fails with -ENOENT and nothing else is attempted
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+  MockImageCtx mock_image_ctx(*ictx);
+
+  C_SaferCond ctx;
+  MockLockRequest *req = new MockLockRequest(mock_image_ctx, &ctx);
+  // Scripted sequence: a single lock call returning -ENOENT.
+  InSequence seq;
+  expect_lock(mock_image_ctx, -ENOENT);
+  req->send();
+
+  ASSERT_EQ(0, ctx.wait());  // the request is still expected to complete with 0
+}
+
+TEST_F(TestMockObjectMapLockRequest, GetLockInfoMissing) {  // get_info returns -ENOENT and the lock is retried
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+  MockImageCtx mock_image_ctx(*ictx);
+
+  C_SaferCond ctx;
+  MockLockRequest *req = new MockLockRequest(mock_image_ctx, &ctx);
+  // Scripted sequence: lock -> -EBUSY, get_info -> -ENOENT, lock -> 0 (no break_lock expected).
+  InSequence seq;
+  expect_lock(mock_image_ctx, -EBUSY);
+  expect_get_lock_info(mock_image_ctx, -ENOENT);
+  expect_lock(mock_image_ctx, 0);
+  req->send();
+
+  ASSERT_EQ(0, ctx.wait());
+}
+
+TEST_F(TestMockObjectMapLockRequest, GetLockInfoError) {  // get_info returns -EINVAL and no further calls follow
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+  MockImageCtx mock_image_ctx(*ictx);
+
+  C_SaferCond ctx;
+  MockLockRequest *req = new MockLockRequest(mock_image_ctx, &ctx);
+  // Scripted sequence: lock -> -EBUSY, get_info -> -EINVAL.
+  InSequence seq;
+  expect_lock(mock_image_ctx, -EBUSY);
+  expect_get_lock_info(mock_image_ctx, -EINVAL);
+  req->send();
+
+  ASSERT_EQ(0, ctx.wait());  // the request is still expected to complete with 0
+}
+
+TEST_F(TestMockObjectMapLockRequest, BreakLockMissing) {  // break_lock returns -ENOENT and the lock is retried
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+  MockImageCtx mock_image_ctx(*ictx);
+
+  C_SaferCond ctx;
+  MockLockRequest *req = new MockLockRequest(mock_image_ctx, &ctx);
+  // Scripted sequence: lock -> -EBUSY, get_info succeeds, one break_lock -> -ENOENT, lock -> 0.
+  InSequence seq;
+  expect_lock(mock_image_ctx, -EBUSY);
+  expect_get_lock_info(mock_image_ctx, 0);
+  expect_break_lock(mock_image_ctx, -ENOENT);
+  expect_lock(mock_image_ctx, 0);
+  req->send();
+
+  ASSERT_EQ(0, ctx.wait());
+}
+
+TEST_F(TestMockObjectMapLockRequest, BreakLockError) {  // break_lock returns -EINVAL and no retry follows
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+  MockImageCtx mock_image_ctx(*ictx);
+
+  C_SaferCond ctx;
+  MockLockRequest *req = new MockLockRequest(mock_image_ctx, &ctx);
+  // Scripted sequence: lock -> -EBUSY, get_info succeeds, one break_lock -> -EINVAL.
+  InSequence seq;
+  expect_lock(mock_image_ctx, -EBUSY);
+  expect_get_lock_info(mock_image_ctx, 0);
+  expect_break_lock(mock_image_ctx, -EINVAL);
+  req->send();
+
+  ASSERT_EQ(0, ctx.wait());  // the request is still expected to complete with 0
+}
+
+TEST_F(TestMockObjectMapLockRequest, LockErrorAfterBrokeLock) {  // the retry after breaking the lockers is busy again
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+  MockImageCtx mock_image_ctx(*ictx);
+
+  C_SaferCond ctx;
+  MockLockRequest *req = new MockLockRequest(mock_image_ctx, &ctx);
+  // Scripted sequence: lock -> -EBUSY, get_info succeeds, break_lock succeeds (twice), lock -> -EBUSY; nothing further is expected.
+  InSequence seq;
+  expect_lock(mock_image_ctx, -EBUSY);
+  expect_get_lock_info(mock_image_ctx, 0);
+  expect_break_lock(mock_image_ctx, 0);
+  expect_lock(mock_image_ctx, -EBUSY);
+  req->send();
+
+  ASSERT_EQ(0, ctx.wait());  // the request is still expected to complete with 0
+}
+
+} // namespace object_map
+} // namespace librbd
diff --git a/src/test/librbd/object_map/test_mock_RefreshRequest.cc b/src/test/librbd/object_map/test_mock_RefreshRequest.cc
new file mode 100644
index 0000000..2af4517
--- /dev/null
+++ b/src/test/librbd/object_map/test_mock_RefreshRequest.cc
@@ -0,0 +1,251 @@
+// -*- mode:C; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "test/librbd/test_mock_fixture.h"
+#include "test/librbd/test_support.h"
+#include "test/librbd/mock/MockImageCtx.h"
+#include "test/librbd/object_map/mock/MockInvalidateRequest.h"
+#include "test/librados_test_stub/MockTestMemIoCtxImpl.h"
+#include "common/bit_vector.hpp"
+#include "librbd/ObjectMap.h"
+#include "librbd/object_map/RefreshRequest.h"
+
+// template definitions
+#include "librbd/object_map/RefreshRequest.cc"
+
+namespace librbd {
+namespace object_map {
+
+using ::testing::_;
+using ::testing::DoAll;
+using ::testing::DoDefault;
+using ::testing::InSequence;
+using ::testing::Return;
+using ::testing::WithArg;
+
+class TestMockObjectMapRefreshRequest : public TestMockFixture {  // fixture with helpers scripting the refresh of snapshot TEST_SNAP_ID's object map
+public:
+  static const uint64_t TEST_SNAP_ID = 123;  // snapshot id used for every object map name and size lookup below
+
+  typedef RefreshRequest<MockImageCtx> MockRefreshRequest;
+  // Expects one "object_map_load" exec: returns r when r < 0, otherwise supplies the encoding of *object_map (which must be non-null).
+  void expect_object_map_load(MockImageCtx &mock_image_ctx,
+                              ceph::BitVector<2> *object_map, int r) {
+    std::string oid(ObjectMap::object_map_name(mock_image_ctx.id, TEST_SNAP_ID));
+    auto &expect = EXPECT_CALL(get_mock_io_ctx(mock_image_ctx.md_ctx),
+                               exec(oid, _, "rbd", "object_map_load", _, _, _));
+    if (r < 0) {
+      expect.WillOnce(Return(r));
+    } else {
+      assert(object_map);
+      object_map->set_crc_enabled(false);  // note: this mutates the caller's bit vector before it is encoded
+
+      bufferlist bl;
+      ::encode(*object_map, bl);
+
+      std::string str(bl.c_str(), bl.length());
+      expect.WillOnce(DoAll(WithArg<5>(CopyInBufferlist(str)), Return(0)));
+    }
+  }
+  // Expects one get_image_size(TEST_SNAP_ID) call on the mock image context, returning size.
+  void expect_get_image_size(MockImageCtx &mock_image_ctx, uint64_t size) {
+    EXPECT_CALL(mock_image_ctx, get_image_size(TEST_SNAP_ID))
+                  .WillOnce(Return(size));
+  }
+  // Expects one send() on the mock invalidate request; FinishRequest is defined elsewhere and is passed the request, 0 and the image context.
+  void expect_invalidate_request(MockImageCtx &mock_image_ctx,
+                                 MockInvalidateRequest &invalidate_request) {
+    EXPECT_CALL(invalidate_request, send())
+                  .WillOnce(FinishRequest(&invalidate_request, 0,
+                                          &mock_image_ctx));
+  }
+  // Expects one truncate(oid, 0, _) on the snapshot's object map object, returning 0.
+  void expect_truncate_request(MockImageCtx &mock_image_ctx) {
+    std::string oid(ObjectMap::object_map_name(mock_image_ctx.id, TEST_SNAP_ID));
+    EXPECT_CALL(get_mock_io_ctx(mock_image_ctx.md_ctx), truncate(oid, 0, _))
+                  .WillOnce(Return(0));
+  }
+  // Expects one "object_map_resize" exec returning r. NOTE(review): num_objects is not used by this helper.
+  void expect_object_map_resize(MockImageCtx &mock_image_ctx,
+                                uint64_t num_objects, int r) {
+    std::string oid(ObjectMap::object_map_name(mock_image_ctx.id, TEST_SNAP_ID));
+    auto &expect = EXPECT_CALL(get_mock_io_ctx(mock_image_ctx.md_ctx),
+                               exec(oid, _, "rbd", "object_map_resize", _, _, _));
+    expect.WillOnce(Return(r));
+  }
+  // Sizes *object_map from the image's layout and size, then fills every entry with a rand()-derived value in [0, 2].
+  void init_object_map(MockImageCtx &mock_image_ctx,
+                       ceph::BitVector<2> *object_map) {
+    uint64_t num_objs = Striper::get_num_objects(
+      mock_image_ctx.layout, mock_image_ctx.image_ctx->size);
+    object_map->resize(num_objs);
+    for (uint64_t i = 0; i < num_objs; ++i) {
+      (*object_map)[i] = rand() % 3;
+    }
+  }
+};
+
+TEST_F(TestMockObjectMapRefreshRequest, Success) {  // the loaded map must equal the scripted on-disk map
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+  MockImageCtx mock_image_ctx(*ictx);
+  // Build the map that the mocked "object_map_load" call will hand back.
+  ceph::BitVector<2> on_disk_object_map;
+  init_object_map(mock_image_ctx, &on_disk_object_map);
+
+  C_SaferCond ctx;
+  ceph::BitVector<2> object_map;
+  MockRefreshRequest *req = new MockRefreshRequest(mock_image_ctx, &object_map,
+                                                   TEST_SNAP_ID, &ctx);
+  // Scripted sequence: size lookup, successful load, second size lookup.
+  InSequence seq;
+  expect_get_image_size(mock_image_ctx, mock_image_ctx.image_ctx->size);
+  expect_object_map_load(mock_image_ctx, &on_disk_object_map, 0);
+  expect_get_image_size(mock_image_ctx, mock_image_ctx.image_ctx->size);
+  req->send();
+  ASSERT_EQ(0, ctx.wait());
+
+  ASSERT_EQ(on_disk_object_map, object_map);
+}
+
+TEST_F(TestMockObjectMapRefreshRequest, LoadError) {  // load returns -ENOENT; an invalidate is expected and the result is 0
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+  MockImageCtx mock_image_ctx(*ictx);
+  // NOTE(review): on_disk_object_map is initialised here but not referenced again in this test.
+  ceph::BitVector<2> on_disk_object_map;
+  init_object_map(mock_image_ctx, &on_disk_object_map);
+
+  C_SaferCond ctx;
+  ceph::BitVector<2> object_map;
+  MockRefreshRequest *req = new MockRefreshRequest(mock_image_ctx, &object_map,
+                                                   TEST_SNAP_ID, &ctx);
+  // Scripted sequence: size lookup, load -> -ENOENT, invalidate request, size lookup (no truncate or resize expected).
+  InSequence seq;
+  expect_get_image_size(mock_image_ctx, mock_image_ctx.image_ctx->size);
+  expect_object_map_load(mock_image_ctx, nullptr, -ENOENT);
+
+  MockInvalidateRequest invalidate_request;
+  expect_invalidate_request(mock_image_ctx, invalidate_request);
+  expect_get_image_size(mock_image_ctx, mock_image_ctx.image_ctx->size);
+
+  req->send();
+  ASSERT_EQ(0, ctx.wait());
+}
+
+TEST_F(TestMockObjectMapRefreshRequest, LoadCorrupt) {  // load returns -EINVAL; invalidate, truncate and resize are expected
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+  MockImageCtx mock_image_ctx(*ictx);
+  // Only the size of on_disk_object_map is used below (passed to the resize expectation).
+  ceph::BitVector<2> on_disk_object_map;
+  init_object_map(mock_image_ctx, &on_disk_object_map);
+
+  C_SaferCond ctx;
+  ceph::BitVector<2> object_map;
+  MockRefreshRequest *req = new MockRefreshRequest(mock_image_ctx, &object_map,
+                                                   TEST_SNAP_ID, &ctx);
+  // Scripted sequence: size lookup, load -> -EINVAL, invalidate, truncate, resize -> 0, size lookup.
+  InSequence seq;
+  expect_get_image_size(mock_image_ctx, mock_image_ctx.image_ctx->size);
+  expect_object_map_load(mock_image_ctx, nullptr, -EINVAL);
+
+  MockInvalidateRequest invalidate_request;
+  expect_invalidate_request(mock_image_ctx, invalidate_request);
+  expect_truncate_request(mock_image_ctx);
+  expect_object_map_resize(mock_image_ctx, on_disk_object_map.size(), 0);
+  expect_get_image_size(mock_image_ctx, mock_image_ctx.image_ctx->size);
+
+  req->send();
+  ASSERT_EQ(0, ctx.wait());
+}
+
+TEST_F(TestMockObjectMapRefreshRequest, TooSmall) {  // the loaded map is empty; invalidate and resize are expected
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+  MockImageCtx mock_image_ctx(*ictx);
+
+  ceph::BitVector<2> on_disk_object_map;
+  init_object_map(mock_image_ctx, &on_disk_object_map);
+  // Left default-constructed (never resized); this is the map the mocked load returns.
+  ceph::BitVector<2> small_object_map;
+
+  C_SaferCond ctx;
+  ceph::BitVector<2> object_map;
+  MockRefreshRequest *req = new MockRefreshRequest(mock_image_ctx, &object_map,
+                                                   TEST_SNAP_ID, &ctx);
+  // Scripted sequence: size lookup, load of the small map, invalidate, resize -> 0, size lookup (no truncate expected).
+  InSequence seq;
+  expect_get_image_size(mock_image_ctx, mock_image_ctx.image_ctx->size);
+  expect_object_map_load(mock_image_ctx, &small_object_map, 0);
+
+  MockInvalidateRequest invalidate_request;
+  expect_invalidate_request(mock_image_ctx, invalidate_request);
+  expect_object_map_resize(mock_image_ctx, on_disk_object_map.size(), 0);
+  expect_get_image_size(mock_image_ctx, mock_image_ctx.image_ctx->size);
+
+  req->send();
+  ASSERT_EQ(0, ctx.wait());
+}
+
+TEST_F(TestMockObjectMapRefreshRequest, TooLarge) {  // the loaded map is twice the expected size; no invalidate is expected
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+  MockImageCtx mock_image_ctx(*ictx);
+
+  ceph::BitVector<2> on_disk_object_map;
+  init_object_map(mock_image_ctx, &on_disk_object_map);
+  // The map returned by the mocked load has double the entries of the correctly sized one.
+  ceph::BitVector<2> large_object_map;
+  large_object_map.resize(on_disk_object_map.size() * 2);
+
+  C_SaferCond ctx;
+  ceph::BitVector<2> object_map;
+  MockRefreshRequest *req = new MockRefreshRequest(mock_image_ctx, &object_map,
+                                                   TEST_SNAP_ID, &ctx);
+  // Scripted sequence: size lookup, load of the large map, size lookup. The resulting object_map is not inspected.
+  InSequence seq;
+  expect_get_image_size(mock_image_ctx, mock_image_ctx.image_ctx->size);
+  expect_object_map_load(mock_image_ctx, &large_object_map, 0);
+  expect_get_image_size(mock_image_ctx, mock_image_ctx.image_ctx->size);
+  req->send();
+  ASSERT_EQ(0, ctx.wait());
+}
+
+TEST_F(TestMockObjectMapRefreshRequest, ResizeError) {  // like TooSmall, but the resize exec returns -ESTALE
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+  MockImageCtx mock_image_ctx(*ictx);
+
+  ceph::BitVector<2> on_disk_object_map;
+  init_object_map(mock_image_ctx, &on_disk_object_map);
+  // Left default-constructed (never resized); this is the map the mocked load returns.
+  ceph::BitVector<2> small_object_map;
+
+  C_SaferCond ctx;
+  ceph::BitVector<2> object_map;
+  MockRefreshRequest *req = new MockRefreshRequest(mock_image_ctx, &object_map,
+                                                   TEST_SNAP_ID, &ctx);
+  // Scripted sequence: size lookup, load of the small map, invalidate, resize -> -ESTALE, size lookup.
+  InSequence seq;
+  expect_get_image_size(mock_image_ctx, mock_image_ctx.image_ctx->size);
+  expect_object_map_load(mock_image_ctx, &small_object_map, 0);
+
+  MockInvalidateRequest invalidate_request;
+  expect_invalidate_request(mock_image_ctx, invalidate_request);
+  expect_object_map_resize(mock_image_ctx, on_disk_object_map.size(), -ESTALE);
+  expect_get_image_size(mock_image_ctx, mock_image_ctx.image_ctx->size);
+
+  req->send();
+  ASSERT_EQ(0, ctx.wait());  // the injected -ESTALE is not propagated to the completion
+}
+
+} // namespace object_map
+} // namespace librbd
+
diff --git a/src/test/librbd/object_map/test_mock_ResizeRequest.cc b/src/test/librbd/object_map/test_mock_ResizeRequest.cc
new file mode 100644
index 0000000..197fabe
--- /dev/null
+++ b/src/test/librbd/object_map/test_mock_ResizeRequest.cc
@@ -0,0 +1,144 @@
+// -*- mode:C; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "test/librbd/test_mock_fixture.h"
+#include "test/librbd/test_support.h"
+#include "test/librados_test_stub/MockTestMemIoCtxImpl.h"
+#include "common/bit_vector.hpp"
+#include "librbd/internal.h"
+#include "librbd/ObjectMap.h"
+#include "librbd/object_map/ResizeRequest.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+namespace librbd {
+namespace object_map {
+
+using ::testing::_;
+using ::testing::DoDefault;
+using ::testing::Return;
+
+class TestMockObjectMapResizeRequest : public TestMockFixture {  // fixture with helpers scripting object map resize calls
+public:
+  void expect_resize(librbd::ImageCtx *ictx, uint64_t snap_id, int r) {  // one "object_map_resize" exec; r < 0 injects that error
+    std::string oid(ObjectMap::object_map_name(ictx->id, snap_id));
+    if (snap_id == CEPH_NOSNAP) {  // only the CEPH_NOSNAP case also expects an "assert_locked" exec
+      EXPECT_CALL(get_mock_io_ctx(ictx->md_ctx),
+                  exec(oid, _, "lock", "assert_locked", _, _, _))
+                    .WillOnce(DoDefault());
+    }
+
+    if (r < 0) {
+      EXPECT_CALL(get_mock_io_ctx(ictx->md_ctx),
+                  exec(oid, _, "rbd", "object_map_resize", _, _, _))
+                    .WillOnce(Return(r));
+    } else {
+      EXPECT_CALL(get_mock_io_ctx(ictx->md_ctx),
+                  exec(oid, _, "rbd", "object_map_resize", _, _, _))
+                    .WillOnce(DoDefault());
+    }
+  }
+  // Expects a single "set_flags" exec on the header object and no "assert_locked" exec on it.
+  void expect_invalidate(librbd::ImageCtx *ictx) {
+    EXPECT_CALL(get_mock_io_ctx(ictx->md_ctx),
+                exec(ictx->header_oid, _, "lock", "assert_locked", _, _, _)).Times(0);
+    EXPECT_CALL(get_mock_io_ctx(ictx->md_ctx),
+                exec(ictx->header_oid, _, "rbd", "set_flags", _, _, _))
+                  .WillOnce(DoDefault());
+  }
+};
+
+TEST_F(TestMockObjectMapResizeRequest, UpdateInMemory) {  // checks the in-memory bit vector after a resize request
+  REQUIRE_FEATURE(RBD_FEATURE_OBJECT_MAP);
+
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+  ASSERT_EQ(0, acquire_exclusive_lock(*ictx));
+
+  ceph::BitVector<2> object_map;
+  object_map.resize(1);
+  // The requested new size equals the current size (1); OBJECT_EXISTS is passed as the state argument.
+  C_SaferCond cond_ctx;
+  AsyncRequest<> *req = new ResizeRequest(
+    *ictx, &object_map, CEPH_NOSNAP, object_map.size(), OBJECT_EXISTS,
+    &cond_ctx);
+  req->send();
+  ASSERT_EQ(0, cond_ctx.wait());
+  // NOTE(review): if the map still has one entry here, only the i == 0 (OBJECT_NONEXISTENT) branch is ever checked.
+  for (uint64_t i = 0; i < object_map.size(); ++i) {
+    ASSERT_EQ(i == 0 ? OBJECT_NONEXISTENT : OBJECT_EXISTS,
+              object_map[i]);
+  }
+}
+
+TEST_F(TestMockObjectMapResizeRequest, UpdateHeadOnDisk) {  // CEPH_NOSNAP resize with the exclusive lock acquired
+  REQUIRE_FEATURE(RBD_FEATURE_OBJECT_MAP);
+
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+  ASSERT_EQ(0, acquire_exclusive_lock(*ictx));
+  // Expect "assert_locked" plus a successful "object_map_resize" on the CEPH_NOSNAP object map object.
+  expect_resize(ictx, CEPH_NOSNAP, 0);
+
+  ceph::BitVector<2> object_map;
+  object_map.resize(1);
+
+  C_SaferCond cond_ctx;
+  AsyncRequest<> *req = new ResizeRequest(
+    *ictx, &object_map, CEPH_NOSNAP, object_map.size(), OBJECT_EXISTS,
+    &cond_ctx);
+  req->send();
+  ASSERT_EQ(0, cond_ctx.wait());
+
+  expect_unlock_exclusive_lock(*ictx);
+}
+
+TEST_F(TestMockObjectMapResizeRequest, UpdateSnapOnDisk) {  // resize addressed to a snapshot's object map
+  REQUIRE_FEATURE(RBD_FEATURE_OBJECT_MAP);
+
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+  ASSERT_EQ(0, librbd::snap_create(ictx, "snap1"));
+  ASSERT_EQ(0, librbd::snap_set(ictx, "snap1"));
+  // With a snapshot id, expect_resize sets no "assert_locked" expectation, only the resize exec.
+  uint64_t snap_id = ictx->snap_id;
+  expect_resize(ictx, snap_id, 0);
+
+  ceph::BitVector<2> object_map;
+  object_map.resize(1);
+
+  C_SaferCond cond_ctx;
+  AsyncRequest<> *req = new ResizeRequest(
+    *ictx, &object_map, snap_id, object_map.size(), OBJECT_EXISTS,
+    &cond_ctx);
+  req->send();
+  ASSERT_EQ(0, cond_ctx.wait());
+  // NOTE(review): this test never calls acquire_exclusive_lock(); confirm the unlock expectation below is intended.
+  expect_unlock_exclusive_lock(*ictx);
+}
+
+TEST_F(TestMockObjectMapResizeRequest, UpdateOnDiskError) {  // resize exec fails with -EINVAL; an invalidate is expected
+  REQUIRE_FEATURE(RBD_FEATURE_OBJECT_MAP);
+
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+  ASSERT_EQ(0, acquire_exclusive_lock(*ictx));
+  // Inject -EINVAL into "object_map_resize" and expect the header "set_flags" exec that follows.
+  expect_resize(ictx, CEPH_NOSNAP, -EINVAL);
+  expect_invalidate(ictx);
+
+  ceph::BitVector<2> object_map;
+  object_map.resize(1);
+
+  C_SaferCond cond_ctx;
+  AsyncRequest<> *req = new ResizeRequest(
+    *ictx, &object_map, CEPH_NOSNAP, object_map.size(), OBJECT_EXISTS,
+    &cond_ctx);
+  req->send();
+  ASSERT_EQ(0, cond_ctx.wait());  // the injected -EINVAL is not propagated to the completion
+
+  expect_unlock_exclusive_lock(*ictx);
+}
+
+} // namespace object_map
+} // namespace librbd
diff --git a/src/test/librbd/object_map/test_mock_SnapshotCreateRequest.cc b/src/test/librbd/object_map/test_mock_SnapshotCreateRequest.cc
new file mode 100644
index 0000000..e76c192
--- /dev/null
+++ b/src/test/librbd/object_map/test_mock_SnapshotCreateRequest.cc
@@ -0,0 +1,221 @@
+// -*- mode:C; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "test/librbd/test_mock_fixture.h"
+#include "test/librbd/test_support.h"
+#include "test/librados_test_stub/MockTestMemIoCtxImpl.h"
+#include "common/bit_vector.hpp"
+#include "librbd/internal.h"
+#include "librbd/ObjectMap.h"
+#include "librbd/object_map/SnapshotCreateRequest.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+namespace librbd {
+namespace object_map {
+
+using ::testing::_;
+using ::testing::DoDefault;
+using ::testing::Return;
+
+class TestMockObjectMapSnapshotCreateRequest : public TestMockFixture {  // fixture with helpers scripting snapshot object map creation
+public:
+  void inject_snap_info(librbd::ImageCtx *ictx, uint64_t snap_id) {  // registers a snapshot called "snap name" with id snap_id on the ImageCtx
+    RWLock::WLocker snap_locker(ictx->snap_lock);
+    RWLock::RLocker parent_locker(ictx->parent_lock);
+    ictx->add_snap("snap name", snap_id, ictx->size, ictx->parent_md,
+                   RBD_PROTECTION_STATUS_UNPROTECTED, 0);
+  }
+  // Expects one read(oid, 0, 0, _) of the CEPH_NOSNAP object map object; r < 0 injects that error, otherwise the default action runs.
+  void expect_read_map(librbd::ImageCtx *ictx, int r) {
+    if (r < 0) {
+      EXPECT_CALL(get_mock_io_ctx(ictx->md_ctx),
+                  read(ObjectMap::object_map_name(ictx->id, CEPH_NOSNAP),
+                       0, 0, _)).WillOnce(Return(r));
+    } else {
+      EXPECT_CALL(get_mock_io_ctx(ictx->md_ctx),
+                  read(ObjectMap::object_map_name(ictx->id, CEPH_NOSNAP),
+                       0, 0, _)).WillOnce(DoDefault());
+    }
+  }
+  // Expects one write_full to snap_id's object map object; r < 0 injects that error, otherwise the default action runs.
+  void expect_write_map(librbd::ImageCtx *ictx, uint64_t snap_id, int r) {
+    if (r < 0) {
+      EXPECT_CALL(get_mock_io_ctx(ictx->md_ctx),
+                  write_full(
+                    ObjectMap::object_map_name(ictx->id, snap_id), _, _))
+                  .WillOnce(Return(r));
+    } else {
+      EXPECT_CALL(get_mock_io_ctx(ictx->md_ctx),
+                  write_full(
+                    ObjectMap::object_map_name(ictx->id, snap_id), _, _))
+                  .WillOnce(DoDefault());
+    }
+  }
+  // On r < 0 the error is injected at the "assert_locked" exec and no "object_map_snap_add" expectation is set; otherwise both execs run their defaults.
+  void expect_add_snapshot(librbd::ImageCtx *ictx, int r) {
+    std::string oid(ObjectMap::object_map_name(ictx->id, CEPH_NOSNAP));
+    if (r < 0) {
+      EXPECT_CALL(get_mock_io_ctx(ictx->md_ctx),
+                  exec(oid, _, "lock", "assert_locked", _, _, _))
+                    .WillOnce(Return(r));
+    } else {
+      EXPECT_CALL(get_mock_io_ctx(ictx->md_ctx),
+                  exec(oid, _, "lock", "assert_locked", _, _, _))
+                    .WillOnce(DoDefault());
+      EXPECT_CALL(get_mock_io_ctx(ictx->md_ctx),
+                  exec(oid, _, "rbd", "object_map_snap_add", _, _, _))
+                    .WillOnce(DoDefault());
+    }
+  }
+  // Expects a single "set_flags" exec on the header object, running the default action.
+  void expect_invalidate(librbd::ImageCtx *ictx) {
+    EXPECT_CALL(get_mock_io_ctx(ictx->md_ctx),
+                exec(ictx->header_oid, _, "rbd", "set_flags", _, _, _))
+                  .WillOnce(DoDefault());
+  }
+};
+
+TEST_F(TestMockObjectMapSnapshotCreateRequest, Success) {  // read, write and (with fast-diff) snap-add all succeed
+  REQUIRE_FEATURE(RBD_FEATURE_OBJECT_MAP);
+
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+  ASSERT_EQ(0, acquire_exclusive_lock(*ictx));
+
+  ceph::BitVector<2> object_map;
+  // The snap-add expectation is only set when the image has RBD_FEATURE_FAST_DIFF.
+  uint64_t snap_id = 1;
+  inject_snap_info(ictx, snap_id);
+  expect_read_map(ictx, 0);
+  expect_write_map(ictx, snap_id, 0);
+  if (ictx->test_features(RBD_FEATURE_FAST_DIFF)) {
+    expect_add_snapshot(ictx, 0);
+  }
+
+  C_SaferCond cond_ctx;
+  AsyncRequest<> *request = new SnapshotCreateRequest(
+    *ictx, &object_map, snap_id, &cond_ctx);
+  {
+    RWLock::RLocker snap_locker(ictx->snap_lock);  // send() is issued while holding snap_lock for read
+    request->send();
+  }
+  ASSERT_EQ(0, cond_ctx.wait());
+
+  expect_unlock_exclusive_lock(*ictx);
+}
+
+TEST_F(TestMockObjectMapSnapshotCreateRequest, ReadMapError) {  // reading the source map fails; an invalidate is expected
+  REQUIRE_FEATURE(RBD_FEATURE_OBJECT_MAP);
+
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+  ASSERT_EQ(0, acquire_exclusive_lock(*ictx));
+
+  ceph::BitVector<2> object_map;
+  // Inject -ENOENT into the read and expect the header "set_flags" exec; no write expectation is set.
+  uint64_t snap_id = 1;
+  inject_snap_info(ictx, snap_id);
+  expect_read_map(ictx, -ENOENT);
+  expect_invalidate(ictx);
+
+  C_SaferCond cond_ctx;
+  AsyncRequest<> *request = new SnapshotCreateRequest(
+    *ictx, &object_map, snap_id, &cond_ctx);
+  {
+    RWLock::RLocker snap_locker(ictx->snap_lock);
+    request->send();
+  }
+  ASSERT_EQ(0, cond_ctx.wait());  // the injected -ENOENT is not propagated to the completion
+
+  expect_unlock_exclusive_lock(*ictx);
+}
+
+TEST_F(TestMockObjectMapSnapshotCreateRequest, WriteMapError) {  // writing the snapshot map fails; an invalidate is expected
+  REQUIRE_FEATURE(RBD_FEATURE_OBJECT_MAP);
+
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+  ASSERT_EQ(0, acquire_exclusive_lock(*ictx));
+
+  ceph::BitVector<2> object_map;
+  // The read succeeds, write_full is forced to return -EINVAL, then the header "set_flags" exec is expected.
+  uint64_t snap_id = 1;
+  inject_snap_info(ictx, snap_id);
+  expect_read_map(ictx, 0);
+  expect_write_map(ictx, snap_id, -EINVAL);
+  expect_invalidate(ictx);
+
+  C_SaferCond cond_ctx;
+  AsyncRequest<> *request = new SnapshotCreateRequest(
+    *ictx, &object_map, snap_id, &cond_ctx);
+  {
+    RWLock::RLocker snap_locker(ictx->snap_lock);
+    request->send();
+  }
+  ASSERT_EQ(0, cond_ctx.wait());  // the injected -EINVAL is not propagated to the completion
+
+  expect_unlock_exclusive_lock(*ictx);
+}
+
+TEST_F(TestMockObjectMapSnapshotCreateRequest, AddSnapshotError) {  // the snap-add step fails; an invalidate is expected
+  REQUIRE_FEATURE(RBD_FEATURE_FAST_DIFF);
+
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+  ASSERT_EQ(0, acquire_exclusive_lock(*ictx));
+
+  ceph::BitVector<2> object_map;
+  // Read and write succeed; expect_add_snapshot injects -EINVAL at the "assert_locked" exec, then "set_flags" is expected.
+  uint64_t snap_id = 1;
+  inject_snap_info(ictx, snap_id);
+  expect_read_map(ictx, 0);
+  expect_write_map(ictx, snap_id, 0);
+  expect_add_snapshot(ictx, -EINVAL);
+  expect_invalidate(ictx);
+
+  C_SaferCond cond_ctx;
+  AsyncRequest<> *request = new SnapshotCreateRequest(
+    *ictx, &object_map, snap_id, &cond_ctx);
+  {
+    RWLock::RLocker snap_locker(ictx->snap_lock);
+    request->send();
+  }
+  ASSERT_EQ(0, cond_ctx.wait());  // the injected -EINVAL is not propagated to the completion
+
+  expect_unlock_exclusive_lock(*ictx);
+}
+
+TEST_F(TestMockObjectMapSnapshotCreateRequest, FlagCleanObjects) {  // OBJECT_EXISTS entries must become OBJECT_EXISTS_CLEAN
+  REQUIRE_FEATURE(RBD_FEATURE_FAST_DIFF);
+
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+  ASSERT_EQ(0, acquire_exclusive_lock(*ictx));
+  // Seed 1024 entries: even indexes OBJECT_EXISTS, odd indexes OBJECT_NONEXISTENT.
+  ceph::BitVector<2> object_map;
+  object_map.resize(1024);
+  for (uint64_t i = 0; i < object_map.size(); ++i) {
+    object_map[i] = i % 2 == 0 ? OBJECT_EXISTS : OBJECT_NONEXISTENT;
+  }
+  // No explicit read/write expectations are set in this test.
+  uint64_t snap_id = 1;
+  inject_snap_info(ictx, snap_id);
+
+  C_SaferCond cond_ctx;
+  AsyncRequest<> *request = new SnapshotCreateRequest(
+    *ictx, &object_map, snap_id, &cond_ctx);
+  {
+    RWLock::RLocker snap_locker(ictx->snap_lock);
+    request->send();
+  }
+  ASSERT_EQ(0, cond_ctx.wait());
+  // Even-indexed entries must now read OBJECT_EXISTS_CLEAN; odd-indexed entries stay OBJECT_NONEXISTENT.
+  for (uint64_t i = 0; i < object_map.size(); ++i) {
+    ASSERT_EQ(i % 2 == 0 ? OBJECT_EXISTS_CLEAN : OBJECT_NONEXISTENT,
+              object_map[i]);
+  }
+}
+
+} // namespace object_map
+} // namespace librbd
diff --git a/src/test/librbd/object_map/test_mock_SnapshotRemoveRequest.cc b/src/test/librbd/object_map/test_mock_SnapshotRemoveRequest.cc
new file mode 100644
index 0000000..224a43d
--- /dev/null
+++ b/src/test/librbd/object_map/test_mock_SnapshotRemoveRequest.cc
@@ -0,0 +1,276 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "test/librbd/test_mock_fixture.h"
+#include "test/librbd/test_support.h"
+#include "test/librados_test_stub/MockTestMemIoCtxImpl.h"
+#include "common/bit_vector.hpp"
+#include "librbd/ImageState.h"
+#include "librbd/internal.h"
+#include "librbd/ObjectMap.h"
+#include "librbd/object_map/SnapshotRemoveRequest.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+namespace librbd {
+namespace object_map {
+
+using ::testing::_;
+using ::testing::DoDefault;
+using ::testing::Return;
+
+class TestMockObjectMapSnapshotRemoveRequest : public TestMockFixture {
+public:  // helpers that register gmock expectations on the image's md_ctx
+  void expect_load_map(librbd::ImageCtx *ictx, uint64_t snap_id, int r) {
+    std::string snap_oid(ObjectMap::object_map_name(ictx->id, snap_id));
+    if (r < 0) {  // fail the snap map's "object_map_load" exec with r
+      EXPECT_CALL(get_mock_io_ctx(ictx->md_ctx),
+                  exec(snap_oid, _, "rbd", "object_map_load", _, _, _))
+                    .WillOnce(Return(r));
+    } else {  // otherwise let the mock's default action run once
+      EXPECT_CALL(get_mock_io_ctx(ictx->md_ctx),
+                  exec(snap_oid, _, "rbd", "object_map_load", _, _, _))
+                    .WillOnce(DoDefault());
+    }
+  }
+  // Head map: r < 0 fails "assert_locked" only; else expects it + snap_remove.
+  void expect_remove_snapshot(librbd::ImageCtx *ictx, int r) {
+    std::string oid(ObjectMap::object_map_name(ictx->id, CEPH_NOSNAP));
+    if (r < 0) {
+      EXPECT_CALL(get_mock_io_ctx(ictx->md_ctx),
+                  exec(oid, _, "lock", "assert_locked", _, _, _))
+                    .WillOnce(Return(r));
+    } else {
+      EXPECT_CALL(get_mock_io_ctx(ictx->md_ctx),
+                  exec(oid, _, "lock", "assert_locked", _, _, _))
+                    .WillOnce(DoDefault());
+      EXPECT_CALL(get_mock_io_ctx(ictx->md_ctx),
+                  exec(oid, _, "rbd", "object_map_snap_remove", _, _, _))
+                    .WillOnce(DoDefault());
+    }
+  }
+  // Expects one remove() of the snapshot's map object; returns r when r < 0.
+  void expect_remove_map(librbd::ImageCtx *ictx, uint64_t snap_id, int r) {
+    std::string snap_oid(ObjectMap::object_map_name(ictx->id, snap_id));
+    if (r < 0) {
+      EXPECT_CALL(get_mock_io_ctx(ictx->md_ctx), remove(snap_oid))
+                    .WillOnce(Return(r));
+    } else {
+      EXPECT_CALL(get_mock_io_ctx(ictx->md_ctx), remove(snap_oid))
+                    .WillOnce(DoDefault());
+    }
+  }
+  // Expects one header "set_flags" exec and zero header "assert_locked" execs.
+  void expect_invalidate(librbd::ImageCtx *ictx) {
+    EXPECT_CALL(get_mock_io_ctx(ictx->md_ctx),
+                exec(ictx->header_oid, _, "lock", "assert_locked", _, _, _))
+                  .Times(0);
+    EXPECT_CALL(get_mock_io_ctx(ictx->md_ctx),
+                exec(ictx->header_oid, _, "rbd", "set_flags", _, _, _))
+                  .WillOnce(DoDefault());
+  }
+};
+
+TEST_F(TestMockObjectMapSnapshotRemoveRequest, Success) {
+  REQUIRE_FEATURE(RBD_FEATURE_OBJECT_MAP);
+  // Case: every primed step succeeds and the request completes with 0.
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+  ASSERT_EQ(0, librbd::snap_create(ictx, "snap1"));
+  ASSERT_EQ(0, ictx->state->refresh_if_required());
+
+  uint64_t snap_id = ictx->snap_info.rbegin()->first;
+  if (ictx->test_features(RBD_FEATURE_FAST_DIFF)) {
+    expect_load_map(ictx, snap_id, 0);
+    expect_remove_snapshot(ictx, 0);
+  }
+  expect_remove_map(ictx, snap_id, 0);
+  // send() is issued with owner_lock read-held and snap_lock write-held
+  ceph::BitVector<2> object_map;
+  C_SaferCond cond_ctx;
+  AsyncRequest<> *request = new SnapshotRemoveRequest(
+    *ictx, &object_map, snap_id, &cond_ctx);
+  {
+    RWLock::RLocker owner_locker(ictx->owner_lock);
+    RWLock::WLocker snap_locker(ictx->snap_lock);
+    request->send();
+  }
+  ASSERT_EQ(0, cond_ctx.wait());
+
+  expect_unlock_exclusive_lock(*ictx);
+}
+
+TEST_F(TestMockObjectMapSnapshotRemoveRequest, LoadMapError) {
+  REQUIRE_FEATURE(RBD_FEATURE_FAST_DIFF);
+  // Case: the map load fails with -EINVAL; the request still completes with 0.
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+  ASSERT_EQ(0, librbd::snap_create(ictx, "snap1"));
+  ASSERT_EQ(0, ictx->state->refresh_if_required());
+
+  uint64_t snap_id = ictx->snap_info.rbegin()->first;
+  expect_load_map(ictx, snap_id, -EINVAL);
+  expect_invalidate(ictx);
+  expect_remove_map(ictx, snap_id, 0);  // map object removal still expected
+
+  ceph::BitVector<2> object_map;
+  C_SaferCond cond_ctx;
+  AsyncRequest<> *request = new SnapshotRemoveRequest(
+    *ictx, &object_map, snap_id, &cond_ctx);
+  {
+    RWLock::RLocker owner_locker(ictx->owner_lock);
+    RWLock::WLocker snap_locker(ictx->snap_lock);
+    request->send();
+  }
+  ASSERT_EQ(0, cond_ctx.wait());
+
+  expect_unlock_exclusive_lock(*ictx);
+}
+
+TEST_F(TestMockObjectMapSnapshotRemoveRequest, RemoveSnapshotMissing) {
+  REQUIRE_FEATURE(RBD_FEATURE_FAST_DIFF);
+  // Case: remove-snapshot step primed with -ENOENT; request completes with 0.
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+  ASSERT_EQ(0, librbd::snap_create(ictx, "snap1"));
+  ASSERT_EQ(0, ictx->state->refresh_if_required());
+
+  uint64_t snap_id = ictx->snap_info.rbegin()->first;
+  expect_load_map(ictx, snap_id, 0);
+  expect_remove_snapshot(ictx, -ENOENT);  // no expect_invalidate() here
+  expect_remove_map(ictx, snap_id, 0);
+
+  ceph::BitVector<2> object_map;
+  C_SaferCond cond_ctx;
+  AsyncRequest<> *request = new SnapshotRemoveRequest(
+    *ictx, &object_map, snap_id, &cond_ctx);
+  {
+    RWLock::RLocker owner_locker(ictx->owner_lock);
+    RWLock::WLocker snap_locker(ictx->snap_lock);
+    request->send();
+  }
+  ASSERT_EQ(0, cond_ctx.wait());
+
+  expect_unlock_exclusive_lock(*ictx);
+}
+
+TEST_F(TestMockObjectMapSnapshotRemoveRequest, RemoveSnapshotError) {
+  REQUIRE_FEATURE(RBD_FEATURE_FAST_DIFF);
+  // Case: remove-snapshot step primed with -EINVAL; request completes with 0.
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+  ASSERT_EQ(0, librbd::snap_create(ictx, "snap1"));
+  ASSERT_EQ(0, ictx->state->refresh_if_required());
+
+  uint64_t snap_id = ictx->snap_info.rbegin()->first;
+  expect_load_map(ictx, snap_id, 0);
+  expect_remove_snapshot(ictx, -EINVAL);
+  expect_invalidate(ictx);  // unlike -ENOENT, this error expects set_flags
+  expect_remove_map(ictx, snap_id, 0);
+
+  ceph::BitVector<2> object_map;
+  C_SaferCond cond_ctx;
+  AsyncRequest<> *request = new SnapshotRemoveRequest(
+    *ictx, &object_map, snap_id, &cond_ctx);
+  {
+    RWLock::RLocker owner_locker(ictx->owner_lock);
+    RWLock::WLocker snap_locker(ictx->snap_lock);
+    request->send();
+  }
+  ASSERT_EQ(0, cond_ctx.wait());
+
+  expect_unlock_exclusive_lock(*ictx);
+}
+
+TEST_F(TestMockObjectMapSnapshotRemoveRequest, RemoveMapMissing) {
+  REQUIRE_FEATURE(RBD_FEATURE_OBJECT_MAP);
+  // Case: remove() of the snap map returns -ENOENT; request completes with 0.
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+  ASSERT_EQ(0, librbd::snap_create(ictx, "snap1"));
+  ASSERT_EQ(0, ictx->state->refresh_if_required());
+
+  uint64_t snap_id = ictx->snap_info.rbegin()->first;
+  if (ictx->test_features(RBD_FEATURE_FAST_DIFF)) {
+    expect_load_map(ictx, snap_id, 0);
+    expect_remove_snapshot(ictx, 0);
+  }
+  expect_remove_map(ictx, snap_id, -ENOENT);
+
+  ceph::BitVector<2> object_map;
+  C_SaferCond cond_ctx;
+  AsyncRequest<> *request = new SnapshotRemoveRequest(
+    *ictx, &object_map, snap_id, &cond_ctx);
+  {
+    RWLock::RLocker owner_locker(ictx->owner_lock);
+    RWLock::WLocker snap_locker(ictx->snap_lock);
+    request->send();
+  }
+  ASSERT_EQ(0, cond_ctx.wait());
+
+  expect_unlock_exclusive_lock(*ictx);
+}
+
+TEST_F(TestMockObjectMapSnapshotRemoveRequest, RemoveMapError) {
+  REQUIRE_FEATURE(RBD_FEATURE_OBJECT_MAP);
+  // Case: remove() of the snap map returns -EINVAL, which the request reports.
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+  ASSERT_EQ(0, librbd::snap_create(ictx, "snap1"));
+  ASSERT_EQ(0, ictx->state->refresh_if_required());
+
+  uint64_t snap_id = ictx->snap_info.rbegin()->first;
+  if (ictx->test_features(RBD_FEATURE_FAST_DIFF)) {
+    expect_load_map(ictx, snap_id, 0);
+    expect_remove_snapshot(ictx, 0);
+  }
+  expect_remove_map(ictx, snap_id, -EINVAL);
+
+  ceph::BitVector<2> object_map;
+  C_SaferCond cond_ctx;
+  AsyncRequest<> *request = new SnapshotRemoveRequest(
+    *ictx, &object_map, snap_id, &cond_ctx);
+  {
+    RWLock::RLocker owner_locker(ictx->owner_lock);
+    RWLock::WLocker snap_locker(ictx->snap_lock);
+    request->send();
+  }
+  ASSERT_EQ(-EINVAL, cond_ctx.wait());  // the only case here expecting failure
+
+  expect_unlock_exclusive_lock(*ictx);
+}
+
+TEST_F(TestMockObjectMapSnapshotRemoveRequest, ScrubCleanObjects) {
+  REQUIRE_FEATURE(RBD_FEATURE_FAST_DIFF);
+  // Case: after the request, OBJECT_EXISTS_CLEAN entries read OBJECT_EXISTS.
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+  ASSERT_EQ(0, librbd::snap_create(ictx, "snap1"));
+  ASSERT_EQ(0, ictx->state->refresh_if_required());
+
+  uint64_t snap_id = ictx->snap_info.rbegin()->first;
+  // only slots [512, 1024) are seeded here and verified below
+  ceph::BitVector<2> object_map;
+  object_map.resize(1024);
+  for (uint64_t i = 512; i < object_map.size(); ++i) {
+    object_map[i] = i % 2 == 0 ? OBJECT_EXISTS_CLEAN : OBJECT_NONEXISTENT;
+  }
+
+  C_SaferCond cond_ctx;
+  AsyncRequest<> *request = new SnapshotRemoveRequest(
+    *ictx, &object_map, snap_id, &cond_ctx);
+  {
+    RWLock::RLocker owner_locker(ictx->owner_lock);
+    RWLock::WLocker snap_locker(ictx->snap_lock);
+    request->send();
+  }
+  ASSERT_EQ(0, cond_ctx.wait());
+  // even slots must now be OBJECT_EXISTS; odd slots stay OBJECT_NONEXISTENT
+  for (uint64_t i = 512; i < object_map.size(); ++i) {
+    ASSERT_EQ(i % 2 == 0 ? OBJECT_EXISTS : OBJECT_NONEXISTENT,
+              object_map[i]);
+  }
+}
+
+} // namespace object_map
+} // namespace librbd
diff --git a/src/test/librbd/object_map/test_mock_SnapshotRollbackRequest.cc b/src/test/librbd/object_map/test_mock_SnapshotRollbackRequest.cc
new file mode 100644
index 0000000..25c615b
--- /dev/null
+++ b/src/test/librbd/object_map/test_mock_SnapshotRollbackRequest.cc
@@ -0,0 +1,143 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "test/librbd/test_mock_fixture.h"
+#include "test/librbd/test_support.h"
+#include "test/librados_test_stub/MockTestMemIoCtxImpl.h"
+#include "librbd/ImageState.h"
+#include "librbd/internal.h"
+#include "librbd/ObjectMap.h"
+#include "librbd/object_map/SnapshotRollbackRequest.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+namespace librbd {
+namespace object_map {
+
+using ::testing::_;
+using ::testing::DoDefault;
+using ::testing::Return;
+
+class TestMockObjectMapSnapshotRollbackRequest : public TestMockFixture {
+public:  // helpers that register gmock expectations on the image's md_ctx
+  void expect_read_map(librbd::ImageCtx *ictx, uint64_t snap_id, int r) {
+    if (r < 0) {  // fail the read(…, 0, 0, _) of the snapshot's map with r
+      EXPECT_CALL(get_mock_io_ctx(ictx->md_ctx),
+                  read(ObjectMap::object_map_name(ictx->id, snap_id),
+                       0, 0, _)).WillOnce(Return(r));
+    } else {  // otherwise let the mock's default action run once
+      EXPECT_CALL(get_mock_io_ctx(ictx->md_ctx),
+                  read(ObjectMap::object_map_name(ictx->id, snap_id),
+                       0, 0, _)).WillOnce(DoDefault());
+    }
+  }
+  // Head map: one default "assert_locked", then write_full failing with r < 0.
+  void expect_write_map(librbd::ImageCtx *ictx, int r) {
+    EXPECT_CALL(get_mock_io_ctx(ictx->md_ctx),
+                exec(ObjectMap::object_map_name(ictx->id, CEPH_NOSNAP), _,
+		     "lock", "assert_locked", _, _, _))
+                  .WillOnce(DoDefault());
+    if (r < 0) {
+      EXPECT_CALL(get_mock_io_ctx(ictx->md_ctx),
+                  write_full(
+                    ObjectMap::object_map_name(ictx->id, CEPH_NOSNAP), _, _))
+                  .WillOnce(Return(r));
+    } else {
+      EXPECT_CALL(get_mock_io_ctx(ictx->md_ctx),
+                  write_full(
+                    ObjectMap::object_map_name(ictx->id, CEPH_NOSNAP), _, _))
+                  .WillOnce(DoDefault());
+    }
+  }
+  // Expects `times` header "set_flags" execs and zero header "assert_locked".
+  void expect_invalidate(librbd::ImageCtx *ictx, uint32_t times) {
+    EXPECT_CALL(get_mock_io_ctx(ictx->md_ctx),
+                exec(ictx->header_oid, _, "lock", "assert_locked", _, _, _))
+                  .Times(0);
+    EXPECT_CALL(get_mock_io_ctx(ictx->md_ctx),
+                exec(ictx->header_oid, _, "rbd", "set_flags", _, _, _))
+                  .Times(times)
+                  .WillRepeatedly(DoDefault());
+  }
+};
+
+TEST_F(TestMockObjectMapSnapshotRollbackRequest, Success) {
+  REQUIRE_FEATURE(RBD_FEATURE_OBJECT_MAP);
+  // Case: snapshot map read and head map write both succeed; result is 0.
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+  ASSERT_EQ(0, librbd::snap_create(ictx, "snap1"));
+  ASSERT_EQ(0, ictx->state->refresh_if_required());
+
+  uint64_t snap_id = ictx->snap_info.rbegin()->first;
+  expect_read_map(ictx, snap_id, 0);
+  expect_write_map(ictx, 0);
+  // unlike the sibling request tests, send() is called without taking locks
+  C_SaferCond cond_ctx;
+  AsyncRequest<> *request = new SnapshotRollbackRequest(
+    *ictx, snap_id, &cond_ctx);
+  request->send();
+  ASSERT_EQ(0, cond_ctx.wait());
+
+  expect_unlock_exclusive_lock(*ictx);
+}
+
+TEST_F(TestMockObjectMapSnapshotRollbackRequest, ReadMapError) {
+  REQUIRE_FEATURE(RBD_FEATURE_OBJECT_MAP);
+  // Case: snapshot map read fails with -ENOENT; request completes with 0.
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+  ASSERT_EQ(0, librbd::snap_create(ictx, "snap1"));
+  ASSERT_EQ(0, ictx->state->refresh_if_required());
+
+  uint64_t snap_id = ictx->snap_info.rbegin()->first;
+  expect_read_map(ictx, snap_id, -ENOENT);
+  expect_invalidate(ictx, 2);  // two "set_flags" execs expected
+
+  C_SaferCond cond_ctx;
+  AsyncRequest<> *request = new SnapshotRollbackRequest(
+    *ictx, snap_id, &cond_ctx);
+  request->send();
+  ASSERT_EQ(0, cond_ctx.wait());
+  // the snapshot's flags must carry the INVALID bit ...
+  {
+    RWLock::RLocker snap_locker(ictx->snap_lock);
+    uint64_t flags;
+    ASSERT_EQ(0, ictx->get_flags(snap_id, &flags));
+    ASSERT_NE(0U, flags & RBD_FLAG_OBJECT_MAP_INVALID);
+  }
+  ASSERT_TRUE(ictx->test_flags(RBD_FLAG_OBJECT_MAP_INVALID));  // ... and ictx's
+  expect_unlock_exclusive_lock(*ictx);
+}
+
+TEST_F(TestMockObjectMapSnapshotRollbackRequest, WriteMapError) {
+  REQUIRE_FEATURE(RBD_FEATURE_OBJECT_MAP);
+  // Case: head map write_full fails with -EINVAL; request completes with 0.
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+  ASSERT_EQ(0, librbd::snap_create(ictx, "snap1"));
+  ASSERT_EQ(0, ictx->state->refresh_if_required());
+
+  uint64_t snap_id = ictx->snap_info.rbegin()->first;
+  expect_read_map(ictx, snap_id, 0);
+  expect_write_map(ictx, -EINVAL);
+  expect_invalidate(ictx, 1);  // a single "set_flags" exec expected
+
+  C_SaferCond cond_ctx;
+  AsyncRequest<> *request = new SnapshotRollbackRequest(
+    *ictx, snap_id, &cond_ctx);
+  request->send();
+  ASSERT_EQ(0, cond_ctx.wait());
+  // the snapshot's flags must NOT carry the INVALID bit ...
+  {
+    RWLock::RLocker snap_locker(ictx->snap_lock);
+    uint64_t flags;
+    ASSERT_EQ(0, ictx->get_flags(snap_id, &flags));
+    ASSERT_EQ(0U, flags & RBD_FLAG_OBJECT_MAP_INVALID);
+  }
+  ASSERT_TRUE(ictx->test_flags(RBD_FLAG_OBJECT_MAP_INVALID));  // ... ictx's do
+  expect_unlock_exclusive_lock(*ictx);
+}
+
+} // namespace object_map
+} // namespace librbd
diff --git a/src/test/librbd/object_map/test_mock_UnlockRequest.cc b/src/test/librbd/object_map/test_mock_UnlockRequest.cc
new file mode 100644
index 0000000..4cd2db4
--- /dev/null
+++ b/src/test/librbd/object_map/test_mock_UnlockRequest.cc
@@ -0,0 +1,67 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "test/librbd/test_mock_fixture.h"
+#include "test/librbd/test_support.h"
+#include "test/librbd/mock/MockImageCtx.h"
+#include "test/librados_test_stub/MockTestMemIoCtxImpl.h"
+#include "cls/lock/cls_lock_ops.h"
+#include "librbd/ObjectMap.h"
+#include "librbd/object_map/UnlockRequest.h"
+
+// template definitions
+#include "librbd/object_map/UnlockRequest.cc"
+
+namespace librbd {
+namespace object_map {
+
+using ::testing::_;
+using ::testing::InSequence;
+using ::testing::Return;
+
+class TestMockObjectMapUnlockRequest : public TestMockFixture {
+public:
+  typedef UnlockRequest<MockImageCtx> MockUnlockRequest;
+  // Expects one "lock"/"unlock" exec on the head object map, returning r.
+  void expect_unlock(MockImageCtx &mock_image_ctx, int r) {
+    std::string oid(ObjectMap::object_map_name(mock_image_ctx.id, CEPH_NOSNAP));
+    EXPECT_CALL(get_mock_io_ctx(mock_image_ctx.md_ctx),
+                exec(oid, _, "lock", "unlock", _, _, _))
+                  .WillOnce(Return(r));
+  }
+};
+
+TEST_F(TestMockObjectMapUnlockRequest, Success) {
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+  // Case: the unlock exec returns 0 and the request completes with 0.
+  MockImageCtx mock_image_ctx(*ictx);
+
+  C_SaferCond ctx;
+  MockUnlockRequest *req = new MockUnlockRequest(mock_image_ctx, &ctx);
+
+  InSequence seq;
+  expect_unlock(mock_image_ctx, 0);
+  req->send();
+
+  ASSERT_EQ(0, ctx.wait());
+}
+
+TEST_F(TestMockObjectMapUnlockRequest, UnlockError) {
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+  // Case: the unlock exec returns -ENOENT, yet the request completes with 0.
+  MockImageCtx mock_image_ctx(*ictx);
+
+  C_SaferCond ctx;
+  MockUnlockRequest *req = new MockUnlockRequest(mock_image_ctx, &ctx);
+
+  InSequence seq;
+  expect_unlock(mock_image_ctx, -ENOENT);
+  req->send();
+
+  ASSERT_EQ(0, ctx.wait());
+}
+
+} // namespace object_map
+} // namespace librbd
diff --git a/src/test/librbd/object_map/test_mock_UpdateRequest.cc b/src/test/librbd/object_map/test_mock_UpdateRequest.cc
new file mode 100644
index 0000000..bc5cafa
--- /dev/null
+++ b/src/test/librbd/object_map/test_mock_UpdateRequest.cc
@@ -0,0 +1,199 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "test/librbd/test_mock_fixture.h"
+#include "test/librbd/test_support.h"
+#include "test/librados_test_stub/MockTestMemIoCtxImpl.h"
+#include "common/bit_vector.hpp"
+#include "librbd/ImageState.h"
+#include "librbd/internal.h"
+#include "librbd/ObjectMap.h"
+#include "librbd/object_map/UpdateRequest.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+namespace librbd {
+namespace object_map {
+
+using ::testing::_;
+using ::testing::DoDefault;
+using ::testing::Return;
+
+class TestMockObjectMapUpdateRequest : public TestMockFixture {
+public:  // helpers that register gmock expectations on the image's md_ctx
+  void expect_update(librbd::ImageCtx *ictx, uint64_t snap_id, int r) {
+    std::string oid(ObjectMap::object_map_name(ictx->id, snap_id));
+    if (snap_id == CEPH_NOSNAP) {  // lock assertion expected for HEAD only
+      EXPECT_CALL(get_mock_io_ctx(ictx->md_ctx),
+                  exec(oid, _, "lock", "assert_locked", _, _, _))
+                    .WillOnce(DoDefault());
+    }
+    // one "object_map_update" exec: returns r when r < 0, else default action
+    if (r < 0) {
+      EXPECT_CALL(get_mock_io_ctx(ictx->md_ctx),
+                  exec(oid, _, "rbd", "object_map_update", _, _, _))
+                    .WillOnce(Return(r));
+    } else {
+      EXPECT_CALL(get_mock_io_ctx(ictx->md_ctx),
+                  exec(oid, _, "rbd", "object_map_update", _, _, _))
+                    .WillOnce(DoDefault());
+    }
+  }
+  // Expects one header "set_flags" exec and zero header "assert_locked" execs.
+  void expect_invalidate(librbd::ImageCtx *ictx) {
+    EXPECT_CALL(get_mock_io_ctx(ictx->md_ctx),
+                exec(ictx->header_oid, _, "lock", "assert_locked", _, _, _))
+                  .Times(0);
+    EXPECT_CALL(get_mock_io_ctx(ictx->md_ctx),
+                exec(ictx->header_oid, _, "rbd", "set_flags", _, _, _))
+                  .WillOnce(DoDefault());
+  }
+};
+
+TEST_F(TestMockObjectMapUpdateRequest, UpdateInMemory) {
+  REQUIRE_FEATURE(RBD_FEATURE_OBJECT_MAP);
+  // Case: checks the in-memory map after an EXISTS -> NONEXISTENT update.
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+  ASSERT_EQ(0, acquire_exclusive_lock(*ictx));
+
+  ceph::BitVector<2> object_map;
+  object_map.resize(1024);
+  for (uint64_t i = 0; i < object_map.size(); ++i) {
+    object_map[i] = i % 4;  // cycle through all four 2-bit values
+  }
+  // NOTE(review): args look like (start, end, new state, current state)
+  C_SaferCond cond_ctx;
+  AsyncRequest<> *req = new UpdateRequest(
+    *ictx, &object_map, CEPH_NOSNAP, 0, object_map.size(), OBJECT_NONEXISTENT,
+    OBJECT_EXISTS, &cond_ctx);
+  {
+    RWLock::RLocker snap_locker(ictx->snap_lock);
+    RWLock::WLocker object_map_locker(ictx->object_map_lock);
+    req->send();
+  }
+  ASSERT_EQ(0, cond_ctx.wait());
+  // EXISTS and EXISTS_CLEAN slots become NONEXISTENT; the rest are unchanged
+  for (uint64_t i = 0; i < object_map.size(); ++i) {
+    if (i % 4 == OBJECT_EXISTS || i % 4 == OBJECT_EXISTS_CLEAN) {
+      ASSERT_EQ(OBJECT_NONEXISTENT, object_map[i]);
+    } else {
+      ASSERT_EQ(i % 4, object_map[i]);
+    }
+  }
+}
+
+TEST_F(TestMockObjectMapUpdateRequest, UpdateHeadOnDisk) {
+  REQUIRE_FEATURE(RBD_FEATURE_OBJECT_MAP);
+  // Case: HEAD update expects a lock assertion plus one object_map_update.
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+  ASSERT_EQ(0, acquire_exclusive_lock(*ictx));
+
+  expect_update(ictx, CEPH_NOSNAP, 0);
+
+  ceph::BitVector<2> object_map;
+  object_map.resize(1);
+
+  C_SaferCond cond_ctx;
+  AsyncRequest<> *req = new UpdateRequest(
+    *ictx, &object_map, CEPH_NOSNAP, 0, object_map.size(), OBJECT_NONEXISTENT,
+    OBJECT_EXISTS, &cond_ctx);
+  {
+    RWLock::RLocker snap_locker(ictx->snap_lock);
+    RWLock::WLocker object_map_locker(ictx->object_map_lock);
+    req->send();
+  }
+  ASSERT_EQ(0, cond_ctx.wait());
+
+  expect_unlock_exclusive_lock(*ictx);
+}
+
+TEST_F(TestMockObjectMapUpdateRequest, UpdateSnapOnDisk) {
+  REQUIRE_FEATURE(RBD_FEATURE_OBJECT_MAP);
+  // Case: with "snap1" selected, the update targets that snapshot's map.
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+  ASSERT_EQ(0, librbd::snap_create(ictx, "snap1"));
+  ASSERT_EQ(0, librbd::snap_set(ictx, "snap1"));
+
+  uint64_t snap_id = ictx->snap_id;  // id of the snapshot set just above
+  expect_update(ictx, snap_id, 0);
+
+  ceph::BitVector<2> object_map;
+  object_map.resize(1);
+
+  C_SaferCond cond_ctx;
+  AsyncRequest<> *req = new UpdateRequest(
+    *ictx, &object_map, snap_id, 0, object_map.size(), OBJECT_NONEXISTENT,
+    OBJECT_EXISTS, &cond_ctx);
+  {
+    RWLock::RLocker snap_locker(ictx->snap_lock);
+    RWLock::WLocker object_map_locker(ictx->object_map_lock);
+    req->send();
+  }
+  ASSERT_EQ(0, cond_ctx.wait());
+
+  expect_unlock_exclusive_lock(*ictx);
+}
+
+TEST_F(TestMockObjectMapUpdateRequest, UpdateOnDiskError) {
+  REQUIRE_FEATURE(RBD_FEATURE_OBJECT_MAP);
+  // Case: object_map_update fails with -EINVAL; request completes with 0.
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+  ASSERT_EQ(0, acquire_exclusive_lock(*ictx));
+
+  expect_update(ictx, CEPH_NOSNAP, -EINVAL);
+  expect_invalidate(ictx);  // the failure is expected to trigger set_flags
+
+  ceph::BitVector<2> object_map;
+  object_map.resize(1);
+
+  C_SaferCond cond_ctx;
+  AsyncRequest<> *req = new UpdateRequest(
+    *ictx, &object_map, CEPH_NOSNAP, 0, object_map.size(), OBJECT_NONEXISTENT,
+    OBJECT_EXISTS, &cond_ctx);
+  {
+    RWLock::RLocker snap_locker(ictx->snap_lock);
+    RWLock::WLocker object_map_locker(ictx->object_map_lock);
+    req->send();
+  }
+  ASSERT_EQ(0, cond_ctx.wait());
+
+  expect_unlock_exclusive_lock(*ictx);
+}
+
+TEST_F(TestMockObjectMapUpdateRequest, RebuildSnapOnDisk) {
+  REQUIRE_FEATURE(RBD_FEATURE_OBJECT_MAP);
+  // Case: update a snapshot's map while the image itself stays at HEAD.
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+  ASSERT_EQ(0, librbd::snap_create(ictx, "snap1"));
+  ASSERT_EQ(0, ictx->state->refresh_if_required());
+  ASSERT_EQ(CEPH_NOSNAP, ictx->snap_id);
+
+  uint64_t snap_id = ictx->snap_info.rbegin()->first;
+  expect_update(ictx, snap_id, 0);
+  expect_unlock_exclusive_lock(*ictx);
+
+  ceph::BitVector<2> object_map;
+  object_map.resize(1);
+  // the last state argument is an empty optional in this case
+  C_SaferCond cond_ctx;
+  AsyncRequest<> *req = new UpdateRequest(
+    *ictx, &object_map, snap_id, 0, object_map.size(), OBJECT_EXISTS_CLEAN,
+    boost::optional<uint8_t>(), &cond_ctx);
+  {
+    RWLock::RLocker snap_locker(ictx->snap_lock);
+    RWLock::WLocker object_map_locker(ictx->object_map_lock);
+    req->send();
+  }
+  ASSERT_EQ(0, cond_ctx.wait());
+
+  // do not update the in-memory map if rebuilding a snapshot
+  ASSERT_NE(OBJECT_EXISTS_CLEAN, object_map[0]);
+}
+
+} // namespace object_map
+} // namespace librbd
diff --git a/src/test/librbd/operation/test_mock_SnapshotCreateRequest.cc b/src/test/librbd/operation/test_mock_SnapshotCreateRequest.cc
new file mode 100644
index 0000000..d9c8cce
--- /dev/null
+++ b/src/test/librbd/operation/test_mock_SnapshotCreateRequest.cc
@@ -0,0 +1,270 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "test/librbd/test_mock_fixture.h"
+#include "test/librbd/test_support.h"
+#include "test/librbd/mock/MockImageCtx.h"
+#include "test/librados_test_stub/MockTestMemIoCtxImpl.h"
+#include "common/bit_vector.hpp"
+#include "librbd/internal.h"
+#include "librbd/ObjectMap.h"
+#include "librbd/operation/SnapshotCreateRequest.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+// template definitions
+#include "librbd/operation/SnapshotCreateRequest.cc"
+
+namespace librbd {
+namespace operation {
+
+using ::testing::_;
+using ::testing::DoAll;
+using ::testing::DoDefault;
+using ::testing::Return;
+using ::testing::SetArgPointee;
+using ::testing::WithArg;
+
+class TestMockOperationSnapshotCreateRequest : public TestMockFixture {
+public:  // helpers that register gmock expectations for the create request
+  typedef SnapshotCreateRequest<MockImageCtx> MockSnapshotCreateRequest;
+  // block_writes() may be called repeatedly; action is CompleteContext(0, NULL).
+  void expect_block_writes(MockImageCtx &mock_image_ctx) {
+    EXPECT_CALL(*mock_image_ctx.aio_work_queue, block_writes(_))
+                  .WillRepeatedly(CompleteContext(0, NULL));
+  }
+  // When an exclusive lock is attached, is_lock_owner() always answers true.
+  void expect_verify_lock_ownership(MockImageCtx &mock_image_ctx) {
+    if (mock_image_ctx.exclusive_lock != nullptr) {
+      EXPECT_CALL(*mock_image_ctx.exclusive_lock, is_lock_owner())
+                    .WillRepeatedly(Return(true));
+    }
+  }
+  // r < 0 (not -ESTALE): one failing call; -ESTALE: two default calls; else one.
+  void expect_allocate_snap_id(MockImageCtx &mock_image_ctx, int r) {
+    auto &expect = EXPECT_CALL(get_mock_io_ctx(mock_image_ctx.md_ctx),
+                               selfmanaged_snap_create(_));
+    if (r < 0 && r != -ESTALE) {
+      expect.WillOnce(Return(r));
+    } else {
+      expect.Times(r < 0 ? 2 : 1).WillRepeatedly(DoDefault());
+    }
+  }
+  // One selfmanaged_snap_remove(): returns r when r < 0, default otherwise.
+  void expect_release_snap_id(MockImageCtx &mock_image_ctx, int r) {
+    auto &expect = EXPECT_CALL(get_mock_io_ctx(mock_image_ctx.md_ctx),
+                               selfmanaged_snap_remove(_));
+    if (r < 0) {
+      expect.WillOnce(Return(r));
+    } else {
+      expect.WillOnce(DoDefault());
+    }
+  }
+  // -ESTALE: first header exec fails, a second one runs the default action.
+  void expect_snap_create(MockImageCtx &mock_image_ctx, int r) {
+    if (!mock_image_ctx.old_format &&
+         mock_image_ctx.exclusive_lock != nullptr) {
+      EXPECT_CALL(*mock_image_ctx.exclusive_lock, assert_header_locked(_))
+                    .Times(r == -ESTALE ? 2 : 1);
+    }
+
+    auto &expect = EXPECT_CALL(get_mock_io_ctx(mock_image_ctx.md_ctx),
+                               exec(mock_image_ctx.header_oid, _, "rbd",
+                               mock_image_ctx.old_format ? "snap_add" :
+                                                           "snapshot_add",
+                               _, _, _));
+    if (r == -ESTALE) {
+      expect.WillOnce(Return(r)).WillOnce(DoDefault());
+    } else if (r < 0) {
+      expect.WillOnce(Return(r));
+    } else {
+      expect.WillOnce(DoDefault());
+    }
+  }
+  // With an object map attached, expects one snapshot_add() completing with 0.
+  void expect_object_map_snap_create(MockImageCtx &mock_image_ctx) {
+    if (mock_image_ctx.object_map != nullptr) {
+      EXPECT_CALL(*mock_image_ctx.object_map, snapshot_add(_, _))
+                    .WillOnce(WithArg<1>(CompleteContext(
+                      0, mock_image_ctx.image_ctx->op_work_queue)));
+    }
+  }
+  // get_snap_info() yields a typed nullptr; add_snap("snap1", ...) is expected.
+  void expect_update_snap_context(MockImageCtx &mock_image_ctx) {
+    // state machine checks to ensure a refresh hasn't already added the snap
+    EXPECT_CALL(mock_image_ctx, get_snap_info(_))
+                  .WillOnce(Return(static_cast<const librbd::SnapInfo*>(nullptr)));
+    EXPECT_CALL(mock_image_ctx, add_snap("snap1", _, _, _, _, _));
+  }
+  // Exactly one unblock_writes() call is expected.
+  void expect_unblock_writes(MockImageCtx &mock_image_ctx) {
+    EXPECT_CALL(*mock_image_ctx.aio_work_queue, unblock_writes())
+                  .Times(1);
+  }
+
+};
+
+TEST_F(TestMockOperationSnapshotCreateRequest, Success) {
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+  // Case: every step succeeds, in order; the request completes with 0.
+  MockImageCtx mock_image_ctx(*ictx);
+  // mocks are attached only when the image has the matching feature enabled
+  MockExclusiveLock mock_exclusive_lock;
+  if (ictx->test_features(RBD_FEATURE_EXCLUSIVE_LOCK)) {
+    mock_image_ctx.exclusive_lock = &mock_exclusive_lock;
+  }
+
+  MockObjectMap mock_object_map;
+  if (ictx->test_features(RBD_FEATURE_OBJECT_MAP)) {
+    mock_image_ctx.object_map = &mock_object_map;
+  }
+
+  expect_verify_lock_ownership(mock_image_ctx);
+  expect_op_work_queue(mock_image_ctx);
+  // expectations registered from here on must be satisfied in this order
+  ::testing::InSequence seq;
+  expect_block_writes(mock_image_ctx);
+  expect_allocate_snap_id(mock_image_ctx, 0);
+  expect_snap_create(mock_image_ctx, 0);
+  expect_update_snap_context(mock_image_ctx);
+  expect_object_map_snap_create(mock_image_ctx);
+  expect_unblock_writes(mock_image_ctx);
+
+  C_SaferCond cond_ctx;
+  MockSnapshotCreateRequest *req = new MockSnapshotCreateRequest(
+    mock_image_ctx, &cond_ctx, "snap1");
+  {
+    RWLock::RLocker owner_locker(mock_image_ctx.owner_lock);
+    req->send();
+  }
+  ASSERT_EQ(0, cond_ctx.wait());
+}
+
+TEST_F(TestMockOperationSnapshotCreateRequest, AllocateSnapIdError) {
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+  // Case: snap id allocation fails with -EINVAL, which the request reports.
+  MockImageCtx mock_image_ctx(*ictx);
+
+  MockExclusiveLock mock_exclusive_lock;
+  if (ictx->test_features(RBD_FEATURE_EXCLUSIVE_LOCK)) {
+    mock_image_ctx.exclusive_lock = &mock_exclusive_lock;
+  }
+
+  expect_verify_lock_ownership(mock_image_ctx);
+  expect_op_work_queue(mock_image_ctx);
+  // expectations registered from here on must be satisfied in this order
+  ::testing::InSequence seq;
+  expect_block_writes(mock_image_ctx);
+  expect_allocate_snap_id(mock_image_ctx, -EINVAL);
+  expect_unblock_writes(mock_image_ctx);
+
+  C_SaferCond cond_ctx;
+  MockSnapshotCreateRequest *req = new MockSnapshotCreateRequest(
+    mock_image_ctx, &cond_ctx, "snap1");
+  {
+    RWLock::RLocker owner_locker(mock_image_ctx.owner_lock);
+    req->send();
+  }
+  ASSERT_EQ(-EINVAL, cond_ctx.wait());
+}
+
+TEST_F(TestMockOperationSnapshotCreateRequest, CreateSnapStale) {
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+  // Case: first header snap add returns -ESTALE; request still ends with 0.
+  MockImageCtx mock_image_ctx(*ictx);
+
+  MockExclusiveLock mock_exclusive_lock;
+  if (ictx->test_features(RBD_FEATURE_EXCLUSIVE_LOCK)) {
+    mock_image_ctx.exclusive_lock = &mock_exclusive_lock;
+  }
+
+  MockObjectMap mock_object_map;
+  if (ictx->test_features(RBD_FEATURE_OBJECT_MAP)) {
+    mock_image_ctx.object_map = &mock_object_map;
+  }
+
+  expect_verify_lock_ownership(mock_image_ctx);
+  expect_op_work_queue(mock_image_ctx);
+  // no InSequence here; -ESTALE makes both helpers expect two calls each
+  expect_block_writes(mock_image_ctx);
+  expect_allocate_snap_id(mock_image_ctx, -ESTALE);
+  expect_snap_create(mock_image_ctx, -ESTALE);
+  expect_update_snap_context(mock_image_ctx);
+  expect_object_map_snap_create(mock_image_ctx);
+  expect_unblock_writes(mock_image_ctx);
+
+  C_SaferCond cond_ctx;
+  MockSnapshotCreateRequest *req = new MockSnapshotCreateRequest(
+    mock_image_ctx, &cond_ctx, "snap1");
+  {
+    RWLock::RLocker owner_locker(mock_image_ctx.owner_lock);
+    req->send();
+  }
+  ASSERT_EQ(0, cond_ctx.wait());
+}
+
+TEST_F(TestMockOperationSnapshotCreateRequest, CreateSnapError) {
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+  // Case: header snap add fails with -EINVAL, which the request reports.
+  MockImageCtx mock_image_ctx(*ictx);
+
+  MockExclusiveLock mock_exclusive_lock;
+  if (ictx->test_features(RBD_FEATURE_EXCLUSIVE_LOCK)) {
+    mock_image_ctx.exclusive_lock = &mock_exclusive_lock;
+  }
+
+  expect_verify_lock_ownership(mock_image_ctx);
+  expect_op_work_queue(mock_image_ctx);
+
+  expect_block_writes(mock_image_ctx);
+  expect_allocate_snap_id(mock_image_ctx, 0);
+  expect_snap_create(mock_image_ctx, -EINVAL);
+  expect_release_snap_id(mock_image_ctx, 0);  // allocated id is released
+  expect_unblock_writes(mock_image_ctx);
+
+  C_SaferCond cond_ctx;
+  MockSnapshotCreateRequest *req = new MockSnapshotCreateRequest(
+    mock_image_ctx, &cond_ctx, "snap1");
+  {
+    RWLock::RLocker owner_locker(mock_image_ctx.owner_lock);
+    req->send();
+  }
+  ASSERT_EQ(-EINVAL, cond_ctx.wait());
+}
+
+TEST_F(TestMockOperationSnapshotCreateRequest, ReleaseSnapIdError) {
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+  // Case: snap add fails (-EINVAL) and releasing the snap id fails (-ESTALE).
+  MockImageCtx mock_image_ctx(*ictx);
+
+  MockExclusiveLock mock_exclusive_lock;
+  if (ictx->test_features(RBD_FEATURE_EXCLUSIVE_LOCK)) {
+    mock_image_ctx.exclusive_lock = &mock_exclusive_lock;
+  }
+
+  expect_verify_lock_ownership(mock_image_ctx);
+  expect_op_work_queue(mock_image_ctx);
+
+  expect_block_writes(mock_image_ctx);
+  expect_allocate_snap_id(mock_image_ctx, 0);
+  expect_snap_create(mock_image_ctx, -EINVAL);
+  expect_release_snap_id(mock_image_ctx, -ESTALE);
+  expect_unblock_writes(mock_image_ctx);
+
+  C_SaferCond cond_ctx;
+  MockSnapshotCreateRequest *req = new MockSnapshotCreateRequest(
+    mock_image_ctx, &cond_ctx, "snap1");
+  {
+    RWLock::RLocker owner_locker(mock_image_ctx.owner_lock);
+    req->send();
+  }
+  ASSERT_EQ(-EINVAL, cond_ctx.wait());  // the snap add error is what surfaces
+}
+
+} // namespace operation
+} // namespace librbd
diff --git a/src/test/librbd/operation/test_mock_SnapshotProtectRequest.cc b/src/test/librbd/operation/test_mock_SnapshotProtectRequest.cc
new file mode 100644
index 0000000..3c64d27
--- /dev/null
+++ b/src/test/librbd/operation/test_mock_SnapshotProtectRequest.cc
@@ -0,0 +1,191 @@
+// -*- mode:C; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "test/librbd/test_mock_fixture.h"
+#include "test/librbd/test_support.h"
+#include "test/librbd/mock/MockImageCtx.h"
+#include "test/librados_test_stub/MockTestMemIoCtxImpl.h"
+#include "common/bit_vector.hpp"
+#include "librbd/ImageState.h"
+#include "librbd/internal.h"
+#include "librbd/operation/SnapshotProtectRequest.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+// template definitions
+#include "librbd/operation/SnapshotProtectRequest.cc"
+
+namespace librbd {
+namespace operation {
+
+using ::testing::_;
+using ::testing::DoAll;
+using ::testing::DoDefault;
+using ::testing::Return;
+using ::testing::SetArgPointee;
+using ::testing::WithArg;
+
+class TestMockOperationSnapshotProtectRequest : public TestMockFixture {
+public:
+  typedef SnapshotProtectRequest<MockImageCtx> MockSnapshotProtectRequest;
+
+  // Expect a single get_snap_id() lookup that resolves to snap_id.
+  void expect_get_snap_id(MockImageCtx &mock_image_ctx, uint64_t snap_id) {
+    EXPECT_CALL(mock_image_ctx, get_snap_id(_)).WillOnce(Return(snap_id));
+  }
+
+  void expect_is_snap_protected(MockImageCtx &mock_image_ctx, bool is_protected,
+                                int r) {  // r < 0 makes the call fail with r
+    auto &call = EXPECT_CALL(mock_image_ctx, is_snap_protected(_, _));
+    if (r >= 0) {  // success: report is_protected via arg 1 and return 0
+      call.WillOnce(DoAll(SetArgPointee<1>(is_protected), Return(0)));
+      return;
+    }
+    call.WillOnce(Return(r));
+  }
+
+  void expect_set_protection_status(MockImageCtx &mock_image_ctx, int r) {
+    auto &call = EXPECT_CALL(get_mock_io_ctx(mock_image_ctx.md_ctx),
+                             exec(mock_image_ctx.header_oid, _, "rbd",
+                                  "set_protection_status", _, _, _));
+    if (r >= 0) {  // success: defer to the mock's default exec() action
+      call.WillOnce(DoDefault());
+      return;
+    }
+    call.WillOnce(Return(r));  // failure: the class call returns r
+  }
+};
+
+TEST_F(TestMockOperationSnapshotProtectRequest, Success) {
+  REQUIRE_FEATURE(RBD_FEATURE_LAYERING);
+
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+  ASSERT_EQ(0, librbd::snap_create(ictx, "snap1"));
+  ASSERT_EQ(0, ictx->state->refresh_if_required());
+
+  MockImageCtx mock_image_ctx(*ictx);
+
+  expect_op_work_queue(mock_image_ctx);
+  // Happy path: id lookup, "not protected" check and status update succeed.
+  ::testing::InSequence seq;
+  expect_get_snap_id(mock_image_ctx, ictx->snap_info.rbegin()->first);
+  expect_is_snap_protected(mock_image_ctx, false, 0);
+  expect_set_protection_status(mock_image_ctx, 0);
+
+  C_SaferCond cond_ctx;
+  MockSnapshotProtectRequest *req = new MockSnapshotProtectRequest(
+    mock_image_ctx, &cond_ctx, "snap1");
+  {
+    RWLock::RLocker owner_locker(mock_image_ctx.owner_lock);
+    req->send();
+  }
+  ASSERT_EQ(0, cond_ctx.wait());
+}
+
+TEST_F(TestMockOperationSnapshotProtectRequest, GetSnapIdMissing) {
+  REQUIRE_FEATURE(RBD_FEATURE_LAYERING);
+
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+  ASSERT_EQ(0, librbd::snap_create(ictx, "snap1"));
+  ASSERT_EQ(0, ictx->state->refresh_if_required());
+
+  MockImageCtx mock_image_ctx(*ictx);
+
+  expect_op_work_queue(mock_image_ctx);
+  // get_snap_id() yields CEPH_NOSNAP; the request is expected to fail -ENOENT.
+  ::testing::InSequence seq;
+  expect_get_snap_id(mock_image_ctx, CEPH_NOSNAP);
+
+  C_SaferCond cond_ctx;
+  MockSnapshotProtectRequest *req = new MockSnapshotProtectRequest(
+    mock_image_ctx, &cond_ctx, "snap1");
+  {
+    RWLock::RLocker owner_locker(mock_image_ctx.owner_lock);
+    req->send();
+  }
+  ASSERT_EQ(-ENOENT, cond_ctx.wait());
+}
+
+TEST_F(TestMockOperationSnapshotProtectRequest, IsSnapProtectedError) {
+  REQUIRE_FEATURE(RBD_FEATURE_LAYERING);
+
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+  ASSERT_EQ(0, librbd::snap_create(ictx, "snap1"));
+  ASSERT_EQ(0, ictx->state->refresh_if_required());
+
+  MockImageCtx mock_image_ctx(*ictx);
+
+  expect_op_work_queue(mock_image_ctx);
+  // is_snap_protected() fails with -EINVAL, which the request must report.
+  ::testing::InSequence seq;
+  expect_get_snap_id(mock_image_ctx, ictx->snap_info.rbegin()->first);
+  expect_is_snap_protected(mock_image_ctx, false, -EINVAL);
+
+  C_SaferCond cond_ctx;
+  MockSnapshotProtectRequest *req = new MockSnapshotProtectRequest(
+    mock_image_ctx, &cond_ctx, "snap1");
+  {
+    RWLock::RLocker owner_locker(mock_image_ctx.owner_lock);
+    req->send();
+  }
+  ASSERT_EQ(-EINVAL, cond_ctx.wait());
+}
+
+TEST_F(TestMockOperationSnapshotProtectRequest, SnapAlreadyProtected) {
+  REQUIRE_FEATURE(RBD_FEATURE_LAYERING);
+
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+  ASSERT_EQ(0, librbd::snap_create(ictx, "snap1"));
+  ASSERT_EQ(0, ictx->state->refresh_if_required());
+
+  MockImageCtx mock_image_ctx(*ictx);
+
+  expect_op_work_queue(mock_image_ctx);
+  // The snapshot is reported as already protected; -EBUSY is expected.
+  ::testing::InSequence seq;
+  expect_get_snap_id(mock_image_ctx, ictx->snap_info.rbegin()->first);
+  expect_is_snap_protected(mock_image_ctx, true, 0);
+
+  C_SaferCond cond_ctx;
+  MockSnapshotProtectRequest *req = new MockSnapshotProtectRequest(
+    mock_image_ctx, &cond_ctx, "snap1");
+  {
+    RWLock::RLocker owner_locker(mock_image_ctx.owner_lock);
+    req->send();
+  }
+  ASSERT_EQ(-EBUSY, cond_ctx.wait());
+}
+
+TEST_F(TestMockOperationSnapshotProtectRequest, SetProtectionStateError) {
+  REQUIRE_FEATURE(RBD_FEATURE_LAYERING);
+
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+  ASSERT_EQ(0, librbd::snap_create(ictx, "snap1"));
+  ASSERT_EQ(0, ictx->state->refresh_if_required());
+
+  MockImageCtx mock_image_ctx(*ictx);
+
+  expect_op_work_queue(mock_image_ctx);
+  // The set_protection_status class call fails with -EINVAL.
+  ::testing::InSequence seq;
+  expect_get_snap_id(mock_image_ctx, ictx->snap_info.rbegin()->first);
+  expect_is_snap_protected(mock_image_ctx, false, 0);
+  expect_set_protection_status(mock_image_ctx, -EINVAL);
+
+  C_SaferCond cond_ctx;
+  MockSnapshotProtectRequest *req = new MockSnapshotProtectRequest(
+    mock_image_ctx, &cond_ctx, "snap1");
+  {
+    RWLock::RLocker owner_locker(mock_image_ctx.owner_lock);
+    req->send();
+  }
+  ASSERT_EQ(-EINVAL, cond_ctx.wait());
+}
+
+} // namespace operation
+} // namespace librbd
diff --git a/src/test/librbd/operation/test_mock_SnapshotRemoveRequest.cc b/src/test/librbd/operation/test_mock_SnapshotRemoveRequest.cc
new file mode 100644
index 0000000..122f4bf
--- /dev/null
+++ b/src/test/librbd/operation/test_mock_SnapshotRemoveRequest.cc
@@ -0,0 +1,359 @@
+// -*- mode:C; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "test/librbd/test_mock_fixture.h"
+#include "test/librbd/test_support.h"
+#include "test/librbd/mock/MockImageCtx.h"
+#include "test/librados_test_stub/MockTestMemIoCtxImpl.h"
+#include "common/bit_vector.hpp"
+#include "librbd/ImageState.h"
+#include "librbd/internal.h"
+#include "librbd/operation/SnapshotRemoveRequest.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+// template definitions
+#include "librbd/operation/SnapshotRemoveRequest.cc"
+
+namespace librbd {
+namespace operation {
+
+using ::testing::_;
+using ::testing::DoAll;
+using ::testing::DoDefault;
+using ::testing::Return;
+using ::testing::SetArgPointee;
+using ::testing::WithArg;
+
+class TestMockOperationSnapshotRemoveRequest : public TestMockFixture {
+public:
+  typedef SnapshotRemoveRequest<MockImageCtx> MockSnapshotRemoveRequest;
+
+  // Creates and protects snap_name on the fixture image using a temporary
+  // image handle.  Returns 0 on success or the first negative error code.
+  int create_snapshot(const char *snap_name) {
+    librbd::ImageCtx *ictx;
+    int r = open_image(m_image_name, &ictx);
+    if (r < 0) {
+      return r;
+    }
+
+    r = librbd::snap_create(ictx, snap_name);
+    if (r >= 0) {
+      r = librbd::snap_protect(ictx, snap_name);
+    }
+
+    // close the handle on every path once it has been opened, not only
+    // when both snapshot operations succeeded
+    close_image(ictx);
+    return (r < 0 ? r : 0);
+  }
+
+  void expect_object_map_snap_remove(MockImageCtx &mock_image_ctx, int r) {
+    if (mock_image_ctx.object_map != nullptr) {  // no-op without an object map
+      EXPECT_CALL(*mock_image_ctx.object_map, snapshot_remove(_, _))
+                    .WillOnce(WithArg<1>(CompleteContext(
+                      r, mock_image_ctx.image_ctx->op_work_queue)));
+    }
+  }
+
+  void expect_get_parent_spec(MockImageCtx &mock_image_ctx, int r) {
+    auto &expect = EXPECT_CALL(mock_image_ctx, get_parent_spec(_, _));
+    if (r < 0) {
+      expect.WillOnce(Return(r));
+    } else {
+      // report the parent spec stored in the last snap_info entry
+      parent_spec &spec = mock_image_ctx.snap_info.rbegin()->second.parent.spec;
+      expect.WillOnce(DoAll(SetArgPointee<1>(spec), Return(0)));
+    }
+  }
+
+  void expect_remove_child(MockImageCtx &mock_image_ctx, int r) {
+    bool deep_flatten = mock_image_ctx.image_ctx->test_features(RBD_FEATURE_DEEP_FLATTEN);
+    auto &expect = EXPECT_CALL(get_mock_io_ctx(mock_image_ctx.md_ctx),
+                               exec(RBD_CHILDREN, _, "rbd", "remove_child",_,
+                                    _, _));
+    if (deep_flatten) {
+      expect.Times(0);  // with deep-flatten no remove_child call is expected
+    } else {
+      expect.WillOnce(Return(r));
+    }
+  }
+
+  void expect_verify_lock_ownership(MockImageCtx &mock_image_ctx) {
+    if (mock_image_ctx.old_format) {
+      return;
+    }
+
+    if (mock_image_ctx.exclusive_lock != nullptr) {
+      EXPECT_CALL(*mock_image_ctx.exclusive_lock, is_lock_owner())
+                    .WillRepeatedly(Return(false));
+    }
+  }
+
+  void expect_snap_remove(MockImageCtx &mock_image_ctx, int r) {
+    auto &expect = EXPECT_CALL(get_mock_io_ctx(mock_image_ctx.md_ctx),
+                               exec(mock_image_ctx.header_oid, _, "rbd",
+                               mock_image_ctx.old_format ? "snap_remove" :
+                                                           "snapshot_remove",
+                                _, _, _));  // method name depends on format
+    if (r < 0) {
+      expect.WillOnce(Return(r));
+    } else {
+      expect.WillOnce(DoDefault());
+    }
+  }
+
+  void expect_rm_snap(MockImageCtx &mock_image_ctx) {
+    EXPECT_CALL(mock_image_ctx, rm_snap(_, _)).Times(1);
+  }
+
+  void expect_release_snap_id(MockImageCtx &mock_image_ctx) {
+    EXPECT_CALL(get_mock_io_ctx(mock_image_ctx.md_ctx),
+                                selfmanaged_snap_remove(_))
+                                  .WillOnce(DoDefault());
+  }
+
+};
+
+TEST_F(TestMockOperationSnapshotRemoveRequest, Success) {
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+  ASSERT_EQ(0, librbd::snap_create(ictx, "snap1"));
+  ASSERT_EQ(0, ictx->state->refresh_if_required());
+
+  MockImageCtx mock_image_ctx(*ictx);
+
+  MockExclusiveLock mock_exclusive_lock;
+  if (ictx->test_features(RBD_FEATURE_EXCLUSIVE_LOCK)) {
+    mock_image_ctx.exclusive_lock = &mock_exclusive_lock;
+  }
+
+  MockObjectMap mock_object_map;
+  if (ictx->test_features(RBD_FEATURE_OBJECT_MAP)) {
+    mock_image_ctx.object_map = &mock_object_map;
+  }
+
+  expect_op_work_queue(mock_image_ctx);
+  // Happy path: every mocked removal step succeeds, in the order listed.
+  ::testing::InSequence seq;
+  uint64_t snap_id = ictx->snap_info.rbegin()->first;
+  expect_object_map_snap_remove(mock_image_ctx, 0);
+  expect_get_parent_spec(mock_image_ctx, 0);
+  expect_verify_lock_ownership(mock_image_ctx);
+  expect_snap_remove(mock_image_ctx, 0);
+  expect_rm_snap(mock_image_ctx);
+  expect_release_snap_id(mock_image_ctx);
+
+  C_SaferCond cond_ctx;
+  MockSnapshotRemoveRequest *req = new MockSnapshotRemoveRequest(
+    mock_image_ctx, &cond_ctx, "snap1", snap_id);
+  {
+    RWLock::RLocker owner_locker(mock_image_ctx.owner_lock);
+    req->send();
+  }
+  ASSERT_EQ(0, cond_ctx.wait());
+}
+
+TEST_F(TestMockOperationSnapshotRemoveRequest, FlattenedCloneRemovesChild) {
+  REQUIRE_FEATURE(RBD_FEATURE_LAYERING);
+
+  ASSERT_EQ(0, create_snapshot("snap1"));
+
+  int order = 22;
+  uint64_t features;
+  ASSERT_TRUE(::get_features(&features));
+  std::string clone_name = get_temp_image_name();
+  ASSERT_EQ(0, librbd::clone(m_ioctx, m_image_name.c_str(), "snap1", m_ioctx,
+                             clone_name.c_str(), features, &order, 0, 0));
+
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(clone_name, &ictx));
+  ASSERT_EQ(0, librbd::snap_create(ictx, "snap1"));
+
+  librbd::NoOpProgressContext prog_ctx;
+  ASSERT_EQ(0, librbd::flatten(ictx, prog_ctx));
+  ASSERT_EQ(0, ictx->state->refresh_if_required());
+
+  MockImageCtx mock_image_ctx(*ictx);
+
+  MockExclusiveLock mock_exclusive_lock;
+  if (ictx->test_features(RBD_FEATURE_EXCLUSIVE_LOCK)) {
+    mock_image_ctx.exclusive_lock = &mock_exclusive_lock;
+  }
+
+  MockObjectMap mock_object_map;
+  if (ictx->test_features(RBD_FEATURE_OBJECT_MAP)) {
+    mock_image_ctx.object_map = &mock_object_map;
+  }
+
+  expect_op_work_queue(mock_image_ctx);
+  // remove_child is mocked to return -ENOENT; the request must still succeed.
+  uint64_t snap_id = ictx->snap_info.rbegin()->first;
+  expect_object_map_snap_remove(mock_image_ctx, 0);
+  expect_get_parent_spec(mock_image_ctx, 0);
+  expect_remove_child(mock_image_ctx, -ENOENT);
+  expect_verify_lock_ownership(mock_image_ctx);
+  expect_snap_remove(mock_image_ctx, 0);
+  expect_rm_snap(mock_image_ctx);
+  expect_release_snap_id(mock_image_ctx);
+
+  C_SaferCond cond_ctx;
+  MockSnapshotRemoveRequest *req = new MockSnapshotRemoveRequest(
+    mock_image_ctx, &cond_ctx, "snap1", snap_id);
+  {
+    RWLock::RLocker owner_locker(mock_image_ctx.owner_lock);
+    req->send();
+  }
+  ASSERT_EQ(0, cond_ctx.wait());
+}
+
+TEST_F(TestMockOperationSnapshotRemoveRequest, ObjectMapSnapRemoveError) {
+  REQUIRE_FEATURE(RBD_FEATURE_OBJECT_MAP);
+
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+  ASSERT_EQ(0, librbd::snap_create(ictx, "snap1"));
+  ASSERT_EQ(0, ictx->state->refresh_if_required());
+
+  MockImageCtx mock_image_ctx(*ictx);
+
+  MockObjectMap mock_object_map;
+  if (ictx->test_features(RBD_FEATURE_OBJECT_MAP)) {
+    mock_image_ctx.object_map = &mock_object_map;
+  }
+
+  expect_op_work_queue(mock_image_ctx);
+  // The object map snapshot removal completes with -EINVAL.
+  ::testing::InSequence seq;
+  uint64_t snap_id = ictx->snap_info.rbegin()->first;
+  expect_object_map_snap_remove(mock_image_ctx, -EINVAL);
+
+  C_SaferCond cond_ctx;
+  MockSnapshotRemoveRequest *req = new MockSnapshotRemoveRequest(
+    mock_image_ctx, &cond_ctx, "snap1", snap_id);
+  {
+    RWLock::RLocker owner_locker(mock_image_ctx.owner_lock);
+    req->send();
+  }
+  ASSERT_EQ(-EINVAL, cond_ctx.wait());
+}
+
+TEST_F(TestMockOperationSnapshotRemoveRequest, RemoveChildParentError) {
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+  ASSERT_EQ(0, librbd::snap_create(ictx, "snap1"));
+  ASSERT_EQ(0, ictx->state->refresh_if_required());
+
+  MockImageCtx mock_image_ctx(*ictx);
+
+  MockObjectMap mock_object_map;
+  if (ictx->test_features(RBD_FEATURE_OBJECT_MAP)) {
+    mock_image_ctx.object_map = &mock_object_map;
+  }
+
+  expect_op_work_queue(mock_image_ctx);
+  // get_parent_spec() fails with -ENOENT, which the request must report.
+  ::testing::InSequence seq;
+  uint64_t snap_id = ictx->snap_info.rbegin()->first;
+  expect_object_map_snap_remove(mock_image_ctx, 0);
+  expect_get_parent_spec(mock_image_ctx, -ENOENT);
+
+  C_SaferCond cond_ctx;
+  MockSnapshotRemoveRequest *req = new MockSnapshotRemoveRequest(
+    mock_image_ctx, &cond_ctx, "snap1", snap_id);
+  {
+    RWLock::RLocker owner_locker(mock_image_ctx.owner_lock);
+    req->send();
+  }
+  ASSERT_EQ(-ENOENT, cond_ctx.wait());
+}
+
+TEST_F(TestMockOperationSnapshotRemoveRequest, RemoveChildError) {
+  REQUIRE_FEATURE(RBD_FEATURE_LAYERING);
+
+  ASSERT_EQ(0, create_snapshot("snap1"));
+
+  int order = 22;
+  uint64_t features;
+  ASSERT_TRUE(::get_features(&features));
+  std::string clone_name = get_temp_image_name();
+  ASSERT_EQ(0, librbd::clone(m_ioctx, m_image_name.c_str(), "snap1", m_ioctx,
+                             clone_name.c_str(), features, &order, 0, 0));
+
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(clone_name, &ictx));
+  if (ictx->test_features(RBD_FEATURE_DEEP_FLATTEN)) {
+    std::cout << "SKIPPING" << std::endl;
+    return SUCCEED();
+  }
+
+  ASSERT_EQ(0, librbd::snap_create(ictx, "snap1"));
+
+  librbd::NoOpProgressContext prog_ctx;
+  ASSERT_EQ(0, librbd::flatten(ictx, prog_ctx));
+  ASSERT_EQ(0, ictx->state->refresh_if_required());
+
+  MockImageCtx mock_image_ctx(*ictx);
+
+  MockObjectMap mock_object_map;
+  if (ictx->test_features(RBD_FEATURE_OBJECT_MAP)) {
+    mock_image_ctx.object_map = &mock_object_map;
+  }
+
+  expect_op_work_queue(mock_image_ctx);
+  // remove_child fails with -EINVAL, which the request must report.
+  uint64_t snap_id = ictx->snap_info.rbegin()->first;
+  expect_object_map_snap_remove(mock_image_ctx, 0);
+  expect_get_parent_spec(mock_image_ctx, 0);
+  expect_remove_child(mock_image_ctx, -EINVAL);
+
+  C_SaferCond cond_ctx;
+  MockSnapshotRemoveRequest *req = new MockSnapshotRemoveRequest(
+    mock_image_ctx, &cond_ctx, "snap1", snap_id);
+  {
+    RWLock::RLocker owner_locker(mock_image_ctx.owner_lock);
+    req->send();
+  }
+  ASSERT_EQ(-EINVAL, cond_ctx.wait());
+}
+
+TEST_F(TestMockOperationSnapshotRemoveRequest, RemoveSnapError) {
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+  ASSERT_EQ(0, librbd::snap_create(ictx, "snap1"));
+  ASSERT_EQ(0, ictx->state->refresh_if_required());
+
+  MockImageCtx mock_image_ctx(*ictx);
+
+  MockExclusiveLock mock_exclusive_lock;
+  if (ictx->test_features(RBD_FEATURE_EXCLUSIVE_LOCK)) {
+    mock_image_ctx.exclusive_lock = &mock_exclusive_lock;
+  }
+
+  MockObjectMap mock_object_map;
+  if (ictx->test_features(RBD_FEATURE_OBJECT_MAP)) {
+    mock_image_ctx.object_map = &mock_object_map;
+  }
+
+  expect_op_work_queue(mock_image_ctx);
+  // The snapshot removal class call fails with -ENOENT.
+  ::testing::InSequence seq;
+  uint64_t snap_id = ictx->snap_info.rbegin()->first;
+  expect_object_map_snap_remove(mock_image_ctx, 0);
+  expect_get_parent_spec(mock_image_ctx, 0);
+  expect_verify_lock_ownership(mock_image_ctx);
+  expect_snap_remove(mock_image_ctx, -ENOENT);
+
+  C_SaferCond cond_ctx;
+  MockSnapshotRemoveRequest *req = new MockSnapshotRemoveRequest(
+    mock_image_ctx, &cond_ctx, "snap1", snap_id);
+  {
+    RWLock::RLocker owner_locker(mock_image_ctx.owner_lock);
+    req->send();
+  }
+  ASSERT_EQ(-ENOENT, cond_ctx.wait());
+}
+
+} // namespace operation
+} // namespace librbd
diff --git a/src/test/librbd/operation/test_mock_SnapshotUnprotectRequest.cc b/src/test/librbd/operation/test_mock_SnapshotUnprotectRequest.cc
new file mode 100644
index 0000000..07cb296
--- /dev/null
+++ b/src/test/librbd/operation/test_mock_SnapshotUnprotectRequest.cc
@@ -0,0 +1,276 @@
+// -*- mode:C; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "test/librbd/test_mock_fixture.h"
+#include "test/librbd/test_support.h"
+#include "test/librbd/mock/MockImageCtx.h"
+#include "test/librados_test_stub/MockTestMemIoCtxImpl.h"
+#include "test/librados_test_stub/MockTestMemRadosClient.h"
+#include "include/rados/librados.hpp"
+#include "common/bit_vector.hpp"
+#include "librbd/ImageState.h"
+#include "librbd/internal.h"
+#include "librbd/operation/SnapshotUnprotectRequest.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+// template definitions
+#include "librbd/operation/SnapshotUnprotectRequest.cc"
+
+namespace librbd {
+namespace operation {
+
+using ::testing::_;
+using ::testing::DoAll;
+using ::testing::DoDefault;
+using ::testing::Return;
+using ::testing::SetArgReferee;
+using ::testing::SetArgPointee;
+using ::testing::WithArg;
+
+class TestMockOperationSnapshotUnprotectRequest : public TestMockFixture {
+public:
+  typedef SnapshotUnprotectRequest<MockImageCtx> MockSnapshotUnprotectRequest;
+
+  // Expect a single get_snap_id() lookup that resolves to snap_id.
+  void expect_get_snap_id(MockImageCtx &mock_image_ctx, uint64_t snap_id) {
+    EXPECT_CALL(mock_image_ctx, get_snap_id(_)).WillOnce(Return(snap_id));
+  }
+
+  void expect_is_snap_unprotected(MockImageCtx &mock_image_ctx,
+                                  bool is_unprotected, int r) {
+    auto &call = EXPECT_CALL(mock_image_ctx, is_snap_unprotected(_, _));
+    if (r >= 0) {  // success: report is_unprotected via arg 1 and return 0
+      call.WillOnce(DoAll(SetArgPointee<1>(is_unprotected), Return(0)));
+      return;
+    }
+    call.WillOnce(Return(r));  // failure: the lookup itself returns r
+  }
+
+  void expect_set_protection_status(MockImageCtx &mock_image_ctx,
+                                    uint64_t snap_id, uint8_t status, int r) {
+    // the exec() input must match the encoded (snap_id, status) pair
+    bufferlist expected_bl;
+    ::encode(snap_id, expected_bl);
+    ::encode(status, expected_bl);
+    auto &call = EXPECT_CALL(get_mock_io_ctx(mock_image_ctx.md_ctx),
+                             exec(mock_image_ctx.header_oid, _, "rbd",
+                                  "set_protection_status",
+                                  ContentsEqual(expected_bl), _, _));
+    if (r >= 0) {
+      call.WillOnce(DoDefault());
+      return;
+    }
+    call.WillOnce(Return(r));
+  }
+
+  size_t expect_create_pool_io_contexts(MockImageCtx &mock_image_ctx) {
+    librados::MockTestMemIoCtxImpl &mock_io_ctx =
+      get_mock_io_ctx(mock_image_ctx.md_ctx);
+    librados::MockTestMemRadosClient *mock_rados =
+      mock_io_ctx.get_mock_rados_client();
+
+    // one create_ioctx() call is expected per pool returned by pool_list()
+    std::list<std::pair<int64_t, std::string> > pool_entries;
+    if (mock_rados->pool_list(pool_entries) < 0) {
+      ADD_FAILURE() << "failed to list pools";
+      return 0;
+    }
+    const size_t pool_count = pool_entries.size();
+    EXPECT_CALL(*mock_rados, create_ioctx(_, _))
+      .Times(pool_count)
+      .WillRepeatedly(DoAll(GetReference(&mock_io_ctx), Return(&mock_io_ctx)));
+    return pool_count;
+  }
+
+  void expect_get_children(MockImageCtx &mock_image_ctx, size_t pools, int r) {
+    auto &call = EXPECT_CALL(get_mock_io_ctx(mock_image_ctx.md_ctx),
+                             exec(RBD_CHILDREN, _, "rbd", "get_children", _, _,
+                                  _));
+    if (r < 0) {  // every get_children call fails with r
+      call.WillRepeatedly(Return(r));
+      return;
+    }
+    // success: each of the `pools` calls returns an encoded, empty child set
+    std::set<std::string> no_children;
+    bufferlist no_children_bl;
+    ::encode(no_children, no_children_bl);
+    call.Times(pools).WillRepeatedly(DoAll(
+      SetArgPointee<5>(no_children_bl), Return(0)));
+  }
+};
+
+TEST_F(TestMockOperationSnapshotUnprotectRequest, Success) {
+  REQUIRE_FEATURE(RBD_FEATURE_LAYERING);
+
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+  ASSERT_EQ(0, librbd::snap_create(ictx, "snap1"));
+  ASSERT_EQ(0, ictx->state->refresh_if_required());
+
+  MockImageCtx mock_image_ctx(*ictx);
+
+  expect_op_work_queue(mock_image_ctx);
+  // Happy path: get_children returns -ENOENT; final status is UNPROTECTED.
+  ::testing::InSequence seq;
+  uint64_t snap_id = ictx->snap_info.rbegin()->first;
+  expect_get_snap_id(mock_image_ctx, snap_id);
+  expect_is_snap_unprotected(mock_image_ctx, false, 0);
+  expect_set_protection_status(mock_image_ctx, snap_id,
+                               RBD_PROTECTION_STATUS_UNPROTECTING, 0);
+  size_t pools = expect_create_pool_io_contexts(mock_image_ctx);
+  expect_get_children(mock_image_ctx, pools, -ENOENT);
+  expect_set_protection_status(mock_image_ctx, snap_id,
+                               RBD_PROTECTION_STATUS_UNPROTECTED, 0);
+
+  C_SaferCond cond_ctx;
+  MockSnapshotUnprotectRequest *req = new MockSnapshotUnprotectRequest(
+    mock_image_ctx, &cond_ctx, "snap1");
+  {
+    RWLock::RLocker owner_locker(mock_image_ctx.owner_lock);
+    req->send();
+  }
+  ASSERT_EQ(0, cond_ctx.wait());
+}
+
+TEST_F(TestMockOperationSnapshotUnprotectRequest, GetSnapIdMissing) {
+  REQUIRE_FEATURE(RBD_FEATURE_LAYERING);
+
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+  ASSERT_EQ(0, librbd::snap_create(ictx, "snap1"));
+  ASSERT_EQ(0, ictx->state->refresh_if_required());
+
+  MockImageCtx mock_image_ctx(*ictx);
+
+  expect_op_work_queue(mock_image_ctx);
+  // get_snap_id() yields CEPH_NOSNAP; the request is expected to fail -ENOENT.
+  ::testing::InSequence seq;
+  expect_get_snap_id(mock_image_ctx, CEPH_NOSNAP);
+
+  C_SaferCond cond_ctx;
+  MockSnapshotUnprotectRequest *req = new MockSnapshotUnprotectRequest(
+    mock_image_ctx, &cond_ctx, "snap1");
+  {
+    RWLock::RLocker owner_locker(mock_image_ctx.owner_lock);
+    req->send();
+  }
+  ASSERT_EQ(-ENOENT, cond_ctx.wait());
+}
+
+TEST_F(TestMockOperationSnapshotUnprotectRequest, IsSnapUnprotectedError) {
+  REQUIRE_FEATURE(RBD_FEATURE_LAYERING);
+
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+  ASSERT_EQ(0, librbd::snap_create(ictx, "snap1"));
+  ASSERT_EQ(0, ictx->state->refresh_if_required());
+
+  MockImageCtx mock_image_ctx(*ictx);
+
+  expect_op_work_queue(mock_image_ctx);
+  // is_snap_unprotected() fails with -EBADMSG, which the request must report.
+  ::testing::InSequence seq;
+  expect_get_snap_id(mock_image_ctx, ictx->snap_info.rbegin()->first);
+  expect_is_snap_unprotected(mock_image_ctx, false, -EBADMSG);
+
+  C_SaferCond cond_ctx;
+  MockSnapshotUnprotectRequest *req = new MockSnapshotUnprotectRequest(
+    mock_image_ctx, &cond_ctx, "snap1");
+  {
+    RWLock::RLocker owner_locker(mock_image_ctx.owner_lock);
+    req->send();
+  }
+  ASSERT_EQ(-EBADMSG, cond_ctx.wait());
+}
+
+TEST_F(TestMockOperationSnapshotUnprotectRequest, SnapAlreadyUnprotected) {
+  REQUIRE_FEATURE(RBD_FEATURE_LAYERING);
+
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+  ASSERT_EQ(0, librbd::snap_create(ictx, "snap1"));
+  ASSERT_EQ(0, ictx->state->refresh_if_required());
+
+  MockImageCtx mock_image_ctx(*ictx);
+
+  expect_op_work_queue(mock_image_ctx);
+  // The snapshot is reported as already unprotected; -EINVAL is expected.
+  ::testing::InSequence seq;
+  expect_get_snap_id(mock_image_ctx, ictx->snap_info.rbegin()->first);
+  expect_is_snap_unprotected(mock_image_ctx, true, 0);
+
+  C_SaferCond cond_ctx;
+  MockSnapshotUnprotectRequest *req = new MockSnapshotUnprotectRequest(
+    mock_image_ctx, &cond_ctx, "snap1");
+  {
+    RWLock::RLocker owner_locker(mock_image_ctx.owner_lock);
+    req->send();
+  }
+  ASSERT_EQ(-EINVAL, cond_ctx.wait());
+}
+
+TEST_F(TestMockOperationSnapshotUnprotectRequest, SetProtectionStatusError) {
+  REQUIRE_FEATURE(RBD_FEATURE_LAYERING);
+
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+  ASSERT_EQ(0, librbd::snap_create(ictx, "snap1"));
+  ASSERT_EQ(0, ictx->state->refresh_if_required());
+
+  MockImageCtx mock_image_ctx(*ictx);
+
+  expect_op_work_queue(mock_image_ctx);
+  // The first status update (to UNPROTECTING) fails with -EINVAL.
+  ::testing::InSequence seq;
+  uint64_t snap_id = ictx->snap_info.rbegin()->first;
+  expect_get_snap_id(mock_image_ctx, snap_id);
+  expect_is_snap_unprotected(mock_image_ctx, false, 0);
+  expect_set_protection_status(mock_image_ctx, snap_id,
+                               RBD_PROTECTION_STATUS_UNPROTECTING, -EINVAL);
+
+  C_SaferCond cond_ctx;
+  MockSnapshotUnprotectRequest *req = new MockSnapshotUnprotectRequest(
+    mock_image_ctx, &cond_ctx, "snap1");
+  {
+    RWLock::RLocker owner_locker(mock_image_ctx.owner_lock);
+    req->send();
+  }
+  ASSERT_EQ(-EINVAL, cond_ctx.wait());
+}
+
+TEST_F(TestMockOperationSnapshotUnprotectRequest, ChildrenExist) {
+  REQUIRE_FEATURE(RBD_FEATURE_LAYERING);
+
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+  ASSERT_EQ(0, librbd::snap_create(ictx, "snap1"));
+  ASSERT_EQ(0, ictx->state->refresh_if_required());
+
+  MockImageCtx mock_image_ctx(*ictx);
+
+  expect_op_work_queue(mock_image_ctx);
+  // get_children returns 0; the last status update is PROTECTED, result -EBUSY.
+  ::testing::InSequence seq;
+  uint64_t snap_id = ictx->snap_info.rbegin()->first;
+  expect_get_snap_id(mock_image_ctx, snap_id);
+  expect_is_snap_unprotected(mock_image_ctx, false, 0);
+  expect_set_protection_status(mock_image_ctx, snap_id,
+                               RBD_PROTECTION_STATUS_UNPROTECTING, 0);
+  size_t pools = expect_create_pool_io_contexts(mock_image_ctx);
+  expect_get_children(mock_image_ctx, pools, 0);
+  expect_set_protection_status(mock_image_ctx, snap_id,
+                               RBD_PROTECTION_STATUS_PROTECTED, 0);
+
+  C_SaferCond cond_ctx;
+  MockSnapshotUnprotectRequest *req = new MockSnapshotUnprotectRequest(
+    mock_image_ctx, &cond_ctx, "snap1");
+  {
+    RWLock::RLocker owner_locker(mock_image_ctx.owner_lock);
+    req->send();
+  }
+  ASSERT_EQ(-EBUSY, cond_ctx.wait());
+}
+
+} // namespace operation
+} // namespace librbd
diff --git a/src/test/librbd/test_ImageWatcher.cc b/src/test/librbd/test_ImageWatcher.cc
index 0f137fe..c816853 100644
--- a/src/test/librbd/test_ImageWatcher.cc
+++ b/src/test/librbd/test_ImageWatcher.cc
@@ -33,7 +33,7 @@
 
 using namespace ceph;
 using namespace boost::assign;
-using namespace librbd::WatchNotify;
+using namespace librbd::watch_notify;
 
 void register_test_image_watcher() {
 }
@@ -45,44 +45,6 @@ public:
   {
   }
 
-  struct LockListener : public librbd::ImageWatcher::Listener {
-    Mutex lock;
-    Cond cond;
-    size_t releasing_lock_count;
-    size_t lock_updated_count;
-    bool lock_owner;
-
-    LockListener()
-      : lock("lock"), releasing_lock_count(0), lock_updated_count(0),
-        lock_owner(false) {
-    }
-
-    virtual bool handle_requested_lock() {
-      return true;
-    }
-    virtual void handle_lock_updated(
-        librbd::ImageWatcher::LockUpdateState state) {
-      Mutex::Locker locker(lock);
-      ++lock_updated_count;
-
-      switch (state) {
-      case librbd::ImageWatcher::LOCK_UPDATE_STATE_NOT_SUPPORTED:
-      case librbd::ImageWatcher::LOCK_UPDATE_STATE_UNLOCKED:
-      case librbd::ImageWatcher::LOCK_UPDATE_STATE_NOTIFICATION:
-        lock_owner = false;
-        break;
-      case librbd::ImageWatcher::LOCK_UPDATE_STATE_RELEASING:
-        lock_owner = false;
-        ++releasing_lock_count;
-        break;
-      case librbd::ImageWatcher::LOCK_UPDATE_STATE_LOCKED:
-        lock_owner = true;
-        break;
-      }
-      cond.Signal();
-    }
-  };
-
   class WatchCtx : public librados::WatchCtx2 {
   public:
     WatchCtx(TestImageWatcher &parent) : m_parent(parent), m_handle(0) {}
@@ -165,39 +127,11 @@ public:
     return 0;
   }
 
-  void register_lock_listener(librbd::ImageCtx &ictx) {
-    ictx.image_watcher->register_listener(&m_lock_listener);
-  }
-
   int register_image_watch(librbd::ImageCtx &ictx) {
     m_watch_ctx = new WatchCtx(*this);
     return m_watch_ctx->watch(ictx);
   }
 
-  bool wait_for_releasing_lock(librbd::ImageCtx &ictx) {
-    Mutex::Locker locker(m_lock_listener.lock);
-    while (m_lock_listener.releasing_lock_count == 0) {
-      if (m_lock_listener.cond.WaitInterval(ictx.cct, m_lock_listener.lock,
-                                            utime_t(10, 0)) != 0) {
-        return false;
-      }
-    }
-    m_lock_listener.releasing_lock_count = 0;
-    return true;
-  }
-
-  bool wait_for_lock_updated(librbd::ImageCtx &ictx) {
-    Mutex::Locker locker(m_lock_listener.lock);
-    while (m_lock_listener.lock_updated_count == 0) {
-      if (m_lock_listener.cond.WaitInterval(ictx.cct, m_lock_listener.lock,
-                                            utime_t(10, 0)) != 0) {
-        return false;
-      }
-    }
-    m_lock_listener.lock_updated_count = 0;
-    return true;
-  }
-
   bool wait_for_notifies(librbd::ImageCtx &ictx) {
     Mutex::Locker l(m_callback_lock);
     while (m_notifies.size() < m_notify_acks.size()) {
@@ -271,8 +205,6 @@ public:
 
   WatchCtx *m_watch_ctx;
 
-  LockListener m_lock_listener;
-
   NotifyOps m_notifies;
   NotifyOpPayloads m_notify_payloads;
   NotifyOpPayloads m_notify_acks;
@@ -357,487 +289,58 @@ struct RebuildObjectMapTask {
   }
 };
 
-TEST_F(TestImageWatcher, IsLockSupported) {
-  REQUIRE_FEATURE(RBD_FEATURE_EXCLUSIVE_LOCK);
-
-  librbd::ImageCtx *ictx;
-  ASSERT_EQ(0, open_image(m_image_name, &ictx));
-  RWLock::WLocker l(ictx->owner_lock);
-  ASSERT_TRUE(ictx->image_watcher);
-  ASSERT_TRUE(ictx->image_watcher->is_lock_supported());
-
-  ictx->read_only = true;
-  ASSERT_FALSE(ictx->image_watcher->is_lock_supported());
-  ictx->read_only = false;
-
-  ictx->features &= ~RBD_FEATURE_EXCLUSIVE_LOCK;
-  ASSERT_FALSE(ictx->image_watcher->is_lock_supported());
-  ictx->features |= RBD_FEATURE_EXCLUSIVE_LOCK;
-
-  ictx->snap_id = 1234;
-  ASSERT_FALSE(ictx->image_watcher->is_lock_supported());
-  ictx->snap_id = CEPH_NOSNAP;
-}
-
-TEST_F(TestImageWatcher, TryLock) {
-  REQUIRE_FEATURE(RBD_FEATURE_EXCLUSIVE_LOCK);
-
-  librbd::ImageCtx *ictx;
-  ASSERT_EQ(0, open_image(m_image_name, &ictx));
-  ASSERT_TRUE(ictx->image_watcher);
-
-  {
-    RWLock::WLocker l(ictx->owner_lock);
-    ASSERT_EQ(0, ictx->image_watcher->try_lock());
-    ASSERT_TRUE(ictx->image_watcher->is_lock_owner());
-  }
-
-  std::map<rados::cls::lock::locker_id_t,
-           rados::cls::lock::locker_info_t> lockers;
-  ClsLockType lock_type;
-  ASSERT_EQ(0, rados::cls::lock::get_lock_info(&m_ioctx, ictx->header_oid,
-					       RBD_LOCK_NAME, &lockers,
-					       &lock_type, NULL));
-  ASSERT_EQ(LOCK_EXCLUSIVE, lock_type);
-  ASSERT_EQ(1U, lockers.size());
-}
-
-TEST_F(TestImageWatcher, TryLockNotifyAnnounceLocked) {
+TEST_F(TestImageWatcher, NotifyRequestLock) {
   REQUIRE_FEATURE(RBD_FEATURE_EXCLUSIVE_LOCK);
 
   librbd::ImageCtx *ictx;
   ASSERT_EQ(0, open_image(m_image_name, &ictx));
 
   ASSERT_EQ(0, register_image_watch(*ictx));
-  m_notify_acks = {{NOTIFY_OP_ACQUIRED_LOCK, {}}};
 
-  {
-    RWLock::WLocker l(ictx->owner_lock);
-    ASSERT_EQ(0, ictx->image_watcher->try_lock());
-  }
+  m_notify_acks = {{NOTIFY_OP_REQUEST_LOCK, {}}};
+  ictx->image_watcher->notify_request_lock();
 
   ASSERT_TRUE(wait_for_notifies(*ictx));
 
   NotifyOps expected_notify_ops;
-  expected_notify_ops += NOTIFY_OP_ACQUIRED_LOCK;
+  expected_notify_ops += NOTIFY_OP_REQUEST_LOCK;
   ASSERT_EQ(expected_notify_ops, m_notifies);
 }
 
-TEST_F(TestImageWatcher, TryLockWithTimedOutOwner) {
-  REQUIRE_FEATURE(RBD_FEATURE_EXCLUSIVE_LOCK);
-
-  // use new Rados connection due to blacklisting
-  librados::Rados rados;
-  ASSERT_EQ("", connect_cluster_pp(rados));
-
-  librados::IoCtx io_ctx;
-  ASSERT_EQ(0, rados.ioctx_create(_pool_name.c_str(), io_ctx));
-  librbd::ImageCtx *ictx = new librbd::ImageCtx(m_image_name.c_str(), "", NULL,
-					        io_ctx, false);
-  ASSERT_EQ(0, librbd::open_image(ictx));
-  ASSERT_EQ(0, lock_image(*ictx, LOCK_EXCLUSIVE, "auto 1234"));
-  librbd::close_image(ictx);
-  io_ctx.close();
-
-  // no watcher on the locked image means we can break the lock
-  ASSERT_EQ(0, open_image(m_image_name, &ictx));
-  RWLock::WLocker l(ictx->owner_lock);
-  ASSERT_EQ(0, ictx->image_watcher->try_lock());
-  ASSERT_TRUE(ictx->image_watcher->is_lock_owner());
-
-  rados.test_blacklist_self(false);
-}
-
-TEST_F(TestImageWatcher, TryLockWithUserExclusiveLock) {
-  REQUIRE_FEATURE(RBD_FEATURE_EXCLUSIVE_LOCK);
-
-  librbd::ImageCtx *ictx;
-  ASSERT_EQ(0, open_image(m_image_name, &ictx));
-
-  ASSERT_EQ(0, lock_image(*ictx, LOCK_EXCLUSIVE, "manually locked"));
-
-  RWLock::WLocker l(ictx->owner_lock);
-  ASSERT_EQ(-EBUSY, ictx->image_watcher->try_lock());
-  ASSERT_FALSE(ictx->image_watcher->is_lock_owner());
-
-  ASSERT_EQ(0, unlock_image());
-  ASSERT_EQ(0, ictx->image_watcher->try_lock());
-  ASSERT_TRUE(ictx->image_watcher->is_lock_owner());
-}
-
-TEST_F(TestImageWatcher, TryLockWithUserSharedLocked) {
-  REQUIRE_FEATURE(RBD_FEATURE_EXCLUSIVE_LOCK);
-
-  librbd::ImageCtx *ictx;
-  ASSERT_EQ(0, open_image(m_image_name, &ictx));
-
-  ASSERT_EQ(0, lock_image(*ictx, LOCK_SHARED, "manually locked"));
-
-  RWLock::WLocker l(ictx->owner_lock);
-  ASSERT_EQ(-EBUSY, ictx->image_watcher->try_lock());
-  ASSERT_FALSE(ictx->image_watcher->is_lock_owner());
-
-  ASSERT_EQ(0, unlock_image());
-  ASSERT_EQ(0, ictx->image_watcher->try_lock());
-  ASSERT_TRUE(ictx->image_watcher->is_lock_owner());
-}
-
-TEST_F(TestImageWatcher, ReleaseLockNotLocked) {
-  REQUIRE_FEATURE(RBD_FEATURE_EXCLUSIVE_LOCK);
-
-  librbd::ImageCtx *ictx;
-  ASSERT_EQ(0, open_image(m_image_name, &ictx));
-
-  RWLock::WLocker l(ictx->owner_lock);
-  ASSERT_EQ(0, ictx->image_watcher->release_lock());
-}
-
-TEST_F(TestImageWatcher, ReleaseLockNotifies) {
+TEST_F(TestImageWatcher, NotifyReleasedLock) {
   REQUIRE_FEATURE(RBD_FEATURE_EXCLUSIVE_LOCK);
 
   librbd::ImageCtx *ictx;
   ASSERT_EQ(0, open_image(m_image_name, &ictx));
 
   ASSERT_EQ(0, register_image_watch(*ictx));
-  m_notify_acks = {{NOTIFY_OP_ACQUIRED_LOCK, {}}};
 
-  {
-    RWLock::WLocker l(ictx->owner_lock);
-    ASSERT_EQ(0, ictx->image_watcher->try_lock());
-  }
-  ASSERT_TRUE(wait_for_notifies(*ictx));
+  m_notify_acks = {{NOTIFY_OP_RELEASED_LOCK, {}}};
+  ictx->image_watcher->notify_released_lock();
 
-  m_notify_acks += std::make_pair(NOTIFY_OP_RELEASED_LOCK, bufferlist());
-  {
-    RWLock::WLocker l(ictx->owner_lock);
-    ASSERT_EQ(0, ictx->image_watcher->release_lock());
-  }
   ASSERT_TRUE(wait_for_notifies(*ictx));
 
   NotifyOps expected_notify_ops;
-  expected_notify_ops += NOTIFY_OP_ACQUIRED_LOCK, NOTIFY_OP_RELEASED_LOCK;
+  expected_notify_ops += NOTIFY_OP_RELEASED_LOCK;
   ASSERT_EQ(expected_notify_ops, m_notifies);
 }
 
-TEST_F(TestImageWatcher, ReleaseLockBrokenLock) {
+TEST_F(TestImageWatcher, NotifyAcquiredLock) {
   REQUIRE_FEATURE(RBD_FEATURE_EXCLUSIVE_LOCK);
 
   librbd::ImageCtx *ictx;
   ASSERT_EQ(0, open_image(m_image_name, &ictx));
 
-  RWLock::WLocker l(ictx->owner_lock);
-  ASSERT_EQ(0, ictx->image_watcher->try_lock());
-
-  std::map<rados::cls::lock::locker_id_t,
-           rados::cls::lock::locker_info_t> lockers;
-  ClsLockType lock_type;
-  ASSERT_EQ(0, rados::cls::lock::get_lock_info(&m_ioctx, ictx->header_oid,
-                                               RBD_LOCK_NAME, &lockers,
-                                               &lock_type, NULL));
-  ASSERT_EQ(1U, lockers.size());
-  ASSERT_EQ(0, rados::cls::lock::break_lock(&m_ioctx, ictx->header_oid,
-					    RBD_LOCK_NAME,
-					    lockers.begin()->first.cookie,
-					    lockers.begin()->first.locker));
-
-  ASSERT_EQ(0, ictx->image_watcher->release_lock());
-}
-
-TEST_F(TestImageWatcher, RequestLock) {
-  REQUIRE_FEATURE(RBD_FEATURE_EXCLUSIVE_LOCK);
-
-  librbd::ImageCtx *ictx;
-  ASSERT_EQ(0, open_image(m_image_name, &ictx));
   ASSERT_EQ(0, register_image_watch(*ictx));
 
-  register_lock_listener(*ictx);
   m_notify_acks = {{NOTIFY_OP_ACQUIRED_LOCK, {}}};
-
-  {
-    RWLock::RLocker owner_locker(ictx->owner_lock);
-    ictx->image_watcher->request_lock();
-  }
+  ictx->image_watcher->notify_acquired_lock();
 
   ASSERT_TRUE(wait_for_notifies(*ictx));
-  NotifyOps expected_notify_ops;
-  expected_notify_ops += NOTIFY_OP_ACQUIRED_LOCK;
-  ASSERT_EQ(expected_notify_ops, m_notifies);
-
-  {
-    RWLock::RLocker owner_locker(ictx->owner_lock);
-    ASSERT_TRUE(ictx->image_watcher->is_lock_owner());
-  }
-}
-
-TEST_F(TestImageWatcher, RequestLockFromPeer) {
-  REQUIRE_FEATURE(RBD_FEATURE_EXCLUSIVE_LOCK);
-
-  librbd::ImageCtx *ictx;
-  ASSERT_EQ(0, open_image(m_image_name, &ictx));
-  ASSERT_EQ(0, register_image_watch(*ictx));
-  ASSERT_EQ(0, lock_image(*ictx, LOCK_EXCLUSIVE,
-			  "auto " + stringify(m_watch_ctx->get_handle())));
-
-  register_lock_listener(*ictx);
-  m_notify_acks = {{NOTIFY_OP_REQUEST_LOCK, create_response_message(0)}};
-
-  {
-    RWLock::RLocker owner_locker(ictx->owner_lock);
-    ictx->image_watcher->request_lock();
-  }
 
-  ASSERT_TRUE(wait_for_notifies(*ictx));
   NotifyOps expected_notify_ops;
-  expected_notify_ops += NOTIFY_OP_REQUEST_LOCK;
-  ASSERT_EQ(expected_notify_ops, m_notifies);
-
-  ASSERT_EQ(0, unlock_image());
-
-  {
-    Mutex::Locker l(m_callback_lock);
-    m_notifies.clear();
-    m_notify_acks = {{NOTIFY_OP_RELEASED_LOCK,{}}};
-  }
-
-  bufferlist bl;
-  {
-    ENCODE_START(1, 1, bl);
-    ::encode(NOTIFY_OP_RELEASED_LOCK, bl);
-    ENCODE_FINISH(bl);
-  }
-  ASSERT_EQ(0, m_ioctx.notify2(ictx->header_oid, bl, 5000, NULL));
-  ASSERT_TRUE(wait_for_lock_updated(*ictx));
-
-  {
-    Mutex::Locker l(m_callback_lock);
-    m_notifies.clear();
-    m_notify_acks = {{NOTIFY_OP_ACQUIRED_LOCK, {}}};
-  }
-
-  {
-    RWLock::RLocker owner_lock(ictx->owner_lock);
-    ictx->image_watcher->request_lock();
-  }
-
-  ASSERT_TRUE(wait_for_notifies(*ictx));
-  expected_notify_ops.clear();
   expected_notify_ops += NOTIFY_OP_ACQUIRED_LOCK;
   ASSERT_EQ(expected_notify_ops, m_notifies);
-
-  {
-    RWLock::RLocker owner_locker(ictx->owner_lock);
-    ASSERT_TRUE(ictx->image_watcher->is_lock_owner());
-  }
-}
-
-TEST_F(TestImageWatcher, RequestLockTimedOut) {
-  REQUIRE_FEATURE(RBD_FEATURE_EXCLUSIVE_LOCK);
-
-  librbd::ImageCtx *ictx;
-  ASSERT_EQ(0, open_image(m_image_name, &ictx));
-  ASSERT_EQ(0, register_image_watch(*ictx));
-  ASSERT_EQ(0, lock_image(*ictx, LOCK_EXCLUSIVE,
-			  "auto " + stringify(m_watch_ctx->get_handle())));
-
-  register_lock_listener(*ictx);
-  m_notify_acks = {{NOTIFY_OP_REQUEST_LOCK, {}}};
-
-  {
-    RWLock::RLocker owner_locker(ictx->owner_lock);
-    ictx->image_watcher->request_lock();
-  }
-
-  ASSERT_TRUE(wait_for_notifies(*ictx));
-  NotifyOps expected_notify_ops;
-  expected_notify_ops += NOTIFY_OP_REQUEST_LOCK;
-  ASSERT_EQ(expected_notify_ops, m_notifies);
-
-  // should resend when empty ack returned
-  {
-    Mutex::Locker l(m_callback_lock);
-    m_notifies.clear();
-  }
-  ASSERT_TRUE(wait_for_notifies(*ictx));
-
-  {
-    Mutex::Locker l(m_callback_lock);
-    ASSERT_EQ(0, unlock_image());
-    m_notifies.clear();
-    m_notify_acks = {{NOTIFY_OP_ACQUIRED_LOCK, {}}};
-  }
-
-  ASSERT_TRUE(wait_for_notifies(*ictx));
-  ASSERT_TRUE(wait_for_lock_updated(*ictx));
-
-  {
-    RWLock::RLocker owner_locker(ictx->owner_lock);
-    ASSERT_TRUE(ictx->image_watcher->is_lock_owner());
-  }
-}
-
-TEST_F(TestImageWatcher, RequestLockIgnored) {
-  REQUIRE_FEATURE(RBD_FEATURE_EXCLUSIVE_LOCK);
-
-  librbd::ImageCtx *ictx;
-  ASSERT_EQ(0, open_image(m_image_name, &ictx));
-  ASSERT_EQ(0, register_image_watch(*ictx));
-  ASSERT_EQ(0, lock_image(*ictx, LOCK_EXCLUSIVE,
-			  "auto " + stringify(m_watch_ctx->get_handle())));
-
-  register_lock_listener(*ictx);
-  m_notify_acks = {{NOTIFY_OP_REQUEST_LOCK, create_response_message(0)}};
-
-  int orig_notify_timeout = ictx->cct->_conf->client_notify_timeout;
-  ictx->cct->_conf->set_val("client_notify_timeout", "0");
-  BOOST_SCOPE_EXIT( (ictx)(orig_notify_timeout) ) {
-    ictx->cct->_conf->set_val("client_notify_timeout",
-                              stringify(orig_notify_timeout));
-  } BOOST_SCOPE_EXIT_END;
-
-  {
-    RWLock::RLocker owner_locker(ictx->owner_lock);
-    ictx->image_watcher->request_lock();
-  }
-
-  ASSERT_TRUE(wait_for_notifies(*ictx));
-  NotifyOps expected_notify_ops;
-  expected_notify_ops += NOTIFY_OP_REQUEST_LOCK;
-  ASSERT_EQ(expected_notify_ops, m_notifies);
-
-  // after the request times out -- it will be resent
-  {
-    Mutex::Locker l(m_callback_lock);
-    m_notifies.clear();
-  }
-  ASSERT_TRUE(wait_for_notifies(*ictx));
-  ASSERT_EQ(expected_notify_ops, m_notifies);
-
-  {
-    Mutex::Locker l(m_callback_lock);
-    ASSERT_EQ(0, unlock_image());
-    m_notifies.clear();
-    m_notify_acks = {{NOTIFY_OP_ACQUIRED_LOCK, {}}};
-  }
-
-  ASSERT_TRUE(wait_for_notifies(*ictx));
-  ASSERT_TRUE(wait_for_lock_updated(*ictx));
-
-  {
-    RWLock::RLocker owner_locker(ictx->owner_lock);
-    ASSERT_TRUE(ictx->image_watcher->is_lock_owner());
-  }
-}
-
-TEST_F(TestImageWatcher, RequestLockTryLockRace) {
-  REQUIRE_FEATURE(RBD_FEATURE_EXCLUSIVE_LOCK);
-
-  librbd::ImageCtx *ictx;
-  ASSERT_EQ(0, open_image(m_image_name, &ictx));
-  ASSERT_EQ(0, register_image_watch(*ictx));
-  ASSERT_EQ(0, lock_image(*ictx, LOCK_EXCLUSIVE,
-                          "auto " + stringify(m_watch_ctx->get_handle())));
-
-  register_lock_listener(*ictx);
-  m_notify_acks = {{NOTIFY_OP_REQUEST_LOCK, create_response_message(0)}};
-
-  {
-    RWLock::RLocker owner_locker(ictx->owner_lock);
-    ictx->image_watcher->request_lock();
-  }
-
-  ASSERT_TRUE(wait_for_notifies(*ictx));
-  NotifyOps expected_notify_ops;
-  expected_notify_ops += NOTIFY_OP_REQUEST_LOCK;
-  ASSERT_EQ(expected_notify_ops, m_notifies);
-
-  {
-    Mutex::Locker l(m_callback_lock);
-    m_notifies.clear();
-    m_notify_acks = {{NOTIFY_OP_RELEASED_LOCK, {}}};
-  }
-
-  bufferlist bl;
-  {
-    ENCODE_START(1, 1, bl);
-    ::encode(NOTIFY_OP_RELEASED_LOCK, bl);
-    ENCODE_FINISH(bl);
-  }
-  ASSERT_EQ(0, m_ioctx.notify2(ictx->header_oid, bl, 5000, NULL));
-
-  // after losing race -- it will re-request
-  ASSERT_TRUE(wait_for_notifies(*ictx));
-
-  {
-    RWLock::RLocker owner_locker(ictx->owner_lock);
-    ASSERT_FALSE(ictx->image_watcher->is_lock_owner());
-  }
-
-  {
-    Mutex::Locker l(m_callback_lock);
-    ASSERT_EQ(0, unlock_image());
-    m_notifies.clear();
-    m_notify_acks = {{NOTIFY_OP_RELEASED_LOCK, {}}};
-  }
-
-  ASSERT_EQ(0, m_ioctx.notify2(ictx->header_oid, bl, 5000, NULL));
-  ASSERT_TRUE(wait_for_lock_updated(*ictx));
-
-  {
-    Mutex::Locker l(m_callback_lock);
-    m_notifies.clear();
-    m_notify_acks = {{NOTIFY_OP_ACQUIRED_LOCK, {}}};
-  }
-
-  {
-    RWLock::RLocker owner_lock(ictx->owner_lock);
-    ictx->image_watcher->request_lock();
-  }
-
-  ASSERT_TRUE(wait_for_lock_updated(*ictx));
-  ASSERT_TRUE(wait_for_notifies(*ictx));
-
-  {
-    RWLock::RLocker owner_locker(ictx->owner_lock);
-    ASSERT_TRUE(ictx->image_watcher->is_lock_owner());
-  }
-}
-
-TEST_F(TestImageWatcher, RequestLockTryLockFailed) {
-  REQUIRE_FEATURE(RBD_FEATURE_EXCLUSIVE_LOCK);
-
-  librbd::ImageCtx *ictx;
-  ASSERT_EQ(0, open_image(m_image_name, &ictx));
-  ASSERT_EQ(0, register_image_watch(*ictx));
-  ASSERT_EQ(0, lock_image(*ictx, LOCK_SHARED, "manually 1234"));
-
-  register_lock_listener(*ictx);
-  m_notify_acks = {{NOTIFY_OP_REQUEST_LOCK, {}}};
-
-  {
-    RWLock::RLocker owner_locker(ictx->owner_lock);
-    ictx->image_watcher->request_lock();
-  }
-
-  ASSERT_TRUE(wait_for_notifies(*ictx));
-  NotifyOps expected_notify_ops;
-  expected_notify_ops += NOTIFY_OP_REQUEST_LOCK;
-  ASSERT_EQ(expected_notify_ops, m_notifies);
-
-  // should resend when error encountered
-  {
-    Mutex::Locker l(m_callback_lock);
-    m_notifies.clear();
-  }
-  ASSERT_TRUE(wait_for_notifies(*ictx));
-
-  {
-    Mutex::Locker l(m_callback_lock);
-    ASSERT_EQ(0, unlock_image());
-    m_notifies.clear();
-    m_notify_acks = {{NOTIFY_OP_ACQUIRED_LOCK, {}}};
-  }
-
-  ASSERT_TRUE(wait_for_notifies(*ictx));
 }
 
 TEST_F(TestImageWatcher, NotifyHeaderUpdate) {
@@ -1061,6 +564,66 @@ TEST_F(TestImageWatcher, NotifySnapRemove) {
   ASSERT_EQ(expected_notify_ops, m_notifies);
 }
 
+TEST_F(TestImageWatcher, NotifySnapProtect) {
+  REQUIRE_FEATURE(RBD_FEATURE_EXCLUSIVE_LOCK);
+
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+  ASSERT_EQ(0, register_image_watch(*ictx));
+  ASSERT_EQ(0, lock_image(*ictx, LOCK_EXCLUSIVE,
+        "auto " + stringify(m_watch_ctx->get_handle())));
+
+  m_notify_acks = {{NOTIFY_OP_SNAP_PROTECT, create_response_message(0)}};
+
+  RWLock::RLocker l(ictx->owner_lock);
+  ASSERT_EQ(0, ictx->image_watcher->notify_snap_protect("snap"));
+
+  NotifyOps expected_notify_ops;
+  expected_notify_ops += NOTIFY_OP_SNAP_PROTECT;
+  ASSERT_EQ(expected_notify_ops, m_notifies);
+}
+
+TEST_F(TestImageWatcher, NotifySnapUnprotect) {
+  REQUIRE_FEATURE(RBD_FEATURE_EXCLUSIVE_LOCK);
+
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+  ASSERT_EQ(0, register_image_watch(*ictx));
+  ASSERT_EQ(0, lock_image(*ictx, LOCK_EXCLUSIVE,
+        "auto " + stringify(m_watch_ctx->get_handle())));
+
+  m_notify_acks = {{NOTIFY_OP_SNAP_UNPROTECT, create_response_message(0)}};
+
+  RWLock::RLocker l(ictx->owner_lock);
+  ASSERT_EQ(0, ictx->image_watcher->notify_snap_unprotect("snap"));
+
+  NotifyOps expected_notify_ops;
+  expected_notify_ops += NOTIFY_OP_SNAP_UNPROTECT;
+  ASSERT_EQ(expected_notify_ops, m_notifies);
+}
+
+TEST_F(TestImageWatcher, NotifyRename) {
+  REQUIRE_FEATURE(RBD_FEATURE_EXCLUSIVE_LOCK);
+
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+  ASSERT_EQ(0, register_image_watch(*ictx));
+  ASSERT_EQ(0, lock_image(*ictx, LOCK_EXCLUSIVE,
+        "auto " + stringify(m_watch_ctx->get_handle())));
+
+  m_notify_acks = {{NOTIFY_OP_RENAME, create_response_message(0)}};
+
+  RWLock::RLocker l(ictx->owner_lock);
+  ASSERT_EQ(0, ictx->image_watcher->notify_rename("new_name"));
+
+  NotifyOps expected_notify_ops;
+  expected_notify_ops += NOTIFY_OP_RENAME;
+  ASSERT_EQ(expected_notify_ops, m_notifies);
+}
+
 TEST_F(TestImageWatcher, NotifyAsyncTimedOut) {
   REQUIRE_FEATURE(RBD_FEATURE_EXCLUSIVE_LOCK);
 
@@ -1156,47 +719,3 @@ TEST_F(TestImageWatcher, NotifyAsyncRequestTimedOut) {
   ASSERT_EQ(-ERESTART, flatten_task.result);
 }
 
-TEST_F(TestImageWatcher, PeerRequestsLock) {
-  REQUIRE_FEATURE(RBD_FEATURE_EXCLUSIVE_LOCK);
-
-  librbd::ImageCtx *ictx;
-  ASSERT_EQ(0, open_image(m_image_name, &ictx));
-  ASSERT_EQ(0, register_image_watch(*ictx));
-
-  register_lock_listener(*ictx);
-  m_notify_acks = {{NOTIFY_OP_ACQUIRED_LOCK, {}}};
-
-  {
-    RWLock::RLocker owner_locker(ictx->owner_lock);
-    ictx->image_watcher->request_lock();
-  }
-
-  ASSERT_TRUE(wait_for_notifies(*ictx));
-
-  {
-    RWLock::RLocker owner_locker(ictx->owner_lock);
-    ASSERT_TRUE(ictx->image_watcher->is_lock_owner());
-  }
-
-  // if journaling is enabled, ensure we wait for it to replay since
-  // it will block our peer request
-  std::string buffer(256, '1');
-  ictx->aio_work_queue->write(0, buffer.size(), buffer.c_str(), 0);
-
-  {
-    Mutex::Locker l(m_callback_lock);
-    m_notifies.clear();
-    m_notify_acks = {{NOTIFY_OP_RELEASED_LOCK, {}}};
-  }
-
-  bufferlist bl;
-  {
-    ENCODE_START(1, 1, bl);
-    ::encode(NOTIFY_OP_REQUEST_LOCK, bl);
-    ENCODE_FINISH(bl);
-  }
-  ASSERT_EQ(0, m_ioctx.notify2(ictx->header_oid, bl, 5000, NULL));
-
-  ASSERT_TRUE(wait_for_releasing_lock(*ictx));
-  ASSERT_TRUE(wait_for_notifies(*ictx));
-}
diff --git a/src/test/librbd/test_JournalEntries.cc b/src/test/librbd/test_JournalEntries.cc
index 7dea547..1b8e082 100644
--- a/src/test/librbd/test_JournalEntries.cc
+++ b/src/test/librbd/test_JournalEntries.cc
@@ -125,8 +125,7 @@ TEST_F(TestJournalEntries, AioWrite) {
 
   std::string buffer(512, '1');
   C_SaferCond cond_ctx;
-  librbd::AioCompletion *c =
-    librbd::aio_create_completion_internal(&cond_ctx, librbd::rbd_ctx_cb);
+  librbd::AioCompletion *c = librbd::AioCompletion::create(&cond_ctx);
   c->get();
   ictx->aio_work_queue->aio_write(c, 123, buffer.size(), buffer.c_str(), 0);
   ASSERT_EQ(0, c->wait_for_complete());
@@ -163,8 +162,7 @@ TEST_F(TestJournalEntries, AioDiscard) {
   ASSERT_TRUE(journaler != NULL);
 
   C_SaferCond cond_ctx;
-  librbd::AioCompletion *c =
-    librbd::aio_create_completion_internal(&cond_ctx, librbd::rbd_ctx_cb);
+  librbd::AioCompletion *c = librbd::AioCompletion::create(&cond_ctx);
   c->get();
   ictx->aio_work_queue->aio_discard(c, 123, 234);
   ASSERT_EQ(0, c->wait_for_complete());
@@ -197,8 +195,7 @@ TEST_F(TestJournalEntries, AioFlush) {
   ASSERT_TRUE(journaler != NULL);
 
   C_SaferCond cond_ctx;
-  librbd::AioCompletion *c =
-    librbd::aio_create_completion_internal(&cond_ctx, librbd::rbd_ctx_cb);
+  librbd::AioCompletion *c = librbd::AioCompletion::create(&cond_ctx);
   c->get();
   ictx->aio_work_queue->aio_flush(c);
   ASSERT_EQ(0, c->wait_for_complete());
diff --git a/src/test/librbd/test_JournalReplay.cc b/src/test/librbd/test_JournalReplay.cc
index c6d6347..c5a6ad6 100644
--- a/src/test/librbd/test_JournalReplay.cc
+++ b/src/test/librbd/test_JournalReplay.cc
@@ -6,6 +6,7 @@
 #include "librbd/AioCompletion.h"
 #include "librbd/AioImageRequest.h"
 #include "librbd/AioImageRequestWQ.h"
+#include "librbd/ExclusiveLock.h"
 #include "librbd/ImageCtx.h"
 #include "librbd/ImageWatcher.h"
 #include "librbd/Journal.h"
@@ -17,37 +18,13 @@ void register_test_journal_replay() {
 class TestJournalReplay : public TestFixture {
 public:
 
-  struct Listener : public librbd::ImageWatcher::Listener{
-    Mutex lock;
-    Cond cond;
-
-    Listener() : lock("TestJournalReplay::Listener::lock") {
-    }
-    virtual bool handle_requested_lock() {
-      return true;
-    }
-    virtual void handle_releasing_lock() {
-    }
-    virtual void handle_lock_updated(
-        librbd::ImageWatcher::LockUpdateState state) {
-      Mutex::Locker locker(lock);
-      cond.Signal();
-    }
-  };
-
-  void wait_for_lock_owner(librbd::ImageCtx *ictx) {
-    Listener listener;
-    ictx->image_watcher->register_listener(&listener);
+  int when_acquired_lock(librbd::ImageCtx *ictx) {
+    C_SaferCond lock_ctx;
     {
-      Mutex::Locker listener_locker(listener.lock);
-      RWLock::RLocker owner_locker(ictx->owner_lock);
-      while (!ictx->image_watcher->is_lock_owner()) {
-        ictx->owner_lock.put_read();
-        listener.cond.Wait(listener.lock);
-        ictx->owner_lock.get_read();
-      }
+      RWLock::WLocker owner_locker(ictx->owner_lock);
+      ictx->exclusive_lock->request_lock(&lock_ctx);
     }
-    ictx->image_watcher->unregister_listener(&listener);
+    return lock_ctx.wait();
   }
 };
 
@@ -58,7 +35,6 @@ TEST_F(TestJournalReplay, AioDiscardEvent) {
   librbd::ImageCtx *ictx;
   ASSERT_EQ(0, open_image(m_image_name, &ictx));
   ictx->features &= ~RBD_FEATURE_JOURNALING;
-  ASSERT_EQ(0, ictx->close_journal(true));
 
   std::string payload(4096, '1');
   librbd::AioCompletion *aio_comp = new librbd::AioCompletion();
@@ -83,27 +59,19 @@ TEST_F(TestJournalReplay, AioDiscardEvent) {
 
   // inject a discard operation into the journal
   ASSERT_EQ(0, open_image(m_image_name, &ictx));
-  {
-    RWLock::WLocker owner_locker(ictx->owner_lock);
-    ictx->image_watcher->request_lock();
-  }
-  wait_for_lock_owner(ictx);
-
-  ictx->journal->open();
-  ASSERT_TRUE(ictx->journal->wait_for_journal_ready());
+  ASSERT_EQ(0, when_acquired_lock(ictx));
 
   librbd::journal::EventEntry event_entry(
     librbd::journal::AioDiscardEvent(0, payload.size()));
   librbd::Journal::AioObjectRequests requests;
   {
     RWLock::RLocker owner_locker(ictx->owner_lock);
-    ictx->journal->append_event(NULL, event_entry, requests, 0, 0, true);
+    ictx->journal->append_io_event(NULL, event_entry, requests, 0, 0, true);
   }
-  ASSERT_EQ(0, ictx->journal->close());
 
   // re-open the journal so that it replays the new entry
-  ictx->journal->open();
-  ASSERT_TRUE(ictx->journal->wait_for_journal_ready());
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+  ASSERT_EQ(0, when_acquired_lock(ictx));
 
   aio_comp = new librbd::AioCompletion();
   ictx->aio_work_queue->aio_read(aio_comp, 0, read_payload.size(),
@@ -119,14 +87,7 @@ TEST_F(TestJournalReplay, AioWriteEvent) {
   // inject a write operation into the journal
   librbd::ImageCtx *ictx;
   ASSERT_EQ(0, open_image(m_image_name, &ictx));
-  {
-    RWLock::WLocker owner_locker(ictx->owner_lock);
-    ictx->image_watcher->request_lock();
-  }
-  wait_for_lock_owner(ictx);
-
-  ictx->journal->open();
-  ASSERT_TRUE(ictx->journal->wait_for_journal_ready());
+  ASSERT_EQ(0, when_acquired_lock(ictx));
 
   std::string payload(4096, '1');
   bufferlist payload_bl;
@@ -136,13 +97,12 @@ TEST_F(TestJournalReplay, AioWriteEvent) {
   librbd::Journal::AioObjectRequests requests;
   {
     RWLock::RLocker owner_locker(ictx->owner_lock);
-    ictx->journal->append_event(NULL, event_entry, requests, 0, 0, true);
+    ictx->journal->append_io_event(NULL, event_entry, requests, 0, 0, true);
   }
-  ASSERT_EQ(0, ictx->journal->close());
 
   // re-open the journal so that it replays the new entry
-  ictx->journal->open();
-  ASSERT_TRUE(ictx->journal->wait_for_journal_ready());
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+  ASSERT_EQ(0, when_acquired_lock(ictx));
 
   std::string read_payload(4096, '\0');
   librbd::AioCompletion *aio_comp = new librbd::AioCompletion();
@@ -158,24 +118,17 @@ TEST_F(TestJournalReplay, AioFlushEvent) {
 
   // inject a flush operation into the journal
   librbd::ImageCtx *ictx;
-  ASSERT_EQ(0, open_image(m_image_name, &ictx));
-  {
-    RWLock::WLocker owner_locker(ictx->owner_lock);
-    ictx->image_watcher->request_lock();
-  }
-  wait_for_lock_owner(ictx);
 
-  ictx->journal->open();
-  ASSERT_TRUE(ictx->journal->wait_for_journal_ready());
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+  ASSERT_EQ(0, when_acquired_lock(ictx));
 
   librbd::journal::AioFlushEvent aio_flush_event;
   librbd::journal::EventEntry event_entry(aio_flush_event);
   librbd::Journal::AioObjectRequests requests;
   {
     RWLock::RLocker owner_locker(ictx->owner_lock);
-    ictx->journal->append_event(NULL, event_entry, requests, 0, 0, true);
+    ictx->journal->append_io_event(NULL, event_entry, requests, 0, 0, true);
   }
-  ASSERT_EQ(0, ictx->journal->close());
 
   // start an AIO write op
   librbd::Journal *journal = ictx->journal;
@@ -191,8 +144,8 @@ TEST_F(TestJournalReplay, AioFlushEvent) {
   ictx->journal = journal;
 
   // re-open the journal so that it replays the new entry
-  ictx->journal->open();
-  ASSERT_TRUE(ictx->journal->wait_for_journal_ready());
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+  ASSERT_EQ(0, when_acquired_lock(ictx));
 
   ASSERT_TRUE(aio_comp->is_complete());
   ASSERT_EQ(0, aio_comp->wait_for_complete());
diff --git a/src/test/librbd/test_ObjectMap.cc b/src/test/librbd/test_ObjectMap.cc
index 5457b99..b3b19e4 100644
--- a/src/test/librbd/test_ObjectMap.cc
+++ b/src/test/librbd/test_ObjectMap.cc
@@ -2,6 +2,7 @@
 // vim: ts=8 sw=2 smarttab
 #include "test/librbd/test_fixture.h"
 #include "test/librbd/test_support.h"
+#include "librbd/ExclusiveLock.h"
 #include "librbd/ImageCtx.h"
 #include "librbd/ImageWatcher.h"
 #include "librbd/internal.h"
@@ -14,6 +15,13 @@ void register_test_object_map() {
 
 class TestObjectMap : public TestFixture {
 public:
+
+  int when_open_object_map(librbd::ImageCtx *ictx) {
+    C_SaferCond ctx;
+    librbd::ObjectMap object_map(*ictx, ictx->snap_id);
+    object_map.open(&ctx);
+    return ctx.wait();
+  }
 };
 
 TEST_F(TestObjectMap, RefreshInvalidatesWhenCorrupt) {
@@ -23,21 +31,19 @@ TEST_F(TestObjectMap, RefreshInvalidatesWhenCorrupt) {
   ASSERT_EQ(0, open_image(m_image_name, &ictx));
   ASSERT_FALSE(ictx->test_flags(RBD_FLAG_OBJECT_MAP_INVALID));
 
+  C_SaferCond lock_ctx;
   {
     RWLock::WLocker owner_locker(ictx->owner_lock);
-    ASSERT_EQ(0, ictx->image_watcher->try_lock());
+    ictx->exclusive_lock->try_lock(&lock_ctx);
   }
+  ASSERT_EQ(0, lock_ctx.wait());
 
   std::string oid = librbd::ObjectMap::object_map_name(ictx->id, CEPH_NOSNAP);
   bufferlist bl;
   bl.append("corrupt");
   ASSERT_EQ(0, ictx->data_ctx.write_full(oid, bl));
 
-  {
-    RWLock::RLocker owner_locker(ictx->owner_lock);
-    RWLock::WLocker snap_locker(ictx->snap_lock);
-    ictx->object_map.refresh(CEPH_NOSNAP);
-  }
+  ASSERT_EQ(0, when_open_object_map(ictx));
   ASSERT_TRUE(ictx->test_flags(RBD_FLAG_OBJECT_MAP_INVALID));
 }
 
@@ -48,10 +54,12 @@ TEST_F(TestObjectMap, RefreshInvalidatesWhenTooSmall) {
   ASSERT_EQ(0, open_image(m_image_name, &ictx));
   ASSERT_FALSE(ictx->test_flags(RBD_FLAG_OBJECT_MAP_INVALID));
 
+  C_SaferCond lock_ctx;
   {
     RWLock::WLocker owner_locker(ictx->owner_lock);
-    ASSERT_EQ(0, ictx->image_watcher->try_lock());
+    ictx->exclusive_lock->try_lock(&lock_ctx);
   }
+  ASSERT_EQ(0, lock_ctx.wait());
 
   librados::ObjectWriteOperation op;
   librbd::cls_client::object_map_resize(&op, 0, OBJECT_NONEXISTENT);
@@ -59,11 +67,7 @@ TEST_F(TestObjectMap, RefreshInvalidatesWhenTooSmall) {
   std::string oid = librbd::ObjectMap::object_map_name(ictx->id, CEPH_NOSNAP);
   ASSERT_EQ(0, ictx->data_ctx.operate(oid, &op));
 
-  {
-    RWLock::RLocker owner_locker(ictx->owner_lock);
-    RWLock::WLocker snap_locker(ictx->snap_lock);
-    ictx->object_map.refresh(CEPH_NOSNAP);
-  }
+  ASSERT_EQ(0, when_open_object_map(ictx));
   ASSERT_TRUE(ictx->test_flags(RBD_FLAG_OBJECT_MAP_INVALID));
 }
 
@@ -74,21 +78,19 @@ TEST_F(TestObjectMap, InvalidateFlagOnDisk) {
   ASSERT_EQ(0, open_image(m_image_name, &ictx));
   ASSERT_FALSE(ictx->test_flags(RBD_FLAG_OBJECT_MAP_INVALID));
 
+  C_SaferCond lock_ctx;
   {
     RWLock::WLocker owner_locker(ictx->owner_lock);
-    ASSERT_EQ(0, ictx->image_watcher->try_lock());
+    ictx->exclusive_lock->try_lock(&lock_ctx);
   }
+  ASSERT_EQ(0, lock_ctx.wait());
 
   std::string oid = librbd::ObjectMap::object_map_name(ictx->id, CEPH_NOSNAP);
   bufferlist bl;
   bl.append("corrupt");
   ASSERT_EQ(0, ictx->data_ctx.write_full(oid, bl));
 
-  {
-    RWLock::RLocker owner_locker(ictx->owner_lock);
-    RWLock::WLocker snap_locker(ictx->snap_lock);
-    ictx->object_map.refresh(CEPH_NOSNAP);
-  }
+  ASSERT_EQ(0, when_open_object_map(ictx));
   ASSERT_TRUE(ictx->test_flags(RBD_FLAG_OBJECT_MAP_INVALID));
 
   ASSERT_EQ(0, open_image(m_image_name, &ictx));
@@ -110,11 +112,7 @@ TEST_F(TestObjectMap, InvalidateFlagInMemoryOnly) {
   corrupt_bl.append("corrupt");
   ASSERT_EQ(0, ictx->data_ctx.write_full(oid, corrupt_bl));
 
-  {
-    RWLock::RLocker owner_locker(ictx->owner_lock);
-    RWLock::WLocker snap_locker(ictx->snap_lock);
-    ictx->object_map.refresh(CEPH_NOSNAP);
-  }
+  ASSERT_EQ(0, when_open_object_map(ictx));
   ASSERT_TRUE(ictx->test_flags(RBD_FLAG_OBJECT_MAP_INVALID));
 
   ASSERT_EQ(0, ictx->data_ctx.write_full(oid, valid_bl));
diff --git a/src/test/librbd/test_fixture.cc b/src/test/librbd/test_fixture.cc
index e3db5c9..8c6a4b9 100644
--- a/src/test/librbd/test_fixture.cc
+++ b/src/test/librbd/test_fixture.cc
@@ -3,6 +3,10 @@
 #include "test/librbd/test_fixture.h"
 #include "test/librbd/test_support.h"
 #include "include/stringify.h"
+#include "librbd/AioImageRequestWQ.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ImageState.h"
+#include "librbd/ImageWatcher.h"
 #include "cls/lock/cls_lock_client.h"
 #include "cls/lock/cls_lock_types.h"
 #include "librbd/internal.h"
@@ -44,7 +48,7 @@ void TestFixture::TearDown() {
   unlock_image();
   for (std::set<librbd::ImageCtx *>::iterator iter = m_ictxs.begin();
        iter != m_ictxs.end(); ++iter) {
-    librbd::close_image(*iter);
+    (*iter)->state->close();
   }
 
   m_ioctx.close();
@@ -54,12 +58,14 @@ int TestFixture::open_image(const std::string &image_name,
 			    librbd::ImageCtx **ictx) {
   *ictx = new librbd::ImageCtx(image_name.c_str(), "", NULL, m_ioctx, false);
   m_ictxs.insert(*ictx);
-  return librbd::open_image(*ictx);
+
+  return (*ictx)->state->open();
 }
 
 void TestFixture::close_image(librbd::ImageCtx *ictx) {
   m_ictxs.erase(ictx);
-  librbd::close_image(ictx);
+
+  ictx->state->close();
 }
 
 int TestFixture::lock_image(librbd::ImageCtx &ictx, ClsLockType lock_type,
@@ -83,3 +89,14 @@ int TestFixture::unlock_image() {
   }
   return r;
 }
+
+int TestFixture::acquire_exclusive_lock(librbd::ImageCtx &ictx) {
+  int r = ictx.aio_work_queue->write(0, 0, "", 0);
+  if (r != 0) {
+    return r;
+  }
+
+  RWLock::RLocker owner_locker(ictx.owner_lock);
+  assert(ictx.exclusive_lock != nullptr);
+  return ictx.exclusive_lock->is_lock_owner() ? 0 : -EINVAL;
+}
diff --git a/src/test/librbd/test_fixture.h b/src/test/librbd/test_fixture.h
index 7e3fff8..5d6b86e 100644
--- a/src/test/librbd/test_fixture.h
+++ b/src/test/librbd/test_fixture.h
@@ -30,6 +30,8 @@ public:
                  const std::string &cookie);
   int unlock_image();
 
+  int acquire_exclusive_lock(librbd::ImageCtx &ictx);
+
   static std::string _pool_name;
   static librados::Rados _rados;
   static uint64_t _image_number;
diff --git a/src/test/librbd/test_internal.cc b/src/test/librbd/test_internal.cc
index e35e90f..61d58be 100644
--- a/src/test/librbd/test_internal.cc
+++ b/src/test/librbd/test_internal.cc
@@ -5,6 +5,7 @@
 #include "librbd/AioCompletion.h"
 #include "librbd/AioImageRequest.h"
 #include "librbd/AioImageRequestWQ.h"
+#include "librbd/ExclusiveLock.h"
 #include "librbd/ImageWatcher.h"
 #include "librbd/internal.h"
 #include "librbd/ObjectMap.h"
@@ -79,11 +80,12 @@ TEST_F(TestInternal, IsExclusiveLockOwner) {
   ASSERT_EQ(0, librbd::is_exclusive_lock_owner(ictx, &is_owner));
   ASSERT_FALSE(is_owner);
 
+  C_SaferCond ctx;
   {
     RWLock::WLocker l(ictx->owner_lock);
-    ASSERT_EQ(0, ictx->image_watcher->try_lock());
+    ictx->exclusive_lock->try_lock(&ctx);
   }
-
+  ASSERT_EQ(0, ctx.wait());
   ASSERT_EQ(0, librbd::is_exclusive_lock_owner(ictx, &is_owner));
   ASSERT_TRUE(is_owner);
 }
@@ -252,9 +254,8 @@ TEST_F(TestInternal, AioWriteRequestsLock) {
   ASSERT_EQ(0, lock_image(*ictx, LOCK_EXCLUSIVE, "manually locked"));
 
   std::string buffer(256, '1');
-  DummyContext *ctx = new DummyContext();
-  librbd::AioCompletion *c =
-    librbd::aio_create_completion_internal(ctx, librbd::rbd_ctx_cb);
+  Context *ctx = new DummyContext();
+  librbd::AioCompletion *c = librbd::AioCompletion::create(ctx);
   c->get();
   ictx->aio_work_queue->aio_write(c, 0, buffer.size(), buffer.c_str(), 0);
 
@@ -275,9 +276,8 @@ TEST_F(TestInternal, AioDiscardRequestsLock) {
   ASSERT_EQ(0, open_image(m_image_name, &ictx));
   ASSERT_EQ(0, lock_image(*ictx, LOCK_EXCLUSIVE, "manually locked"));
 
-  DummyContext *ctx = new DummyContext();
-  librbd::AioCompletion *c =
-    librbd::aio_create_completion_internal(ctx, librbd::rbd_ctx_cb);
+  Context *ctx = new DummyContext();
+  librbd::AioCompletion *c = librbd::AioCompletion::create(ctx);
   c->get();
   ictx->aio_work_queue->aio_discard(c, 0, 256);
 
@@ -297,10 +297,16 @@ TEST_F(TestInternal, CancelAsyncResize) {
   librbd::ImageCtx *ictx;
   ASSERT_EQ(0, open_image(m_image_name, &ictx));
 
+  C_SaferCond ctx;
   {
     RWLock::WLocker l(ictx->owner_lock);
-    ASSERT_EQ(0, ictx->image_watcher->try_lock());
-    ASSERT_TRUE(ictx->image_watcher->is_lock_owner());
+    ictx->exclusive_lock->try_lock(&ctx);
+  }
+
+  ASSERT_EQ(0, ctx.wait());
+  {
+    RWLock::RLocker owner_locker(ictx->owner_lock);
+    ASSERT_TRUE(ictx->exclusive_lock->is_lock_owner());
   }
 
   uint64_t size;
@@ -314,7 +320,7 @@ TEST_F(TestInternal, CancelAsyncResize) {
     size -= MIN(size, 1<<18);
     {
       RWLock::RLocker l(ictx->owner_lock);
-      ASSERT_EQ(0, librbd::async_resize(ictx, &ctx, size, prog_ctx));
+      librbd::async_resize(ictx, &ctx, size, prog_ctx);
     }
 
     // try to interrupt the in-progress resize
@@ -333,12 +339,16 @@ TEST_F(TestInternal, MultipleResize) {
   librbd::ImageCtx *ictx;
   ASSERT_EQ(0, open_image(m_image_name, &ictx));
 
-  {
-    RWLock::WLocker l(ictx->owner_lock);
-    if (ictx->image_watcher->is_lock_supported()) {
-      ASSERT_EQ(0, ictx->image_watcher->try_lock());
-      ASSERT_TRUE(ictx->image_watcher->is_lock_owner());
+  if (ictx->exclusive_lock != nullptr) {
+    C_SaferCond ctx;
+    {
+      RWLock::WLocker l(ictx->owner_lock);
+      ictx->exclusive_lock->try_lock(&ctx);
     }
+
+    RWLock::RLocker owner_locker(ictx->owner_lock);
+    ASSERT_EQ(0, ctx.wait());
+    ASSERT_TRUE(ictx->exclusive_lock->is_lock_owner());
   }
 
   uint64_t size;
@@ -358,8 +368,7 @@ TEST_F(TestInternal, MultipleResize) {
 
     RWLock::RLocker l(ictx->owner_lock);
     contexts.push_back(new C_SaferCond());
-    ASSERT_EQ(0, librbd::async_resize(ictx, contexts.back(), new_size,
-                                      prog_ctx));
+    librbd::async_resize(ictx, contexts.back(), new_size, prog_ctx);
   }
 
   for (uint32_t i = 0; i < contexts.size(); ++i) {
@@ -509,8 +518,7 @@ TEST_F(TestInternal, SnapshotCopyup)
   bufferlist read_bl;
   read_bl.push_back(read_ptr);
 
-  std::list<std::string> snaps = boost::assign::list_of(
-    "snap1")("snap2")("");
+  std::list<std::string> snaps = {"snap1", "snap2", ""};
   for (std::list<std::string>::iterator it = snaps.begin();
        it != snaps.end(); ++it) {
     const char *snap_name = it->empty() ? NULL : it->c_str();
@@ -533,8 +541,14 @@ TEST_F(TestInternal, SnapshotCopyup)
           it != snaps.begin() && snap_name != NULL) {
         state = OBJECT_EXISTS_CLEAN;
       }
+
+      librbd::ObjectMap object_map(*ictx2, ictx2->snap_id);
+      C_SaferCond ctx;
+      object_map.open(&ctx);
+      ASSERT_EQ(0, ctx.wait());
+
       RWLock::WLocker object_map_locker(ictx2->object_map_lock);
-      ASSERT_EQ(state, ictx2->object_map[0]);
+      ASSERT_EQ(state, object_map[0]);
     }
   }
 }
@@ -662,8 +676,7 @@ TEST_F(TestInternal, ShrinkFlushesCache) {
   ictx->aio_work_queue->write(0, buffer.size(), buffer.c_str(), 0);
 
   C_SaferCond cond_ctx;
-  librbd::AioCompletion *c =
-    librbd::aio_create_completion_internal(&cond_ctx, librbd::rbd_ctx_cb);
+  librbd::AioCompletion *c = librbd::AioCompletion::create(&cond_ctx);
   c->get();
   ictx->aio_work_queue->aio_write(c, 0, buffer.size(), buffer.c_str(), 0);
 
@@ -742,3 +755,62 @@ TEST_F(TestInternal, ImageOptions) {
 
   librbd::image_options_destroy(opts2);
 }
+
+TEST_F(TestInternal, WriteFullCopyup) {
+  REQUIRE_FEATURE(RBD_FEATURE_LAYERING);
+
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+  librbd::NoOpProgressContext no_op;
+  ASSERT_EQ(0, librbd::resize(ictx, 1 << ictx->order, no_op));
+  // fill the whole (resized) parent image with '1' bytes
+  bufferlist bl;
+  bl.append(std::string(1 << ictx->order, '1'));
+  ASSERT_EQ(bl.length(),
+            ictx->aio_work_queue->write(0, bl.length(), bl.c_str(), 0));
+  ASSERT_EQ(0, librbd::flush(ictx));
+
+  ASSERT_EQ(0, create_snapshot("snap1", true));
+  // clone the image from its "snap1" snapshot
+  std::string clone_name = get_temp_image_name();
+  int order = ictx->order;
+  ASSERT_EQ(0, librbd::clone(m_ioctx, m_image_name.c_str(), "snap1", m_ioctx,
+                             clone_name.c_str(), ictx->features, &order, 0, 0));
+  // cleanup on scope exit: drop the clone's snapshot, close it, remove the clone
+  TestInternal *parent = this;
+  librbd::ImageCtx *ictx2 = NULL;
+  BOOST_SCOPE_EXIT( (&m_ioctx) (clone_name) (parent) (&ictx2) ) {
+    if (ictx2 != NULL) {
+      librbd::snap_remove(ictx2, "snap1");
+      parent->close_image(ictx2);
+    }
+
+    librbd::NoOpProgressContext remove_no_op;
+    ASSERT_EQ(0, librbd::remove(m_ioctx, clone_name.c_str(), remove_no_op));
+  } BOOST_SCOPE_EXIT_END;
+  // snapshot the clone before it is overwritten
+  ASSERT_EQ(0, open_image(clone_name, &ictx2));
+  ASSERT_EQ(0, librbd::snap_create(ictx2, "snap1"));
+  // overwrite the clone's full extent with '2' bytes
+  bufferlist write_full_bl;
+  write_full_bl.append(std::string(1 << ictx2->order, '2'));
+  ASSERT_EQ(write_full_bl.length(),
+            ictx2->aio_work_queue->write(0, write_full_bl.length(),
+            write_full_bl.c_str(), 0));
+
+  ASSERT_EQ(0, librbd::flatten(ictx2, no_op));
+
+  bufferptr read_ptr(bl.length());
+  bufferlist read_bl;
+  read_bl.push_back(read_ptr);
+  // the clone's head must return the newly written data ...
+  ASSERT_EQ(read_bl.length(), ictx2->aio_work_queue->read(0, read_bl.length(),
+                                                          read_bl.c_str(), 0));
+  ASSERT_TRUE(write_full_bl.contents_equal(read_bl));
+  // ... while the snapshot taken before the write must still read the parent's data
+  ASSERT_EQ(0, librbd::snap_set(ictx2, "snap1"));
+  ASSERT_EQ(read_bl.length(), ictx2->aio_work_queue->read(0, read_bl.length(),
+                                                          read_bl.c_str(), 0));
+  ASSERT_TRUE(bl.contents_equal(read_bl));
+}
diff --git a/src/test/librbd/test_librbd.cc b/src/test/librbd/test_librbd.cc
index 70d8391..7b84d35 100644
--- a/src/test/librbd/test_librbd.cc
+++ b/src/test/librbd/test_librbd.cc
@@ -17,6 +17,7 @@
 #include "include/rbd_types.h"
 #include "include/rbd/librbd.h"
 #include "include/rbd/librbd.hpp"
+#include "include/event_type.h"
 
 #include "common/Thread.h"
 
@@ -27,6 +28,7 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <sys/types.h>
+#include <poll.h>
 #include <time.h>
 #include <unistd.h>
 #include <iostream>
@@ -44,6 +46,10 @@
 #include <boost/assign/list_of.hpp>
 #include <boost/scope_exit.hpp>
 
+#ifdef HAVE_EVENTFD
+#include <sys/eventfd.h>
+#endif
+
 using namespace std;
 
 #define ASSERT_PASSED(x, args...) \
@@ -75,6 +81,22 @@ static int get_features(bool *old_format, uint64_t *features)
   return 0;
 }
 
+static int get_image_id(librbd::Image &image, std::string *image_id)
+{
+  librbd::image_info_t info;
+  int r = image.stat(info, sizeof(info));
+  if (r < 0) {
+    return r;  // propagate the stat() error unchanged
+  }
+  // copy the block name prefix into a buffer that is always NUL-terminated
+  char prefix[RBD_MAX_BLOCK_NAME_SIZE + 1];
+  strncpy(prefix, info.block_name_prefix, RBD_MAX_BLOCK_NAME_SIZE);
+  prefix[RBD_MAX_BLOCK_NAME_SIZE] = '\0';
+  // NOTE(review): assumes the prefix starts with RBD_DATA_PREFIX; the id is whatever follows it -- confirm
+  *image_id = std::string(prefix + strlen(RBD_DATA_PREFIX));
+  return 0;
+}
+
 static int create_image_full(rados_ioctx_t ioctx, const char *name,
 			      uint64_t size, int *order, int old_format,
 			      uint64_t features)
@@ -723,6 +745,41 @@ void simple_read_cb(rbd_completion_t cb, void *arg)
   printf("read completion cb called!\n");
 }
 
+void aio_write_test_data_and_poll(rbd_image_t image, int fd, const char *test_data,
+                                  uint64_t off, size_t len, uint32_t iohint, bool *passed)
+{
+  rbd_completion_t comp;
+  uint64_t data = 0x123;  // marker passed as the completion arg and checked after reaping
+  rbd_aio_create_completion((void*)&data, (rbd_callback_t) simple_write_cb, &comp);
+  printf("created completion\n");
+  printf("started write\n");
+  if (iohint)
+    rbd_aio_write2(image, off, len, test_data, comp, iohint);
+  else
+    rbd_aio_write(image, off, len, test_data, comp);
+  // wait (no timeout) for the caller-supplied fd to become readable
+  struct pollfd pfd;
+  pfd.fd = fd;
+  pfd.events = POLLIN;
+
+  ASSERT_EQ(1, poll(&pfd, 1, -1));
+  ASSERT_TRUE(pfd.revents & POLLIN);
+  // reap exactly one completion, then consume the 8-byte counter from fd
+  rbd_completion_t comps[1];
+  ASSERT_EQ(1, rbd_poll_io_events(image, comps, 1));
+  uint64_t count;
+  ASSERT_EQ(static_cast<ssize_t>(sizeof(count)),
+            read(fd, &count, sizeof(count)));
+  int r = rbd_aio_get_return_value(comps[0]);
+  ASSERT_TRUE(rbd_aio_is_complete(comps[0]));
+  ASSERT_TRUE(*(uint64_t*)rbd_aio_get_arg(comps[0]) == data);
+  printf("return value is: %d\n", r);
+  ASSERT_EQ(0, r);
+  printf("finished write\n");
+  rbd_aio_release(comps[0]);
+  *passed = true;  // only reached when every assertion above held
+}
+
 void aio_write_test_data(rbd_image_t image, const char *test_data, uint64_t off, size_t len, uint32_t iohint, bool *passed)
 {
   rbd_completion_t comp;
@@ -776,6 +833,47 @@ void discard_test_data(rbd_image_t image, uint64_t off, size_t len, bool *passed
   *passed = true;
 }
 
+void aio_read_test_data_and_poll(rbd_image_t image, int fd, const char *expected,
+                                 uint64_t off, size_t len, uint32_t iohint, bool *passed)
+{
+  rbd_completion_t comp;
+  char *result = (char *)calloc(len + 1, 1);
+  // zero-filled so result[len] is always a NUL terminator for the "%s" dump below
+  ASSERT_NE(static_cast<char *>(NULL), result);
+  rbd_aio_create_completion(NULL, (rbd_callback_t) simple_read_cb, &comp);
+  printf("created completion\n");
+  printf("started read\n");
+  if (iohint)
+    rbd_aio_read2(image, off, len, result, comp, iohint);
+  else
+    rbd_aio_read(image, off, len, result, comp);
+  // wait (no timeout) for the caller-supplied fd to become readable
+  struct pollfd pfd;
+  pfd.fd = fd;
+  pfd.events = POLLIN;
+
+  ASSERT_EQ(1, poll(&pfd, 1, -1));
+  ASSERT_TRUE(pfd.revents & POLLIN);
+  // reap exactly one completion, then consume the 8-byte counter from fd
+  rbd_completion_t comps[1];
+  ASSERT_EQ(1, rbd_poll_io_events(image, comps, 1));
+  uint64_t count;
+  ASSERT_EQ(static_cast<ssize_t>(sizeof(count)),
+            read(fd, &count, sizeof(count)));
+
+  int r = rbd_aio_get_return_value(comps[0]);
+  ASSERT_TRUE(rbd_aio_is_complete(comps[0]));
+  printf("return value is: %d\n", r);
+  ASSERT_EQ(len, static_cast<size_t>(r));
+  rbd_aio_release(comps[0]);
+  if (memcmp(result, expected, len)) {
+    printf("read: %s\nexpected: %s\n", result, expected);
+    ASSERT_EQ(0, memcmp(result, expected, len));
+  }
+  free(result);
+  *passed = true;  // only reached when every assertion above held
+}
+
 void aio_read_test_data(rbd_image_t image, const char *expected, uint64_t off, size_t len, uint32_t iohint, bool *passed)
 {
   rbd_completion_t comp;
@@ -2697,6 +2795,90 @@ TEST_F(TestLibRBD, TestPendingAio)
   rados_ioctx_destroy(ioctx);
 }
 
+TEST_F(TestLibRBD, RebuildObjectMapViaLockOwner)
+{
+  REQUIRE_FEATURE(RBD_FEATURE_EXCLUSIVE_LOCK | RBD_FEATURE_OBJECT_MAP);
+  // Rebuild is requested through image2 while image1 holds the exclusive lock.
+  librados::IoCtx ioctx;
+  ASSERT_EQ(0, _rados.ioctx_create(m_pool_name.c_str(), ioctx));
+
+  librbd::RBD rbd;
+  std::string name = get_temp_image_name();
+  uint64_t size = 2 << 20;
+  int order = 0;
+  ASSERT_EQ(0, create_image_pp(rbd, ioctx, name.c_str(), size, &order));
+  // derive the object map's object name from the image id
+  std::string object_map_oid;
+  {
+    librbd::Image image;
+    ASSERT_EQ(0, rbd.open(ioctx, image, name.c_str(), NULL));
+
+    std::string image_id;
+    ASSERT_EQ(0, get_image_id(image, &image_id));
+    object_map_oid = RBD_OBJECT_MAP_PREFIX + image_id;
+  }
+
+  // corrupt the object map
+  bufferlist bl;
+  bl.append("foo");
+  ASSERT_EQ(0, ioctx.write(object_map_oid, bl, bl.length(), 0));
+
+  librbd::Image image1;
+  ASSERT_EQ(0, rbd.open(ioctx, image1, name.c_str(), NULL));
+  // after a zero-length write image1 is expected to own the exclusive lock
+  bool lock_owner;
+  bl.clear();
+  ASSERT_EQ(0, image1.write(0, 0, bl));
+  ASSERT_EQ(0, image1.is_exclusive_lock_owner(&lock_owner));
+  ASSERT_TRUE(lock_owner);
+  // the corrupted map must be flagged as invalid
+  uint64_t flags;
+  ASSERT_EQ(0, image1.get_flags(&flags));
+  ASSERT_TRUE((flags & RBD_FLAG_OBJECT_MAP_INVALID) != 0);
+
+  librbd::Image image2;
+  ASSERT_EQ(0, rbd.open(ioctx, image2, name.c_str(), NULL));
+  ASSERT_EQ(0, image2.is_exclusive_lock_owner(&lock_owner));
+  ASSERT_FALSE(lock_owner);
+  // rebuild via the non-owner handle; both handles must validate afterwards
+  PrintProgress prog_ctx;
+  ASSERT_EQ(0, image2.rebuild_object_map(prog_ctx));
+  ASSERT_PASSED(validate_object_map, image1);
+  ASSERT_PASSED(validate_object_map, image2);
+}
+
+TEST_F(TestLibRBD, RenameViaLockOwner)
+{
+  REQUIRE_FEATURE(RBD_FEATURE_LAYERING | RBD_FEATURE_EXCLUSIVE_LOCK);
+  // Renames an image while an open handle (image1) holds its exclusive lock.
+  librados::IoCtx ioctx;
+  ASSERT_EQ(0, _rados.ioctx_create(m_pool_name.c_str(), ioctx));
+
+  librbd::RBD rbd;
+  std::string name = get_temp_image_name();
+  uint64_t size = 2 << 20;
+  int order = 0;
+  ASSERT_EQ(0, create_image_pp(rbd, ioctx, name.c_str(), size, &order));
+
+  librbd::Image image1;
+  ASSERT_EQ(0, rbd.open(ioctx, image1, name.c_str(), NULL));
+  // after a zero-length write image1 is expected to own the exclusive lock
+  bufferlist bl;
+  ASSERT_EQ(0, image1.write(0, 0, bl));
+
+  bool lock_owner;
+  ASSERT_EQ(0, image1.is_exclusive_lock_owner(&lock_owner));
+  ASSERT_TRUE(lock_owner);
+  // the rename must succeed and must not take the lock away from image1
+  std::string new_name = get_temp_image_name();
+  ASSERT_EQ(0, rbd.rename(ioctx, name.c_str(), new_name.c_str()));
+  ASSERT_EQ(0, image1.is_exclusive_lock_owner(&lock_owner));
+  ASSERT_TRUE(lock_owner);
+  // the image must be reachable under its new name
+  librbd::Image image2;
+  ASSERT_EQ(0, rbd.open(ioctx, image2, new_name.c_str(), NULL));
+}
+
 TEST_F(TestLibRBD, SnapCreateViaLockOwner)
 {
   REQUIRE_FEATURE(RBD_FEATURE_LAYERING | RBD_FEATURE_EXCLUSIVE_LOCK);
@@ -2772,6 +2954,129 @@ TEST_F(TestLibRBD, SnapRemoveViaLockOwner)
   ASSERT_TRUE(lock_owner);
 }
 
+TEST_F(TestLibRBD, SnapRenameViaLockOwner)
+{
+  REQUIRE_FEATURE(RBD_FEATURE_LAYERING | RBD_FEATURE_EXCLUSIVE_LOCK);
+  // A snapshot rename issued through a non-owner handle while image1 holds the lock.
+  librados::IoCtx ioctx;
+  ASSERT_EQ(0, _rados.ioctx_create(m_pool_name.c_str(), ioctx));
+
+  librbd::RBD rbd;
+  std::string name = get_temp_image_name();
+  uint64_t size = 2 << 20;
+  int order = 0;
+  ASSERT_EQ(0, create_image_pp(rbd, ioctx, name.c_str(), size, &order));
+
+  librbd::Image image1;
+  ASSERT_EQ(0, rbd.open(ioctx, image1, name.c_str(), NULL));
+  // after a zero-length write image1 is expected to own the exclusive lock
+  bufferlist bl;
+  ASSERT_EQ(0, image1.write(0, 0, bl));
+  ASSERT_EQ(0, image1.snap_create("snap1"));
+
+  bool lock_owner;
+  ASSERT_EQ(0, image1.is_exclusive_lock_owner(&lock_owner));
+  ASSERT_TRUE(lock_owner);
+
+  librbd::Image image2;
+  ASSERT_EQ(0, rbd.open(ioctx, image2, name.c_str(), NULL));
+
+  ASSERT_EQ(0, image2.is_exclusive_lock_owner(&lock_owner));
+  ASSERT_FALSE(lock_owner);
+  // rename via the non-owner; the new name must be visible through both handles
+  ASSERT_EQ(0, image2.snap_rename("snap1", "snap1-rename"));
+  ASSERT_TRUE(image1.snap_exists("snap1-rename"));
+  ASSERT_TRUE(image2.snap_exists("snap1-rename"));
+  // image1 must still own the lock afterwards
+  ASSERT_EQ(0, image1.is_exclusive_lock_owner(&lock_owner));
+  ASSERT_TRUE(lock_owner);
+}
+
+TEST_F(TestLibRBD, SnapProtectViaLockOwner)
+{
+  REQUIRE_FEATURE(RBD_FEATURE_LAYERING | RBD_FEATURE_EXCLUSIVE_LOCK);
+  // A snapshot protect issued through a non-owner handle while image1 holds the lock.
+  librados::IoCtx ioctx;
+  ASSERT_EQ(0, _rados.ioctx_create(m_pool_name.c_str(), ioctx));
+
+  librbd::RBD rbd;
+  std::string name = get_temp_image_name();
+  uint64_t size = 2 << 20;
+  int order = 0;
+  ASSERT_EQ(0, create_image_pp(rbd, ioctx, name.c_str(), size, &order));
+
+  librbd::Image image1;
+  ASSERT_EQ(0, rbd.open(ioctx, image1, name.c_str(), NULL));
+  // after a zero-length write image1 is expected to own the exclusive lock
+  bufferlist bl;
+  ASSERT_EQ(0, image1.write(0, 0, bl));
+
+  bool lock_owner;
+  ASSERT_EQ(0, image1.is_exclusive_lock_owner(&lock_owner));
+  ASSERT_TRUE(lock_owner);
+  ASSERT_EQ(0, image1.snap_create("snap1"));
+
+  librbd::Image image2;
+  ASSERT_EQ(0, rbd.open(ioctx, image2, name.c_str(), NULL));
+
+  ASSERT_EQ(0, image2.is_exclusive_lock_owner(&lock_owner));
+  ASSERT_FALSE(lock_owner);
+  // protect via the non-owner; both handles must report the snapshot as protected
+  ASSERT_EQ(0, image2.snap_protect("snap1"));
+  bool is_protected;
+  ASSERT_EQ(0, image2.snap_is_protected("snap1", &is_protected));
+  ASSERT_TRUE(is_protected);
+  ASSERT_EQ(0, image1.snap_is_protected("snap1", &is_protected));
+  ASSERT_TRUE(is_protected);
+  // image1 must still own the lock afterwards
+  ASSERT_EQ(0, image1.is_exclusive_lock_owner(&lock_owner));
+  ASSERT_TRUE(lock_owner);
+}
+
+TEST_F(TestLibRBD, SnapUnprotectViaLockOwner)
+{
+  REQUIRE_FEATURE(RBD_FEATURE_LAYERING | RBD_FEATURE_EXCLUSIVE_LOCK);
+  // A snapshot unprotect issued through a non-owner handle while image1 holds the lock.
+  librados::IoCtx ioctx;
+  ASSERT_EQ(0, _rados.ioctx_create(m_pool_name.c_str(), ioctx));
+
+  librbd::RBD rbd;
+  std::string name = get_temp_image_name();
+  uint64_t size = 2 << 20;
+  int order = 0;
+  ASSERT_EQ(0, create_image_pp(rbd, ioctx, name.c_str(), size, &order));
+
+  librbd::Image image1;
+  ASSERT_EQ(0, rbd.open(ioctx, image1, name.c_str(), NULL));
+  // after a zero-length write image1 is expected to own the exclusive lock
+  bufferlist bl;
+  ASSERT_EQ(0, image1.write(0, 0, bl));
+
+  bool lock_owner;
+  ASSERT_EQ(0, image1.is_exclusive_lock_owner(&lock_owner));
+  ASSERT_TRUE(lock_owner);
+  ASSERT_EQ(0, image1.snap_create("snap1"));
+  ASSERT_EQ(0, image1.snap_protect("snap1"));
+  bool is_protected;
+  ASSERT_EQ(0, image1.snap_is_protected("snap1", &is_protected));
+  ASSERT_TRUE(is_protected);
+
+  librbd::Image image2;
+  ASSERT_EQ(0, rbd.open(ioctx, image2, name.c_str(), NULL));
+
+  ASSERT_EQ(0, image2.is_exclusive_lock_owner(&lock_owner));
+  ASSERT_FALSE(lock_owner);
+  // unprotect via the non-owner; both handles must report the snapshot as unprotected
+  ASSERT_EQ(0, image2.snap_unprotect("snap1"));
+  ASSERT_EQ(0, image2.snap_is_protected("snap1", &is_protected));
+  ASSERT_FALSE(is_protected);
+  ASSERT_EQ(0, image1.snap_is_protected("snap1", &is_protected));
+  ASSERT_FALSE(is_protected);
+  // image1 must still own the lock afterwards
+  ASSERT_EQ(0, image1.is_exclusive_lock_owner(&lock_owner));
+  ASSERT_TRUE(lock_owner);
+}
+
 TEST_F(TestLibRBD, FlattenViaLockOwner)
 {
   REQUIRE_FEATURE(RBD_FEATURE_EXCLUSIVE_LOCK);
@@ -3090,14 +3395,8 @@ TEST_F(TestLibRBD, RebuildObjectMap)
     ASSERT_EQ(0, image.snap_create("snap1"));
     ASSERT_EQ(bl.length(), image.write(1<<order, bl.length(), bl));
 
-    librbd::image_info_t info;
-    ASSERT_EQ(0, image.stat(info, sizeof(info)));
-
-    char prefix[RBD_MAX_BLOCK_NAME_SIZE + 1];
-    strncpy(prefix, info.block_name_prefix, RBD_MAX_BLOCK_NAME_SIZE);
-    prefix[RBD_MAX_BLOCK_NAME_SIZE] = '\0';
-
-    std::string image_id(prefix + strlen(RBD_DATA_PREFIX));
+    std::string image_id;
+    ASSERT_EQ(0, get_image_id(image, &image_id));
     object_map_oid = RBD_OBJECT_MAP_PREFIX + image_id;
   }
 
@@ -3107,6 +3406,12 @@ TEST_F(TestLibRBD, RebuildObjectMap)
   librbd::Image image1;
   ASSERT_EQ(0, rbd.open(ioctx, image1, name.c_str(), NULL));
 
+  bool lock_owner;
+  bl.clear();
+  ASSERT_EQ(0, image1.write(0, 0, bl));
+  ASSERT_EQ(0, image1.is_exclusive_lock_owner(&lock_owner));
+  ASSERT_TRUE(lock_owner);
+
   uint64_t flags;
   ASSERT_EQ(0, image1.get_flags(&flags));
   ASSERT_TRUE((flags & RBD_FLAG_OBJECT_MAP_INVALID) != 0);
@@ -3130,7 +3435,7 @@ TEST_F(TestLibRBD, RebuildObjectMap)
 
 TEST_F(TestLibRBD, RebuildNewObjectMap)
 {
-  REQUIRE_FEATURE(RBD_FEATURE_LAYERING);
+  REQUIRE_FEATURE(RBD_FEATURE_OBJECT_MAP);
 
   rados_ioctx_t ioctx;
   rados_ioctx_create(_cluster, m_pool_name.c_str(), &ioctx);
@@ -3475,3 +3780,105 @@ TEST_F(TestLibRBD, TestImageOptionsPP)
 
   ASSERT_EQ(0, parent.close());
 }
+
+TEST_F(TestLibRBD, ImagePollIO)
+{
+#ifdef HAVE_EVENTFD
+  // Checks that AIO completions are signalled through a registered eventfd.
+  rados_ioctx_t ioctx;
+  rados_ioctx_create(_cluster, m_pool_name.c_str(), &ioctx);
+
+  rbd_image_t image;
+  int order = 0;
+  std::string name = get_temp_image_name();
+  uint64_t size = 2 << 20;
+  int fd = eventfd(0, EFD_NONBLOCK);
+  ASSERT_NE(-1, fd);  // eventfd() returns -1 on failure
+
+  ASSERT_EQ(0, create_image(ioctx, name.c_str(), size, &order));
+  ASSERT_EQ(0, rbd_open(ioctx, name.c_str(), &image, NULL));
+  ASSERT_EQ(0, rbd_set_image_notification(image, fd, EVENT_SOCKET_TYPE_EVENTFD));
+
+  // random printable payload, NUL-terminated because the helpers may print it with %s
+  char test_data[TEST_IO_SIZE + 1];
+  int i;
+  for (i = 0; i < TEST_IO_SIZE; ++i)
+    test_data[i] = (char) (rand() % (126 - 33) + 33);
+  test_data[TEST_IO_SIZE] = '\0';
+
+  for (i = 0; i < 5; ++i)
+    ASSERT_PASSED(write_test_data, image, test_data, TEST_IO_SIZE * i, TEST_IO_SIZE, 0);
+
+  for (i = 5; i < 10; ++i)
+    ASSERT_PASSED(aio_write_test_data_and_poll, image, fd, test_data, TEST_IO_SIZE * i, TEST_IO_SIZE, 0);
+
+  for (i = 5; i < 10; ++i)
+    ASSERT_PASSED(aio_read_test_data_and_poll, image, fd, test_data, TEST_IO_SIZE * i, TEST_IO_SIZE, 0);
+
+  ASSERT_EQ(0, rbd_close(image));
+  ASSERT_EQ(0, close(fd));  // release the notification descriptor (previously leaked)
+  rados_ioctx_destroy(ioctx);
+#endif
+}
+
+namespace librbd {
+// Field-wise equality so the Mirror test can compare vectors of peers with ASSERT_EQ.
+static bool operator==(const mirror_peer_t &lhs, const mirror_peer_t &rhs) {
+  return (lhs.cluster_uuid == rhs.cluster_uuid &&
+          lhs.cluster_name == rhs.cluster_name &&
+          lhs.client_name == rhs.client_name);
+}
+
+} // namespace librbd
+
+TEST_F(TestLibRBD, Mirror) {
+  librados::IoCtx ioctx;
+  ASSERT_EQ(0, _rados.ioctx_create(m_pool_name.c_str(), ioctx));
+
+  librbd::RBD rbd;
+  // a fresh pool is expected to have no peers
+  std::vector<librbd::mirror_peer_t> expected_peers;
+  std::vector<librbd::mirror_peer_t> peers;
+  ASSERT_EQ(0, rbd.mirror_peer_list(ioctx, &peers));
+  ASSERT_EQ(expected_peers, peers);
+  // adding a peer before mirroring is enabled must be rejected
+  ASSERT_EQ(-EINVAL, rbd.mirror_peer_add(ioctx, "uuid1", "cluster1", "client"));
+
+  bool enabled;
+  ASSERT_EQ(0, rbd.mirror_is_enabled(ioctx, &enabled));
+  ASSERT_FALSE(enabled);
+  ASSERT_EQ(0, rbd.mirror_set_enabled(ioctx, true));
+  ASSERT_EQ(0, rbd.mirror_is_enabled(ioctx, &enabled));
+  ASSERT_TRUE(enabled);
+  // a reused uuid or a reused cluster name must both fail with -EEXIST
+  ASSERT_EQ(0, rbd.mirror_peer_add(ioctx, "uuid1", "cluster1", "client"));
+  ASSERT_EQ(0, rbd.mirror_peer_add(ioctx, "uuid2", "cluster2", "admin"));
+  ASSERT_EQ(-EEXIST, rbd.mirror_peer_add(ioctx, "uuid2", "cluster3", "foo"));
+  ASSERT_EQ(-EEXIST, rbd.mirror_peer_add(ioctx, "uuid3", "cluster1", "foo"));
+  ASSERT_EQ(0, rbd.mirror_peer_add(ioctx, "uuid3", "cluster3", "admin"));
+
+  ASSERT_EQ(0, rbd.mirror_peer_list(ioctx, &peers));
+  expected_peers = {
+    {"uuid1", "cluster1", "client"},
+    {"uuid2", "cluster2", "admin"},
+    {"uuid3", "cluster3", "admin"}};
+  ASSERT_EQ(expected_peers, peers);
+  // removing an unknown peer ("uuid4") is expected to succeed as well
+  ASSERT_EQ(0, rbd.mirror_peer_remove(ioctx, "uuid4"));
+  ASSERT_EQ(0, rbd.mirror_peer_remove(ioctx, "uuid2"));
+  // updating an unknown peer must fail with -ENOENT
+  ASSERT_EQ(-ENOENT, rbd.mirror_peer_set_client(ioctx, "uuid4", "new client"));
+  ASSERT_EQ(0, rbd.mirror_peer_set_client(ioctx, "uuid1", "new client"));
+
+  ASSERT_EQ(-ENOENT, rbd.mirror_peer_set_cluster(ioctx, "uuid4",
+                                                 "new cluster"));
+  ASSERT_EQ(0, rbd.mirror_peer_set_cluster(ioctx, "uuid3", "new cluster"));
+
+  ASSERT_EQ(0, rbd.mirror_peer_list(ioctx, &peers));
+  expected_peers = {
+    {"uuid1", "cluster1", "new client"},
+    {"uuid3", "new cluster", "admin"}};
+  ASSERT_EQ(expected_peers, peers);
+  // disabling is expected to fail with -EBUSY while peers are still registered
+  ASSERT_EQ(-EBUSY, rbd.mirror_set_enabled(ioctx, false));
+}
diff --git a/src/test/librbd/test_mock_ExclusiveLock.cc b/src/test/librbd/test_mock_ExclusiveLock.cc
new file mode 100644
index 0000000..cac28a6
--- /dev/null
+++ b/src/test/librbd/test_mock_ExclusiveLock.cc
@@ -0,0 +1,556 @@
+// -*- mode:C; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "test/librbd/test_mock_fixture.h"
+#include "test/librbd/test_support.h"
+#include "test/librbd/mock/MockImageCtx.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/exclusive_lock/AcquireRequest.h"
+#include "librbd/exclusive_lock/ReleaseRequest.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include <list>
+
+namespace librbd {
+namespace exclusive_lock {
+
+template<typename T>
+struct BaseRequest {
+  static std::list<T *> s_requests;  // FIFO of constructed mocks waiting to be handed out by create()
+  Context *on_lock_unlock;
+  Context *on_finish;
+  // Returns the oldest registered mock instead of allocating; image_ctx and cookie are ignored.
+  static T* create(MockImageCtx &image_ctx, const std::string &cookie,
+                   Context *on_lock_unlock, Context *on_finish) {
+    assert(!s_requests.empty());
+    T* req = s_requests.front();
+    req->on_lock_unlock = on_lock_unlock;
+    req->on_finish = on_finish;
+    s_requests.pop_front();
+    return req;
+  }
+  // Each constructed mock registers itself. NOTE(review): reinterpret_cast assumes this base sits at offset 0 of T.
+  BaseRequest() {
+    s_requests.push_back(reinterpret_cast<T*>(this));
+  }
+};
+
+template<typename T>
+std::list<T *> BaseRequest<T>::s_requests;
+
+template <>
+struct AcquireRequest<MockImageCtx> : public BaseRequest<AcquireRequest<MockImageCtx> > {
+  MOCK_METHOD0(send, void());  // mocked so tests decide how the acquire request completes
+};
+
+template <>
+struct ReleaseRequest<MockImageCtx> : public BaseRequest<ReleaseRequest<MockImageCtx> > {
+  MOCK_METHOD0(send, void());  // mocked so tests decide how the release request completes
+};
+
+} // namespace exclusive_lock
+} // namespace librbd
+
+// template definitions
+#include "librbd/ExclusiveLock.cc"
+template class librbd::ExclusiveLock<librbd::MockImageCtx>;
+
+namespace librbd {
+
+using ::testing::_;
+using ::testing::Invoke;
+using ::testing::InSequence;
+using ::testing::Return;
+
+class TestMockExclusiveLock : public TestMockFixture {
+public:
+  typedef ExclusiveLock<MockImageCtx> MockExclusiveLock;
+  typedef exclusive_lock::AcquireRequest<MockImageCtx> MockAcquireRequest;
+  typedef exclusive_lock::ReleaseRequest<MockImageCtx> MockReleaseRequest;
+  // expect_*: register gmock expectations; when_*: run one operation and wait for its result.
+  void expect_get_watch_handle(MockImageCtx &mock_image_ctx) {
+    EXPECT_CALL(*mock_image_ctx.image_watcher, get_watch_handle())
+                  .WillRepeatedly(Return(1234567890));
+  }
+
+  void expect_block_writes(MockImageCtx &mock_image_ctx) {
+    EXPECT_CALL(*mock_image_ctx.aio_work_queue, block_writes(_))
+                  .WillOnce(CompleteContext(0, mock_image_ctx.image_ctx->op_work_queue));
+  }
+
+  void expect_unblock_writes(MockImageCtx &mock_image_ctx) {
+    EXPECT_CALL(*mock_image_ctx.aio_work_queue, unblock_writes());
+  }
+  // The acquired-lock notification is only expected when the request finishes with r == 0.
+  void expect_acquire_lock(MockImageCtx &mock_image_ctx,
+                           MockAcquireRequest &acquire_request, int r) {
+    expect_get_watch_handle(mock_image_ctx);
+    EXPECT_CALL(acquire_request, send())
+                  .WillOnce(FinishRequest(&acquire_request, r, &mock_image_ctx));
+    if (r == 0) {
+      expect_notify_acquired_lock(mock_image_ctx);
+    }
+  }
+  // NOTE(review): shutting_down is accepted but not used by this helper.
+  void expect_release_lock(MockImageCtx &mock_image_ctx,
+                           MockReleaseRequest &release_request, int r,
+                           bool shutting_down = false) {
+    EXPECT_CALL(release_request, send())
+                  .WillOnce(FinishRequest(&release_request, r, &mock_image_ctx));
+    if (r == 0) {
+      expect_notify_released_lock(mock_image_ctx);
+      expect_writes_empty(mock_image_ctx);
+    }
+  }
+  // Every notify_request_lock() call is answered by invoking handle_lock_released() on the lock.
+  void expect_notify_request_lock(MockImageCtx &mock_image_ctx,
+                                  MockExclusiveLock &mock_exclusive_lock) {
+    EXPECT_CALL(*mock_image_ctx.image_watcher, notify_request_lock())
+                  .WillRepeatedly(Invoke(&mock_exclusive_lock,
+                                         &MockExclusiveLock::handle_lock_released));
+  }
+
+  void expect_notify_acquired_lock(MockImageCtx &mock_image_ctx) {
+    EXPECT_CALL(*mock_image_ctx.image_watcher, notify_acquired_lock())
+                  .Times(1);
+  }
+
+  void expect_notify_released_lock(MockImageCtx &mock_image_ctx) {
+    EXPECT_CALL(*mock_image_ctx.image_watcher, notify_released_lock())
+                  .Times(1);
+  }
+
+  void expect_writes_empty(MockImageCtx &mock_image_ctx) {
+    EXPECT_CALL(*mock_image_ctx.aio_work_queue, writes_empty())
+                  .WillRepeatedly(Return(true));
+  }
+  // The when_* helpers drop owner_lock before blocking in ctx.wait().
+  int when_init(MockImageCtx &mock_image_ctx,
+                MockExclusiveLock &exclusive_lock) {
+    C_SaferCond ctx;
+    {
+      RWLock::WLocker owner_locker(mock_image_ctx.owner_lock);
+      exclusive_lock.init(&ctx);
+    }
+    return ctx.wait();
+  }
+
+  int when_try_lock(MockImageCtx &mock_image_ctx,
+                     MockExclusiveLock &exclusive_lock) {
+    C_SaferCond ctx;
+    {
+      RWLock::WLocker owner_locker(mock_image_ctx.owner_lock);
+      exclusive_lock.try_lock(&ctx);
+    }
+    return ctx.wait();
+  }
+  int when_request_lock(MockImageCtx &mock_image_ctx,
+                     MockExclusiveLock &exclusive_lock) {
+    C_SaferCond ctx;
+    {
+      RWLock::RLocker owner_locker(mock_image_ctx.owner_lock);  // read lock here; the other when_* helpers take the write lock
+      exclusive_lock.request_lock(&ctx);
+    }
+    return ctx.wait();
+  }
+  int when_release_lock(MockImageCtx &mock_image_ctx,
+                     MockExclusiveLock &exclusive_lock) {
+    C_SaferCond ctx;
+    {
+      RWLock::WLocker owner_locker(mock_image_ctx.owner_lock);
+      exclusive_lock.release_lock(&ctx);
+    }
+    return ctx.wait();
+  }
+  int when_shut_down(MockImageCtx &mock_image_ctx,
+                     MockExclusiveLock &exclusive_lock) {
+    C_SaferCond ctx;
+    {
+      RWLock::WLocker owner_locker(mock_image_ctx.owner_lock);
+      exclusive_lock.shut_down(&ctx);
+    }
+    return ctx.wait();
+  }
+  // Queries ownership while holding owner_lock for reading.
+  bool is_lock_owner(MockImageCtx &mock_image_ctx,
+                     MockExclusiveLock &exclusive_lock) {
+    RWLock::RLocker owner_locker(mock_image_ctx.owner_lock);
+    return exclusive_lock.is_lock_owner();
+  }
+};
+
+TEST_F(TestMockExclusiveLock, StateTransitions) {
+  REQUIRE_FEATURE(RBD_FEATURE_EXCLUSIVE_LOCK);
+  // Walks init -> try_lock -> release_lock -> request_lock -> shut_down, checking ownership after each step.
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+  MockImageCtx mock_image_ctx(*ictx);
+  MockExclusiveLock exclusive_lock(mock_image_ctx);
+  expect_op_work_queue(mock_image_ctx);
+
+  InSequence seq;
+  expect_block_writes(mock_image_ctx);
+  ASSERT_EQ(0, when_init(mock_image_ctx, exclusive_lock));
+
+  MockAcquireRequest try_lock_acquire;
+  expect_acquire_lock(mock_image_ctx, try_lock_acquire, 0);
+  ASSERT_EQ(0, when_try_lock(mock_image_ctx, exclusive_lock));
+  ASSERT_TRUE(is_lock_owner(mock_image_ctx, exclusive_lock));
+
+  MockReleaseRequest request_release;
+  expect_release_lock(mock_image_ctx, request_release, 0);
+  ASSERT_EQ(0, when_release_lock(mock_image_ctx, exclusive_lock));
+  ASSERT_FALSE(is_lock_owner(mock_image_ctx, exclusive_lock));
+
+  MockAcquireRequest request_lock_acquire;
+  expect_acquire_lock(mock_image_ctx, request_lock_acquire, 0);
+  ASSERT_EQ(0, when_request_lock(mock_image_ctx, exclusive_lock));
+  ASSERT_TRUE(is_lock_owner(mock_image_ctx, exclusive_lock));
+  // shutting down while the lock is held is expected to issue a release request
+  MockReleaseRequest shutdown_release;
+  expect_op_work_queue(mock_image_ctx);
+  expect_release_lock(mock_image_ctx, shutdown_release, 0, true);
+  ASSERT_EQ(0, when_shut_down(mock_image_ctx, exclusive_lock));
+  ASSERT_FALSE(is_lock_owner(mock_image_ctx, exclusive_lock));
+}
+
+TEST_F(TestMockExclusiveLock, TryLockLockedState) {
+  REQUIRE_FEATURE(RBD_FEATURE_EXCLUSIVE_LOCK);
+  // A second try_lock while already locked must succeed; only one acquire request is set up.
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+  MockImageCtx mock_image_ctx(*ictx);
+  MockExclusiveLock exclusive_lock(mock_image_ctx);
+  expect_op_work_queue(mock_image_ctx);
+
+  InSequence seq;
+  expect_block_writes(mock_image_ctx);
+  ASSERT_EQ(0, when_init(mock_image_ctx, exclusive_lock));
+
+  MockAcquireRequest try_lock_acquire;
+  expect_acquire_lock(mock_image_ctx, try_lock_acquire, 0);
+  ASSERT_EQ(0, when_try_lock(mock_image_ctx, exclusive_lock));
+  ASSERT_EQ(0, when_try_lock(mock_image_ctx, exclusive_lock));
+
+  MockReleaseRequest shutdown_release;
+  expect_op_work_queue(mock_image_ctx);
+  expect_release_lock(mock_image_ctx, shutdown_release, 0, true);
+  ASSERT_EQ(0, when_shut_down(mock_image_ctx, exclusive_lock));
+}
+
+TEST_F(TestMockExclusiveLock, TryLockAlreadyLocked) {
+  REQUIRE_FEATURE(RBD_FEATURE_EXCLUSIVE_LOCK);
+  // An acquire request finishing with -EAGAIN makes try_lock return 0 without gaining ownership.
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+  MockImageCtx mock_image_ctx(*ictx);
+  MockExclusiveLock exclusive_lock(mock_image_ctx);
+  expect_op_work_queue(mock_image_ctx);
+
+  InSequence seq;
+  expect_block_writes(mock_image_ctx);
+  ASSERT_EQ(0, when_init(mock_image_ctx, exclusive_lock));
+
+  MockAcquireRequest try_lock_acquire;
+  expect_acquire_lock(mock_image_ctx, try_lock_acquire, -EAGAIN);
+  ASSERT_EQ(0, when_try_lock(mock_image_ctx, exclusive_lock));
+  ASSERT_FALSE(is_lock_owner(mock_image_ctx, exclusive_lock));
+  // no release request is set up: shut_down without the lock only unblocks writes
+  expect_unblock_writes(mock_image_ctx);
+  ASSERT_EQ(0, when_shut_down(mock_image_ctx, exclusive_lock));
+}
+
+TEST_F(TestMockExclusiveLock, TryLockBusy) {
+  REQUIRE_FEATURE(RBD_FEATURE_EXCLUSIVE_LOCK);
+  // acquire completes with -EBUSY: try_lock surfaces -EBUSY, lock not owned
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+  MockImageCtx mock_image_ctx(*ictx);
+  MockExclusiveLock exclusive_lock(mock_image_ctx);
+  expect_op_work_queue(mock_image_ctx);
+
+  InSequence seq;  // gmock: expectations set below must be matched in order
+  expect_block_writes(mock_image_ctx);
+  ASSERT_EQ(0, when_init(mock_image_ctx, exclusive_lock));
+
+  MockAcquireRequest try_lock_acquire;
+  expect_acquire_lock(mock_image_ctx, try_lock_acquire, -EBUSY);
+  ASSERT_EQ(-EBUSY, when_try_lock(mock_image_ctx, exclusive_lock));
+  ASSERT_FALSE(is_lock_owner(mock_image_ctx, exclusive_lock));
+
+  expect_unblock_writes(mock_image_ctx);  // no release request is primed here
+  expect_op_work_queue(mock_image_ctx);
+  ASSERT_EQ(0, when_shut_down(mock_image_ctx, exclusive_lock));
+}
+
+TEST_F(TestMockExclusiveLock, TryLockError) {
+  REQUIRE_FEATURE(RBD_FEATURE_EXCLUSIVE_LOCK);
+  // acquire completes with -EINVAL: try_lock surfaces -EINVAL, lock not owned
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+  MockImageCtx mock_image_ctx(*ictx);
+  MockExclusiveLock exclusive_lock(mock_image_ctx);
+  expect_op_work_queue(mock_image_ctx);
+
+  InSequence seq;  // gmock: expectations set below must be matched in order
+  expect_block_writes(mock_image_ctx);
+
+  MockAcquireRequest try_lock_acquire;
+  expect_acquire_lock(mock_image_ctx, try_lock_acquire, -EINVAL);
+  // both expectations above are primed before init is driven
+  ASSERT_EQ(0, when_init(mock_image_ctx, exclusive_lock));
+  ASSERT_EQ(-EINVAL, when_try_lock(mock_image_ctx, exclusive_lock));
+  ASSERT_FALSE(is_lock_owner(mock_image_ctx, exclusive_lock));
+
+  expect_unblock_writes(mock_image_ctx);  // no release request is primed here
+  expect_op_work_queue(mock_image_ctx);
+  ASSERT_EQ(0, when_shut_down(mock_image_ctx, exclusive_lock));
+}
+
+TEST_F(TestMockExclusiveLock, RequestLockLockedState) {
+  REQUIRE_FEATURE(RBD_FEATURE_EXCLUSIVE_LOCK);
+  // request_lock issued after a successful try_lock is asserted to return 0
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+  MockImageCtx mock_image_ctx(*ictx);
+  MockExclusiveLock exclusive_lock(mock_image_ctx);
+  expect_op_work_queue(mock_image_ctx);
+
+  InSequence seq;  // gmock: expectations set below must be matched in order
+  expect_block_writes(mock_image_ctx);
+  ASSERT_EQ(0, when_init(mock_image_ctx, exclusive_lock));
+
+  MockAcquireRequest try_lock_acquire;
+  expect_acquire_lock(mock_image_ctx, try_lock_acquire, 0);
+  ASSERT_EQ(0, when_try_lock(mock_image_ctx, exclusive_lock));
+
+  MockReleaseRequest shutdown_release;
+  expect_op_work_queue(mock_image_ctx);
+  expect_release_lock(mock_image_ctx, shutdown_release, 0, true);
+  ASSERT_EQ(0, when_request_lock(mock_image_ctx, exclusive_lock));
+  // the release primed above is for the shut down that follows
+  ASSERT_EQ(0, when_shut_down(mock_image_ctx, exclusive_lock));
+}
+
+TEST_F(TestMockExclusiveLock, RequestLockBlacklist) {
+  REQUIRE_FEATURE(RBD_FEATURE_EXCLUSIVE_LOCK);
+  // acquire completes with -EBLACKLISTED: request_lock surfaces it, not owned
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+  MockImageCtx mock_image_ctx(*ictx);
+  MockExclusiveLock exclusive_lock(mock_image_ctx);
+  expect_op_work_queue(mock_image_ctx);
+
+  InSequence seq;  // gmock: expectations set below must be matched in order
+  expect_block_writes(mock_image_ctx);
+  ASSERT_EQ(0, when_init(mock_image_ctx, exclusive_lock));
+
+  // will abort after seeing blacklist error (avoid infinite request loop)
+  MockAcquireRequest request_lock_acquire;
+  expect_acquire_lock(mock_image_ctx, request_lock_acquire, -EBLACKLISTED);
+  expect_notify_request_lock(mock_image_ctx, exclusive_lock);
+  ASSERT_EQ(-EBLACKLISTED, when_request_lock(mock_image_ctx, exclusive_lock));
+  ASSERT_FALSE(is_lock_owner(mock_image_ctx, exclusive_lock));
+
+  expect_unblock_writes(mock_image_ctx);  // no release request is primed here
+  expect_op_work_queue(mock_image_ctx);
+  ASSERT_EQ(0, when_shut_down(mock_image_ctx, exclusive_lock));
+}
+
+TEST_F(TestMockExclusiveLock, RequestLockBusy) {
+  REQUIRE_FEATURE(RBD_FEATURE_EXCLUSIVE_LOCK);
+  // first acquire fails with -EBUSY, second succeeds: request_lock returns 0
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+  MockImageCtx mock_image_ctx(*ictx);
+  MockExclusiveLock exclusive_lock(mock_image_ctx);
+  expect_op_work_queue(mock_image_ctx);
+
+  InSequence seq;  // gmock: expectations set below must be matched in order
+  expect_block_writes(mock_image_ctx);
+  ASSERT_EQ(0, when_init(mock_image_ctx, exclusive_lock));
+
+  // will repeat until successfully acquires the lock
+  MockAcquireRequest request_lock_acquire1;
+  expect_acquire_lock(mock_image_ctx, request_lock_acquire1, -EBUSY);
+  expect_notify_request_lock(mock_image_ctx, exclusive_lock);
+
+  MockAcquireRequest request_lock_acquire2;
+  expect_acquire_lock(mock_image_ctx, request_lock_acquire2, 0);
+  ASSERT_EQ(0, when_request_lock(mock_image_ctx, exclusive_lock));
+  ASSERT_TRUE(is_lock_owner(mock_image_ctx, exclusive_lock));
+
+  MockReleaseRequest shutdown_release;
+  expect_op_work_queue(mock_image_ctx);
+  expect_release_lock(mock_image_ctx, shutdown_release, 0, true);
+  ASSERT_EQ(0, when_shut_down(mock_image_ctx, exclusive_lock));
+}
+
+TEST_F(TestMockExclusiveLock, RequestLockError) {
+  REQUIRE_FEATURE(RBD_FEATURE_EXCLUSIVE_LOCK);
+  // first acquire fails with -EINVAL, second succeeds: request_lock returns 0
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+  MockImageCtx mock_image_ctx(*ictx);
+  MockExclusiveLock exclusive_lock(mock_image_ctx);
+  expect_op_work_queue(mock_image_ctx);
+
+  InSequence seq;  // gmock: expectations set below must be matched in order
+  expect_block_writes(mock_image_ctx);
+  ASSERT_EQ(0, when_init(mock_image_ctx, exclusive_lock));
+
+  // will repeat until successfully acquires the lock
+  MockAcquireRequest request_lock_acquire1;
+  expect_acquire_lock(mock_image_ctx, request_lock_acquire1, -EINVAL);
+  expect_notify_request_lock(mock_image_ctx, exclusive_lock);
+
+  MockAcquireRequest request_lock_acquire2;
+  expect_acquire_lock(mock_image_ctx, request_lock_acquire2, 0);
+  ASSERT_EQ(0, when_request_lock(mock_image_ctx, exclusive_lock));
+  ASSERT_TRUE(is_lock_owner(mock_image_ctx, exclusive_lock));
+
+  MockReleaseRequest shutdown_release;
+  expect_op_work_queue(mock_image_ctx);
+  expect_release_lock(mock_image_ctx, shutdown_release, 0, true);
+  ASSERT_EQ(0, when_shut_down(mock_image_ctx, exclusive_lock));
+}
+
+TEST_F(TestMockExclusiveLock, ReleaseLockUnlockedState) {
+  REQUIRE_FEATURE(RBD_FEATURE_EXCLUSIVE_LOCK);
+  // release_lock without ever acquiring the lock is asserted to return 0
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+  MockImageCtx mock_image_ctx(*ictx);
+  MockExclusiveLock exclusive_lock(mock_image_ctx);
+  expect_op_work_queue(mock_image_ctx);
+
+  InSequence seq;  // gmock: expectations set below must be matched in order
+  expect_block_writes(mock_image_ctx);
+  ASSERT_EQ(0, when_init(mock_image_ctx, exclusive_lock));
+
+  ASSERT_EQ(0, when_release_lock(mock_image_ctx, exclusive_lock));  // no release request primed
+
+  expect_unblock_writes(mock_image_ctx);
+  expect_op_work_queue(mock_image_ctx);
+  ASSERT_EQ(0, when_shut_down(mock_image_ctx, exclusive_lock));
+}
+
+TEST_F(TestMockExclusiveLock, ReleaseLockError) {
+  REQUIRE_FEATURE(RBD_FEATURE_EXCLUSIVE_LOCK);
+  // release completes with -EINVAL: release_lock surfaces it, lock stays owned
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+  MockImageCtx mock_image_ctx(*ictx);
+  MockExclusiveLock exclusive_lock(mock_image_ctx);
+  expect_op_work_queue(mock_image_ctx);
+
+  InSequence seq;  // gmock: expectations set below must be matched in order
+  expect_block_writes(mock_image_ctx);
+  ASSERT_EQ(0, when_init(mock_image_ctx, exclusive_lock));
+
+  MockAcquireRequest try_lock_acquire;
+  expect_acquire_lock(mock_image_ctx, try_lock_acquire, 0);
+  ASSERT_EQ(0, when_try_lock(mock_image_ctx, exclusive_lock));
+
+  MockReleaseRequest release;
+  expect_release_lock(mock_image_ctx, release, -EINVAL);
+
+  ASSERT_EQ(-EINVAL, when_release_lock(mock_image_ctx, exclusive_lock));
+  ASSERT_TRUE(is_lock_owner(mock_image_ctx, exclusive_lock));
+  // a second release, primed to succeed, is expected during shut down
+  MockReleaseRequest shutdown_release;
+  expect_op_work_queue(mock_image_ctx);
+  expect_release_lock(mock_image_ctx, shutdown_release, 0, true);
+  ASSERT_EQ(0, when_shut_down(mock_image_ctx, exclusive_lock));
+  ASSERT_FALSE(is_lock_owner(mock_image_ctx, exclusive_lock));
+}
+
+TEST_F(TestMockExclusiveLock, ConcurrentRequests) {
+  REQUIRE_FEATURE(RBD_FEATURE_EXCLUSIVE_LOCK);
+  // queue try_lock, request_lock (x3) and release_lock before any completes
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+  MockImageCtx mock_image_ctx(*ictx);
+  MockExclusiveLock exclusive_lock(mock_image_ctx);
+  expect_op_work_queue(mock_image_ctx);
+
+  InSequence seq;  // gmock: expectations set below must be matched in order
+  expect_block_writes(mock_image_ctx);
+  ASSERT_EQ(0, when_init(mock_image_ctx, exclusive_lock));
+
+  MockAcquireRequest try_lock_acquire;
+  C_SaferCond wait_for_send_ctx1;
+  expect_get_watch_handle(mock_image_ctx);
+  EXPECT_CALL(try_lock_acquire, send())
+                .WillOnce(Notify(&wait_for_send_ctx1));  // signals only; finished by hand below
+
+  MockAcquireRequest request_acquire;
+  expect_acquire_lock(mock_image_ctx, request_acquire, 0);  // single acquire for all 3 request_lock calls
+
+  MockReleaseRequest release;
+  C_SaferCond wait_for_send_ctx2;
+  EXPECT_CALL(release, send())
+                .WillOnce(Notify(&wait_for_send_ctx2));  // signals only; finished by hand below
+  expect_notify_released_lock(mock_image_ctx);
+  expect_writes_empty(mock_image_ctx);
+
+  C_SaferCond try_request_ctx1;
+  {
+    RWLock::WLocker owner_locker(mock_image_ctx.owner_lock);  // try_lock is issued under the write lock
+    exclusive_lock.try_lock(&try_request_ctx1);
+  }
+
+  C_SaferCond request_lock_ctx1;
+  C_SaferCond request_lock_ctx2;
+  {
+    RWLock::RLocker owner_locker(mock_image_ctx.owner_lock);  // request_lock is issued under the read lock
+    exclusive_lock.request_lock(&request_lock_ctx1);
+    exclusive_lock.request_lock(&request_lock_ctx2);
+  }
+
+  C_SaferCond release_lock_ctx1;
+  {
+    RWLock::WLocker owner_locker(mock_image_ctx.owner_lock);
+    exclusive_lock.release_lock(&release_lock_ctx1);
+  }
+
+  C_SaferCond request_lock_ctx3;
+  {
+    RWLock::RLocker owner_locker(mock_image_ctx.owner_lock);
+    exclusive_lock.request_lock(&request_lock_ctx3);
+  }
+
+  // fail the try_lock
+  ASSERT_EQ(0, wait_for_send_ctx1.wait());
+  try_lock_acquire.on_finish->complete(-EINVAL);
+  ASSERT_EQ(-EINVAL, try_request_ctx1.wait());
+
+  // all three pending request locks should complete
+  ASSERT_EQ(0, request_lock_ctx1.wait());
+  ASSERT_EQ(0, request_lock_ctx2.wait());
+  ASSERT_EQ(0, request_lock_ctx3.wait());  // queued after release_lock, yet awaited before it
+
+  // proceed with the release
+  ASSERT_EQ(0, wait_for_send_ctx2.wait());
+  release.on_finish->complete(0);
+  ASSERT_EQ(0, release_lock_ctx1.wait());
+
+  expect_unblock_writes(mock_image_ctx);
+  expect_op_work_queue(mock_image_ctx);
+  ASSERT_EQ(0, when_shut_down(mock_image_ctx, exclusive_lock));
+}
+
+} // namespace librbd
+
diff --git a/src/test/librbd/test_mock_fixture.cc b/src/test/librbd/test_mock_fixture.cc
index 1839b91..d181914 100644
--- a/src/test/librbd/test_mock_fixture.cc
+++ b/src/test/librbd/test_mock_fixture.cc
@@ -9,9 +9,11 @@
 // template definitions
 #include "librbd/AsyncRequest.cc"
 #include "librbd/AsyncObjectThrottle.cc"
+#include "librbd/operation/Request.cc"
 
 template class librbd::AsyncRequest<librbd::MockImageCtx>;
 template class librbd::AsyncObjectThrottle<librbd::MockImageCtx>;
+template class librbd::operation::Request<librbd::MockImageCtx>;
 
 using ::testing::_;
 using ::testing::DoDefault;
diff --git a/src/test/librbd/test_mock_fixture.h b/src/test/librbd/test_mock_fixture.h
index 150e312..bf8a0af 100644
--- a/src/test/librbd/test_mock_fixture.h
+++ b/src/test/librbd/test_mock_fixture.h
@@ -5,6 +5,7 @@
 #define CEPH_TEST_LIBRBD_TEST_MOCK_FIXTURE_H
 
 #include "test/librbd/test_fixture.h"
+#include "test/librbd/mock/MockImageCtx.h"
 #include "common/WorkQueue.h"
 #include <boost/shared_ptr.hpp>
 #include <gmock/gmock.h>
@@ -18,6 +19,10 @@ namespace librbd {
 class MockImageCtx;
 }
 
+ACTION_P(CopyInBufferlist, str) {  // gmock action parameterised by `str`
+  arg0->append(str);  // arg0: first argument of the mocked call, used as a pointer
+}
+
 ACTION_P2(CompleteContext, r, wq) {
   ContextWQ *context_wq = reinterpret_cast<ContextWQ *>(wq);
   if (context_wq != NULL) {
@@ -31,10 +36,20 @@ ACTION_P(DispatchContext, wq) {
   wq->queue(arg0, arg1);
 }
 
+ACTION_P3(FinishRequest, request, r, mock) {  // gmock action: finish `request` with result `r`
+  librbd::MockImageCtx *mock_image_ctx =
+    reinterpret_cast<librbd::MockImageCtx *>(mock);
+  mock_image_ctx->image_ctx->op_work_queue->queue(request->on_finish, r);  // queued, not run inline
+}
+
 ACTION_P(GetReference, ref_object) {
   ref_object->get();
 }
 
+ACTION_P(Notify, ctx) {  // gmock action: signal `ctx` when the mocked call happens
+  ctx->complete(0);  // always completes with result 0
+}
+
 MATCHER_P(ContentsEqual, bl, "") {
   // TODO fix const-correctness of bufferlist
   return const_cast<bufferlist &>(arg).contents_equal(
@@ -51,6 +66,9 @@ public:
   virtual void SetUp();
   virtual void TearDown();
 
+  ::testing::NiceMock<librados::MockTestMemRadosClient> &get_mock_rados_client() {
+    return *s_mock_rados_client;  // dereferenced unchecked: s_mock_rados_client must be set
+  }
   librados::MockTestMemIoCtxImpl &get_mock_io_ctx(librados::IoCtx &ioctx);
 
   void expect_op_work_queue(librbd::MockImageCtx &mock_image_ctx);
diff --git a/src/test/mon/misc.sh b/src/test/mon/misc.sh
index c11c0eb..1f8f755 100755
--- a/src/test/mon/misc.sh
+++ b/src/test/mon/misc.sh
@@ -43,44 +43,58 @@ function TEST_osd_pool_get_set() {
     local flag
     for flag in hashpspool nodelete nopgchange nosizechange write_fadvise_dontneed noscrub nodeep-scrub; do
         if [ $flag = hashpspool ]; then
-	    ./ceph osd dump | grep 'pool ' | grep $flag || return 1
+	    ceph osd dump | grep 'pool ' | grep $flag || return 1
         else
-	    ! ./ceph osd dump | grep 'pool ' | grep $flag || return 1
+	    ! ceph osd dump | grep 'pool ' | grep $flag || return 1
         fi
-	./ceph osd pool set $TEST_POOL $flag 0 || return 1
-	! ./ceph osd dump | grep 'pool ' | grep $flag || return 1
-	./ceph osd pool set $TEST_POOL $flag 1 || return 1
-	./ceph osd dump | grep 'pool ' | grep $flag || return 1
-	./ceph osd pool set $TEST_POOL $flag false || return 1
-	! ./ceph osd dump | grep 'pool ' | grep $flag || return 1
-	./ceph osd pool set $TEST_POOL $flag false || return 1
+	ceph osd pool set $TEST_POOL $flag 0 || return 1
+	! ceph osd dump | grep 'pool ' | grep $flag || return 1
+	ceph osd pool set $TEST_POOL $flag 1 || return 1
+	ceph osd dump | grep 'pool ' | grep $flag || return 1
+	ceph osd pool set $TEST_POOL $flag false || return 1
+	! ceph osd dump | grep 'pool ' | grep $flag || return 1
+	ceph osd pool set $TEST_POOL $flag false || return 1
         # check that setting false twice does not toggle to true (bug)
-	! ./ceph osd dump | grep 'pool ' | grep $flag || return 1
-	./ceph osd pool set $TEST_POOL $flag true || return 1
-	./ceph osd dump | grep 'pool ' | grep $flag || return 1
+	! ceph osd dump | grep 'pool ' | grep $flag || return 1
+	ceph osd pool set $TEST_POOL $flag true || return 1
+	ceph osd dump | grep 'pool ' | grep $flag || return 1
 	# cleanup
-	./ceph osd pool set $TEST_POOL $flag 0 || return 1
+	ceph osd pool set $TEST_POOL $flag 0 || return 1
     done
 
-    local size=$(./ceph osd pool get $TEST_POOL size|awk '{print $2}')
-    local min_size=$(./ceph osd pool get $TEST_POOL min_size|awk '{print $2}')
+    local size=$(ceph osd pool get $TEST_POOL size|awk '{print $2}')
+    local min_size=$(ceph osd pool get $TEST_POOL min_size|awk '{print $2}')
+
+    ceph osd pool set $TEST_POOL scrub_min_interval 123456 || return 1
+    ceph osd dump | grep 'pool ' | grep 'scrub_min_interval 123456' || return 1
+    ceph osd pool set $TEST_POOL scrub_min_interval 0 || return 1
+    ceph osd dump | grep 'pool ' | grep 'scrub_min_interval' && return 1
+    ceph osd pool set $TEST_POOL scrub_max_interval 123456 || return 1
+    ceph osd dump | grep 'pool ' | grep 'scrub_max_interval 123456' || return 1
+    ceph osd pool set $TEST_POOL scrub_max_interval 0 || return 1
+    ceph osd dump | grep 'pool ' | grep 'scrub_max_interval' && return 1
+    ceph osd pool set $TEST_POOL deep_scrub_interval 123456 || return 1
+    ceph osd dump | grep 'pool ' | grep 'deep_scrub_interval 123456' || return 1
+    ceph osd pool set $TEST_POOL deep_scrub_interval 0 || return 1
+    ceph osd dump | grep 'pool ' | grep 'deep_scrub_interval' && return 1
+
     #replicated pool size restrict in 1 and 10
-    ! ./ceph osd pool set $TEST_POOL 11 || return 1
+    ! ceph osd pool set $TEST_POOL size 11 || return 1
     #replicated pool min_size must be between in 1 and size
-    ! ./ceph osd pool set $TEST_POOL min_size $(expr $size + 1) || return 1
-    ! ./ceph osd pool set $TEST_POOL min_size 0 || return 1
+    ! ceph osd pool set $TEST_POOL min_size $(expr $size + 1) || return 1
+    ! ceph osd pool set $TEST_POOL min_size 0 || return 1
 
     local ecpool=erasepool
-    ./ceph osd pool create $ecpool 12 12 erasure default || return 1
+    ceph osd pool create $ecpool 12 12 erasure default || return 1
     #erasue pool size=k+m, min_size=k
-    local size=$(./ceph osd pool get $ecpool size|awk '{print $2}')
-    local k=$(./ceph osd pool get $ecpool min_size|awk '{print $2}')
+    local size=$(ceph osd pool get $ecpool size|awk '{print $2}')
+    local k=$(ceph osd pool get $ecpool min_size|awk '{print $2}')
     #erasure pool size can't change
-    ! ./ceph osd pool set $ecpool size  $(expr $size + 1) || return 1
+    ! ceph osd pool set $ecpool size  $(expr $size + 1) || return 1
     #erasure pool min_size must be between in k and size
-    ./ceph osd pool set $ecpool min_size $(expr $k + 1) || return 1
-    ! ./ceph osd pool set $ecpool min_size $(expr $k - 1) || return 1
-    ! ./ceph osd pool set $ecpool min_size $(expr $size + 1) || return 1
+    ceph osd pool set $ecpool min_size $(expr $k + 1) || return 1
+    ! ceph osd pool set $ecpool min_size $(expr $k - 1) || return 1
+    ! ceph osd pool set $ecpool min_size $(expr $size + 1) || return 1
 
     teardown $dir || return 1
 }
@@ -118,6 +132,27 @@ function TEST_mon_add_to_single_mon() {
     teardown $dir || return 1
 }
 
+function TEST_no_segfault_for_bad_keyring() {  # a client using a bogus key must not crash
+    local dir=$1
+    setup $dir || return 1
+    # create a client.admin key and add it to ceph.mon.keyring
+    ceph-authtool --create-keyring $dir/ceph.mon.keyring --gen-key -n mon. --cap mon 'allow *' || return 1
+    ceph-authtool --create-keyring $dir/ceph.client.admin.keyring --gen-key -n client.admin --cap mon 'allow *' || return 1
+    ceph-authtool $dir/ceph.mon.keyring --import-keyring $dir/ceph.client.admin.keyring || return 1
+    local CEPH_ARGS_TMP="--fsid=$(uuidgen) --mon-host=127.0.0.1:7102 --auth-supported=cephx "
+    local CEPH_ARGS_orig=$CEPH_ARGS
+    CEPH_ARGS="$CEPH_ARGS_TMP --keyring=$dir/ceph.mon.keyring "
+    run_mon $dir a
+    # create a bad keyring and make sure no segfault occurs when using the bad keyring
+    echo -e "[client.admin]\nkey = BQAUlgtWoFePIxAAQ9YLzJSVgJX5V1lh5gyctg==" > $dir/bad.keyring
+    CEPH_ARGS="$CEPH_ARGS_TMP --keyring=$dir/bad.keyring" ceph osd dump 2> /dev/null
+    local status=$?  # capture before anything else overwrites $?
+    CEPH_ARGS=$CEPH_ARGS_orig  # restore before a possible failing return
+    # 139(11|128) means segfault and core dumped
+    [ $status -eq 139 ] && return 1
+    teardown $dir || return 1
+}
+
 main misc "$@"
 
 # Local Variables:
diff --git a/src/test/multi_stress_watch.cc b/src/test/multi_stress_watch.cc
index 25f7355..4dc5489 100644
--- a/src/test/multi_stress_watch.cc
+++ b/src/test/multi_stress_watch.cc
@@ -13,7 +13,6 @@
 #include <unistd.h>
 
 using namespace librados;
-using ceph::buffer;
 using std::map;
 using std::ostringstream;
 using std::string;
diff --git a/src/test/objectstore_bench.cc b/src/test/objectstore_bench.cc
index d5e9f98..097e406 100644
--- a/src/test/objectstore_bench.cc
+++ b/src/test/objectstore_bench.cc
@@ -145,6 +145,7 @@ void osbench_worker(ObjectStore *os, const Config &cfg,
       delete t;
     }
   }
+  sequencer.flush();
 }
 
 int main(int argc, const char *argv[])
@@ -204,6 +205,40 @@ int main(int argc, const char *argv[])
                           g_conf->osd_objectstore,
                           g_conf->osd_data,
                           g_conf->osd_journal));
+
+  //Checking data folder: create if needed or error if it's not empty
+  DIR *dir = ::opendir(g_conf->osd_data.c_str());
+  if (!dir) {
+    std::string cmd("mkdir -p ");
+    cmd+=g_conf->osd_data;
+    int r = ::system( cmd.c_str() );
+    if( r!=0 ){  // -1 means the shell could not be run; >0 carries mkdir's failure status
+      derr << "Failed to create data directory, ret = " << r << dendl;
+      return 1;
+    }
+  }
+  else {
+     bool non_empty = readdir(dir) != NULL && readdir(dir) != NULL && readdir(dir) != NULL;  // a 3rd entry exists beyond "." and ".."
+     ::closedir(dir);  // close on this branch only: dir is NULL on the mkdir path, and close before returning
+     if( non_empty ){
+       derr << "Data directory '"<<g_conf->osd_data<<"' isn't empty, please clean it first."<< dendl;
+       return 1;
+     }
+  }
+
+  //Create folders for journal if needed (skipped when the journal path has no parent directory component)
+  string journal_base = g_conf->osd_journal.substr(0, g_conf->osd_journal.rfind('/'));
+  struct stat sb;
+  if (!journal_base.empty() && journal_base != g_conf->osd_journal && stat(journal_base.c_str(), &sb) != 0 ){
+    std::string cmd("mkdir -p ");
+    cmd+=journal_base;
+    int r = ::system( cmd.c_str() );
+    if( r!=0 ){  // -1 means the shell could not be run; >0 carries mkdir's failure status
+      derr << "Failed to create journal directory, ret = " << r << dendl;
+      return 1;
+    }
+  }
+
   if (!os) {
     derr << "bad objectstore type " << g_conf->osd_objectstore << dendl;
     return 1;
diff --git a/src/test/opensuse-13.2/ceph.spec.in b/src/test/opensuse-13.2/ceph.spec.in
index 2939fef..52c5c1d 100644
--- a/src/test/opensuse-13.2/ceph.spec.in
+++ b/src/test/opensuse-13.2/ceph.spec.in
@@ -43,6 +43,7 @@ restorecon -R /var/log/ceph > /dev/null 2>&1;
 # /var/run/ceph.
 %if 0%{?fedora} || 0%{?rhel} >= 7 || 0%{?suse_version} >= 1210
 %global _with_systemd 1
+%{!?tmpfiles_create: %global tmpfiles_create systemd-tmpfiles --create}
 %endif
 
 # LTTng-UST enabled on Fedora, RHEL 6, and SLES 12
@@ -106,6 +107,11 @@ BuildRequires:	boost-devel
 BuildRequires:  cmake
 BuildRequires:	cryptsetup
 BuildRequires:	fuse-devel
+%if 0%{?suse_version}
+BuildRequires:	python-Cython
+%else
+BuildRequires:	Cython
+%endif
 BuildRequires:	gdbm
 BuildRequires:	hdparm
 BuildRequires:	leveldb-devel > 1.2
@@ -121,6 +127,7 @@ BuildRequires:	parted
 BuildRequires:	perl
 BuildRequires:	pkgconfig
 BuildRequires:	python
+BuildRequires:	python-devel
 BuildRequires:	python-nose
 BuildRequires:	python-requests
 BuildRequires:	python-virtualenv
@@ -138,6 +145,7 @@ BuildRequires:	yasm
 %if 0%{?_with_systemd}
 BuildRequires:  pkgconfig(systemd)
 BuildRequires:	systemd-rpm-macros
+BuildRequires:	systemd
 %{?systemd_requires}
 %endif
 PreReq:		%fillup_prereq
@@ -253,6 +261,15 @@ Requires:	librbd1 = %{epoch}:%{version}-%{release}
 %description -n rbd-fuse
 FUSE based client to map Ceph rbd images to files
 
+%package -n rbd-nbd
+Summary:	Ceph RBD client base on NBD
+Group:		System Environment/Base
+Requires:	%{name}
+Requires:	librados2 = %{epoch}:%{version}-%{release}
+Requires:	librbd1 = %{epoch}:%{version}-%{release}
+%description -n rbd-nbd
+NBD based client to map Ceph rbd images to local device
+
 %package radosgw
 Summary:	Rados REST gateway
 Group:		Development/Libraries
@@ -628,6 +645,10 @@ install -m 0644 -D etc/sysconfig/ceph $RPM_BUILD_ROOT%{_localstatedir}/adm/fillu
   install -m 0644 -D systemd/ceph-mds at .service $RPM_BUILD_ROOT%{_unitdir}/ceph-mds at .service
   install -m 0644 -D systemd/ceph-radosgw at .service $RPM_BUILD_ROOT%{_unitdir}/ceph-radosgw at .service
   install -m 0644 -D systemd/ceph.target $RPM_BUILD_ROOT%{_unitdir}/ceph.target
+  install -m 0644 -D systemd/ceph-osd.target $RPM_BUILD_ROOT%{_unitdir}/ceph-osd.target
+  install -m 0644 -D systemd/ceph-mon.target $RPM_BUILD_ROOT%{_unitdir}/ceph-mon.target
+  install -m 0644 -D systemd/ceph-mds.target $RPM_BUILD_ROOT%{_unitdir}/ceph-mds.target
+  install -m 0644 -D systemd/ceph-radosgw.target $RPM_BUILD_ROOT%{_unitdir}/ceph-radosgw.target
   install -m 0644 -D systemd/ceph-disk at .service $RPM_BUILD_ROOT%{_unitdir}/ceph-disk at .service
   install -m 0755 -D systemd/ceph $RPM_BUILD_ROOT%{_sbindir}/rcceph
 %else
@@ -786,6 +807,10 @@ rm -rf $RPM_BUILD_ROOT
 %{_unitdir}/ceph-radosgw at .service
 %{_unitdir}/ceph-disk at .service
 %{_unitdir}/ceph.target
+%{_unitdir}/ceph-osd.target
+%{_unitdir}/ceph-mon.target
+%{_unitdir}/ceph-mds.target
+%{_unitdir}/ceph-radosgw.target
 %else
 %{_initrddir}/ceph
 %endif
@@ -939,7 +964,7 @@ exit 0
 
 %post -n ceph-common
 %if 0%{?_with_systemd}
-systemd-tmpfiles --create --prefix=/run/ceph
+%tmpfiles_create %{_tmpfilesdir}/ceph-common.conf
 %endif
 
 %postun -n ceph-common
@@ -967,6 +992,12 @@ fi
 %{_mandir}/man8/rbd-fuse.8*
 
 #################################################################################
+%files -n rbd-nbd
+%defattr(-,root,root,-)
+%{_bindir}/rbd-nbd
+%{_mandir}/man8/rbd-nbd.8*
+
+#################################################################################
 %files radosgw
 %defattr(-,root,root,-)
 %{_bindir}/radosgw
@@ -1057,6 +1088,7 @@ fi
 %{_includedir}/rados/librados.h
 %{_includedir}/rados/librados.hpp
 %{_includedir}/rados/buffer.h
+%{_includedir}/rados/buffer_fwd.h
 %{_includedir}/rados/page.h
 %{_includedir}/rados/crc32c.h
 %{_includedir}/rados/rados_types.h
@@ -1122,7 +1154,8 @@ ln -sf %{_libdir}/librbd.so.1 /usr/lib64/qemu/librbd.so.1
 #################################################################################
 %files -n python-rbd
 %defattr(-,root,root,-)
-%{python_sitelib}/rbd.py*
+%{python_sitearch}/rbd.so
+%{python_sitearch}/rbd-*.egg-info
 
 #################################################################################
 %files -n libcephfs1
diff --git a/src/test/osd/TestPGLog.cc b/src/test/osd/TestPGLog.cc
index 9a63581..e928cea 100644
--- a/src/test/osd/TestPGLog.cc
+++ b/src/test/osd/TestPGLog.cc
@@ -446,6 +446,40 @@ TEST_F(PGLogTest, rewind_divergent_log) {
     EXPECT_TRUE(dirty_info);
     EXPECT_TRUE(dirty_big_info);
   }
+
+  // Test for 13965
+  {
+    clear();
+
+    ObjectStore::Transaction t;
+    list<hobject_t> remove_snap;
+    pg_info_t info;
+    info.log_tail = log.tail = eversion_t(1, 5);
+    info.last_update = eversion_t(1, 6);
+    bool dirty_info = false;
+    bool dirty_big_info = false;
+
+    {
+      pg_log_entry_t e;
+      e.mod_desc.mark_unrollbackable();
+      e.version = eversion_t(1, 5);
+      e.soid.set_hash(0x9);
+      add(e);
+    }
+    {
+      pg_log_entry_t e;
+      e.mod_desc.mark_unrollbackable();
+      e.version = eversion_t(1, 6);
+      e.soid.set_hash(0x10);
+      add(e);
+    }
+    TestHandler h(remove_snap);
+    trim_rollback_info(eversion_t(1, 6), &h);
+    rewind_divergent_log(t, eversion_t(1, 5), info, &h,
+			 dirty_info, dirty_big_info);
+    pg_log_t log;
+    claim_log_and_clear_rollback_info(log, &h);
+  }
 }
 
 TEST_F(PGLogTest, merge_old_entry) {
diff --git a/src/test/osd/osd-scrub-repair.sh b/src/test/osd/osd-scrub-repair.sh
index ca32997..2b22ab9 100755
--- a/src/test/osd/osd-scrub-repair.sh
+++ b/src/test/osd/osd-scrub-repair.sh
@@ -159,25 +159,24 @@ function TEST_auto_repair_erasure_coded() {
             --osd-scrub-min-interval=5 \
             --osd-scrub-interval-randomize-ratio=0
     done
-    wait_for_clean || return 1
 
     # Create an EC pool
     ceph osd erasure-code-profile set myprofile \
         k=2 m=1 ruleset-failure-domain=osd || return 1
     ceph osd pool create $poolname 8 8 erasure myprofile || return 1
-    wait_for_clean || return 1
 
     # Put an object
     local payload=ABCDEF
     echo $payload > $dir/ORIGINAL
     rados --pool $poolname put SOMETHING $dir/ORIGINAL || return 1
+    wait_for_clean || return 1
 
     # Remove the object from one shard physically
     objectstore_tool $dir $(get_not_primary $poolname SOMETHING) SOMETHING remove || return 1
-
-    # Give some time for auto repair
-    sleep 20
-
+    # Wait for auto repair
+    local pgid=$(get_pg $poolname SOMETHING)
+    wait_for_scrub $pgid "$(get_last_scrub_stamp $pgid)"
+    wait_for_clean || return 1
     # Verify - the file should be back
     objectstore_tool $dir $(get_not_primary $poolname SOMETHING) SOMETHING list-attrs || return 1
     rados --pool $poolname get SOMETHING $dir/COPY || return 1
diff --git a/src/test/osd/osd-scrub-snaps.sh b/src/test/osd/osd-scrub-snaps.sh
index 9819c04..bf59570 100755
--- a/src/test/osd/osd-scrub-snaps.sh
+++ b/src/test/osd/osd-scrub-snaps.sh
@@ -92,8 +92,7 @@ function TEST_scrub_snaps() {
     rados -p $poolname rm obj4
     rados -p $poolname rm obj2
 
-    kill_daemons $dir KILL osd || return 1
-    sleep 5
+    kill_daemons $dir TERM osd || return 1
 
     # Don't need to ceph_objectstore_tool function because osd stopped
 
@@ -150,17 +149,18 @@ function TEST_scrub_snaps() {
     run_osd $dir 0 || return 1
     wait_for_clean || return 1
 
-    sleep 5
-    ceph pg scrub ${poolid}.0
-    timeout 30 ceph -w
+    local pgid="${poolid}.0"
+    if ! pg_scrub "$pgid" ; then
+        cat $dir/osd.0.log
+        return 1
+    fi
+    grep 'log_channel' $dir/osd.0.log
 
     for i in `seq 1 7`
     do
         rados -p $poolname rmsnap snap$i
     done
 
-    sleep 10
-
     ERRORS=0
 
     pidfile=$(find $dir 2>/dev/null | grep $name_prefix'[^/]*\.pid')
diff --git a/src/test/osd/types.cc b/src/test/osd/types.cc
index b69a88a..1652ac5 100644
--- a/src/test/osd/types.cc
+++ b/src/test/osd/types.cc
@@ -1388,6 +1388,65 @@ TEST(ghobject_t, cmp) {
   ASSERT_TRUE(cmp_bitwise(o, sep) > 0);
 }
 
+TEST(pool_opts_t, invalid_opt) {  // an unknown option name is rejected
+  EXPECT_FALSE(pool_opts_t::is_opt_name("INVALID_OPT"));
+  EXPECT_THROW(pool_opts_t::get_opt_desc("INVALID_OPT"), FailedAssertion);  // lookup is expected to throw
+}
+
+TEST(pool_opts_t, scrub_min_interval) {
+  // name lookup first, then the unset -> set -> unset life cycle of the value
+  EXPECT_TRUE(pool_opts_t::is_opt_name("scrub_min_interval"));
+  const pool_opts_t::opt_desc_t expected_desc(pool_opts_t::SCRUB_MIN_INTERVAL, pool_opts_t::DOUBLE);
+  EXPECT_EQ(pool_opts_t::get_opt_desc("scrub_min_interval"), expected_desc);
+
+  pool_opts_t pool_opts;
+  double stored;
+  EXPECT_FALSE(pool_opts.is_set(pool_opts_t::SCRUB_MIN_INTERVAL));
+  EXPECT_THROW(pool_opts.get(pool_opts_t::SCRUB_MIN_INTERVAL), FailedAssertion);
+  EXPECT_FALSE(pool_opts.get(pool_opts_t::SCRUB_MIN_INTERVAL, &stored));
+  pool_opts.set(pool_opts_t::SCRUB_MIN_INTERVAL, 2015.0);
+  EXPECT_TRUE(pool_opts.get(pool_opts_t::SCRUB_MIN_INTERVAL, &stored));
+  EXPECT_EQ(stored, 2015);
+  pool_opts.unset(pool_opts_t::SCRUB_MIN_INTERVAL);
+  EXPECT_FALSE(pool_opts.is_set(pool_opts_t::SCRUB_MIN_INTERVAL));
+}
+
+TEST(pool_opts_t, scrub_max_interval) {
+  // name lookup first, then the unset -> set -> unset life cycle of the value
+  EXPECT_TRUE(pool_opts_t::is_opt_name("scrub_max_interval"));
+  const pool_opts_t::opt_desc_t expected_desc(pool_opts_t::SCRUB_MAX_INTERVAL, pool_opts_t::DOUBLE);
+  EXPECT_EQ(pool_opts_t::get_opt_desc("scrub_max_interval"), expected_desc);
+
+  pool_opts_t pool_opts;
+  double stored;
+  EXPECT_FALSE(pool_opts.is_set(pool_opts_t::SCRUB_MAX_INTERVAL));
+  EXPECT_THROW(pool_opts.get(pool_opts_t::SCRUB_MAX_INTERVAL), FailedAssertion);
+  EXPECT_FALSE(pool_opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &stored));
+  pool_opts.set(pool_opts_t::SCRUB_MAX_INTERVAL, 2015.0);
+  EXPECT_TRUE(pool_opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &stored));
+  EXPECT_EQ(stored, 2015);
+  pool_opts.unset(pool_opts_t::SCRUB_MAX_INTERVAL);
+  EXPECT_FALSE(pool_opts.is_set(pool_opts_t::SCRUB_MAX_INTERVAL));
+}
+
+TEST(pool_opts_t, deep_scrub_interval) {
+  // name lookup first, then the unset -> set -> unset life cycle of the value
+  EXPECT_TRUE(pool_opts_t::is_opt_name("deep_scrub_interval"));
+  const pool_opts_t::opt_desc_t expected_desc(pool_opts_t::DEEP_SCRUB_INTERVAL, pool_opts_t::DOUBLE);
+  EXPECT_EQ(pool_opts_t::get_opt_desc("deep_scrub_interval"), expected_desc);
+
+  pool_opts_t pool_opts;
+  double stored;
+  EXPECT_FALSE(pool_opts.is_set(pool_opts_t::DEEP_SCRUB_INTERVAL));
+  EXPECT_THROW(pool_opts.get(pool_opts_t::DEEP_SCRUB_INTERVAL), FailedAssertion);
+  EXPECT_FALSE(pool_opts.get(pool_opts_t::DEEP_SCRUB_INTERVAL, &stored));
+  pool_opts.set(pool_opts_t::DEEP_SCRUB_INTERVAL, 2015.0);
+  EXPECT_TRUE(pool_opts.get(pool_opts_t::DEEP_SCRUB_INTERVAL, &stored));
+  EXPECT_EQ(stored, 2015);
+  pool_opts.unset(pool_opts_t::DEEP_SCRUB_INTERVAL);
+  EXPECT_FALSE(pool_opts.is_set(pool_opts_t::DEEP_SCRUB_INTERVAL));
+}
+
 /*
  * Local Variables:
  * compile-command: "cd ../.. ;
diff --git a/src/test/pybind/test_ceph_argparse.py b/src/test/pybind/test_ceph_argparse.py
index fd44977..53d420f 100755
--- a/src/test/pybind/test_ceph_argparse.py
+++ b/src/test/pybind/test_ceph_argparse.py
@@ -1032,7 +1032,10 @@ class TestOSD(TestArgparse):
 
     def test_pool_get(self):
         for var in ('size', 'min_size', 'crash_replay_interval',
-                    'pg_num', 'pgp_num', 'crush_ruleset', 'auid', 'fast_read'):
+                    'pg_num', 'pgp_num', 'crush_ruleset', 'auid', 'fast_read',
+                    'scrub_min_interval', 'scrub_max_interval',
+                    'deep_scrub_interval', 'recovery_priority',
+                    'recovery_op_priority'):
             self.assert_valid_command(['osd', 'pool', 'get', 'poolname', var])
         assert_equal({}, validate_command(sigdict, ['osd', 'pool']))
         assert_equal({}, validate_command(sigdict, ['osd', 'pool',
@@ -1049,7 +1052,10 @@ class TestOSD(TestArgparse):
     def test_pool_set(self):
         for var in ('size', 'min_size', 'crash_replay_interval',
                     'pg_num', 'pgp_num', 'crush_ruleset',
-                    'hashpspool', 'auid', 'fast_read'):
+                    'hashpspool', 'auid', 'fast_read',
+                    'scrub_min_interval', 'scrub_max_interval',
+                    'deep_scrub_interval', 'recovery_priority',
+                    'recovery_op_priority'):
             self.assert_valid_command(['osd', 'pool',
                                        'set', 'poolname', var, 'value'])
         assert_equal({}, validate_command(sigdict, ['osd', 'pool',
diff --git a/src/test/rgw/test_rgw_manifest.cc b/src/test/rgw/test_rgw_manifest.cc
index 4fb8069..086e7d2 100644
--- a/src/test/rgw/test_rgw_manifest.cc
+++ b/src/test/rgw/test_rgw_manifest.cc
@@ -29,9 +29,9 @@
 #endif
 using namespace std;
 
-static void init_bucket(rgw_bucket *bucket, const char *name)
+static void init_bucket(rgw_bucket *bucket, const char *ten, const char *name)
 {
-  *bucket = rgw_bucket(name, ".data-pool", ".index-pool", "marker.", "bucket-id", NULL);
+  *bucket = rgw_bucket(ten, name, ".data-pool", ".index-pool", "marker.", "bucket-id", NULL);
 }
 
 void append_head(list<rgw_obj> *objs, rgw_obj& head)
@@ -62,7 +62,7 @@ static void gen_obj(uint64_t obj_size, uint64_t head_max_size, uint64_t stripe_s
 {
   manifest->set_trivial_rule(head_max_size, stripe_size);
 
-  init_bucket(bucket, "buck");
+  init_bucket(bucket, "", "buck");
 
   *head = rgw_obj(*bucket, "oid");
   gen->create_begin(g_ceph_context, manifest, *bucket, *head);
diff --git a/src/test/rgw/test_rgw_obj.cc b/src/test/rgw/test_rgw_obj.cc
index 18696a6..79159ca 100644
--- a/src/test/rgw/test_rgw_obj.cc
+++ b/src/test/rgw/test_rgw_obj.cc
@@ -32,7 +32,7 @@ using namespace std;
 
 static void init_bucket(rgw_bucket *bucket, const char *name)
 {
-  *bucket = rgw_bucket(name, ".data-pool", ".index-pool", "marker", "bucket-id", NULL);
+  *bucket = rgw_bucket("", name, ".data-pool", ".index-pool", "marker", "bucket-id", NULL);
 }
 
 void check_parsed_correctly(rgw_obj& obj, const string& name, const string& ns, const string& instance)
diff --git a/src/test/test_stress_watch.cc b/src/test/test_stress_watch.cc
index 9e66f0e..6ddfee5 100644
--- a/src/test/test_stress_watch.cc
+++ b/src/test/test_stress_watch.cc
@@ -18,7 +18,6 @@
 
 
 using namespace librados;
-using ceph::buffer;
 using std::map;
 using std::ostringstream;
 using std::string;
diff --git a/src/test/test_subprocess.cc b/src/test/test_subprocess.cc
index 725d2a6..a38669e 100644
--- a/src/test/test_subprocess.cc
+++ b/src/test/test_subprocess.cc
@@ -51,7 +51,7 @@ TEST(SubProcess, False)
 
 TEST(SubProcess, NotFound)
 {
-  SubProcess p("NOTEXISTENTBINARY", false, false, true);
+  SubProcess p("NOTEXISTENTBINARY", SubProcess::CLOSE, SubProcess::CLOSE, SubProcess::PIPE);
   ASSERT_EQ(p.spawn(), 0);
   std::string buf;
   ASSERT_TRUE(read_from_fd(p.get_stderr(), buf));
@@ -63,7 +63,7 @@ TEST(SubProcess, NotFound)
 
 TEST(SubProcess, Echo)
 {
-  SubProcess echo("echo", false, true);
+  SubProcess echo("echo", SubProcess::CLOSE, SubProcess::PIPE);
   echo.add_cmd_args("1", "2", "3", NULL);
 
   ASSERT_EQ(echo.spawn(), 0);
@@ -77,7 +77,7 @@ TEST(SubProcess, Echo)
 
 TEST(SubProcess, Cat)
 {
-  SubProcess cat("cat", true, true, true);
+  SubProcess cat("cat", SubProcess::PIPE, SubProcess::PIPE, SubProcess::PIPE);
 
   ASSERT_EQ(cat.spawn(), 0);
   std::string msg("to my, trociny!");
@@ -96,7 +96,7 @@ TEST(SubProcess, Cat)
 
 TEST(SubProcess, CatDevNull)
 {
-  SubProcess cat("cat", true, true, true);
+  SubProcess cat("cat", SubProcess::PIPE, SubProcess::PIPE, SubProcess::PIPE);
   cat.add_cmd_arg("/dev/null");
 
   ASSERT_EQ(cat.spawn(), 0);
@@ -111,7 +111,7 @@ TEST(SubProcess, CatDevNull)
 
 TEST(SubProcess, Killed)
 {
-  SubProcessTimed cat("cat", true, true);
+  SubProcessTimed cat("cat", SubProcess::PIPE, SubProcess::PIPE);
 
   ASSERT_EQ(cat.spawn(), 0);
   cat.kill();
@@ -122,7 +122,7 @@ TEST(SubProcess, Killed)
 
 TEST(SubProcess, CatWithArgs)
 {
-  SubProcess cat("cat", true, true, true);
+  SubProcess cat("cat", SubProcess::PIPE, SubProcess::PIPE, SubProcess::PIPE);
   cat.add_cmd_args("/dev/stdin", "/dev/null", "/NOTEXIST", NULL);
 
   ASSERT_EQ(cat.spawn(), 0);
@@ -144,7 +144,7 @@ TEST(SubProcess, CatWithArgs)
 
 TEST(SubProcess, Subshell)
 {
-  SubProcess sh("/bin/sh", true, true, true);
+  SubProcess sh("/bin/sh", SubProcess::PIPE, SubProcess::PIPE, SubProcess::PIPE);
   sh.add_cmd_args("-c",
       "sleep 0; "
       "cat; "
@@ -169,7 +169,7 @@ TEST(SubProcess, Subshell)
 
 TEST(SubProcessTimed, True)
 {
-  SubProcessTimed p("true", false, false, false, 10);
+  SubProcessTimed p("true", SubProcess::CLOSE, SubProcess::CLOSE, SubProcess::CLOSE, 10);
   ASSERT_EQ(p.spawn(), 0);
   ASSERT_EQ(p.join(), 0);
   ASSERT_TRUE(p.err()[0] == '\0');
@@ -177,7 +177,7 @@ TEST(SubProcessTimed, True)
 
 TEST(SubProcessTimed, SleepNoTimeout)
 {
-  SubProcessTimed sleep("sleep", false, false, false, 0);
+  SubProcessTimed sleep("sleep", SubProcess::CLOSE, SubProcess::CLOSE, SubProcess::CLOSE, 0);
   sleep.add_cmd_arg("1");
 
   ASSERT_EQ(sleep.spawn(), 0);
@@ -187,7 +187,7 @@ TEST(SubProcessTimed, SleepNoTimeout)
 
 TEST(SubProcessTimed, Killed)
 {
-  SubProcessTimed cat("cat", true, true, true, 5);
+  SubProcessTimed cat("cat", SubProcess::PIPE, SubProcess::PIPE, SubProcess::PIPE, 5);
 
   ASSERT_EQ(cat.spawn(), 0);
   cat.kill();
@@ -203,7 +203,7 @@ TEST(SubProcessTimed, Killed)
 
 TEST(SubProcessTimed, SleepTimedout)
 {
-  SubProcessTimed sleep("sleep", false, false, true, 1);
+  SubProcessTimed sleep("sleep", SubProcess::CLOSE, SubProcess::CLOSE, SubProcess::PIPE, 1);
   sleep.add_cmd_arg("10");
 
   ASSERT_EQ(sleep.spawn(), 0);
@@ -218,7 +218,7 @@ TEST(SubProcessTimed, SleepTimedout)
 
 TEST(SubProcessTimed, SubshellNoTimeout)
 {
-  SubProcessTimed sh("/bin/sh", true, true, true, 0);
+  SubProcessTimed sh("/bin/sh", SubProcess::PIPE, SubProcess::PIPE, SubProcess::PIPE, 0);
   sh.add_cmd_args("-c", "cat >&2", NULL);
   ASSERT_EQ(sh.spawn(), 0);
   std::string msg("the quick brown fox jumps over the lazy dog");
@@ -238,7 +238,7 @@ TEST(SubProcessTimed, SubshellNoTimeout)
 
 TEST(SubProcessTimed, SubshellKilled)
 {
-  SubProcessTimed sh("/bin/sh", true, true, true, 10);
+  SubProcessTimed sh("/bin/sh", SubProcess::PIPE, SubProcess::PIPE, SubProcess::PIPE, 10);
   sh.add_cmd_args("-c", "sh -c cat", NULL);
   ASSERT_EQ(sh.spawn(), 0);
   std::string msg("etaoin shrdlu");
@@ -255,7 +255,7 @@ TEST(SubProcessTimed, SubshellKilled)
 
 TEST(SubProcessTimed, SubshellTimedout)
 {
-  SubProcessTimed sh("/bin/sh", true, true, true, 1, SIGTERM);
+  SubProcessTimed sh("/bin/sh", SubProcess::PIPE, SubProcess::PIPE, SubProcess::PIPE, 1, SIGTERM);
   sh.add_cmd_args("-c", "sleep 1000& cat; NEVER REACHED", NULL);
   ASSERT_EQ(sh.spawn(), 0);
   std::string buf;
diff --git a/src/tools/Makefile-client.am b/src/tools/Makefile-client.am
index 1764eac..72d1058 100644
--- a/src/tools/Makefile-client.am
+++ b/src/tools/Makefile-client.am
@@ -45,10 +45,13 @@ rbd_SOURCES = \
 	tools/rbd/action/Import.cc \
 	tools/rbd/action/ImportDiff.cc \
 	tools/rbd/action/Info.cc \
+	tools/rbd/action/Journal.cc \
 	tools/rbd/action/Kernel.cc \
+	tools/rbd/action/Nbd.cc \
 	tools/rbd/action/List.cc \
 	tools/rbd/action/Lock.cc \
 	tools/rbd/action/MergeDiff.cc \
+	tools/rbd/action/MirrorPool.cc \
 	tools/rbd/action/ObjectMap.cc \
 	tools/rbd/action/Remove.cc \
 	tools/rbd/action/Rename.cc \
@@ -63,10 +66,17 @@ noinst_HEADERS += \
 	tools/rbd/Shell.h \
 	tools/rbd/Utils.h
 rbd_LDADD = \
-	$(LIBKRBD) $(LIBRBD) $(LIBRADOS) $(CEPH_GLOBAL) \
+	libjournal.la libcls_journal_client.la \
+	$(LIBKRBD) $(LIBRBD) $(LIBRBD_TYPES) $(LIBRADOS) $(CEPH_GLOBAL) \
 	$(BOOST_REGEX_LIBS) $(BOOST_PROGRAM_OPTIONS_LIBS)
 if LINUX
 bin_PROGRAMS += rbd
+
+rbd_nbd_SOURCES = tools/rbd_nbd/rbd-nbd.cc
+rbd_nbd_CXXFLAGS = $(AM_CXXFLAGS)
+rbd_nbd_LDADD = $(LIBRBD) $(LIBRADOS) $(CEPH_GLOBAL) $(BOOST_REGEX_LIBS)
+bin_PROGRAMS += rbd-nbd
+
 endif # LINUX
 
 endif # WITH_RBD
diff --git a/src/tools/ceph_kvstore_tool.cc b/src/tools/ceph_kvstore_tool.cc
index 202c5df..61d545e 100644
--- a/src/tools/ceph_kvstore_tool.cc
+++ b/src/tools/ceph_kvstore_tool.cc
@@ -291,13 +291,16 @@ int main(int argc, const char *argv[])
 
     if (argc >= 7) {
       string subcmd(argv[6]);
-      string out(argv[7]);
-
       if (subcmd != "out") {
         std::cerr << "unrecognized subcmd '" << subcmd << "'"
                   << std::endl;
         return 1;
       }
+      if (argc < 8) {
+        std::cerr << "output path not specified" << std::endl;
+        return 1;
+      }
+      string out(argv[7]);
 
       if (out.empty()) {
         std::cerr << "unspecified out file" << std::endl;
diff --git a/src/tools/ceph_monstore_tool.cc b/src/tools/ceph_monstore_tool.cc
index c979bcd..83ad9ce 100644
--- a/src/tools/ceph_monstore_tool.cc
+++ b/src/tools/ceph_monstore_tool.cc
@@ -24,6 +24,10 @@
 #include "include/stringify.h"
 #include "mon/MonitorDBStore.h"
 #include "mon/Paxos.h"
+#include "mon/MonMap.h"
+#include "mds/MDSMap.h"
+#include "osd/OSDMap.h"
+#include "crush/CrushCompiler.h"
 
 namespace po = boost::program_options;
 using namespace std;
@@ -194,6 +198,8 @@ void usage(const char *n, po::options_description &d)
   << "                                  (default: last committed)\n"
   << "  get mdsmap [-- options]         get mdsmap (version VER if specified)\n"
   << "                                  (default: last committed)\n"
+  << "  get crushmap [-- options]       get crushmap (version VER if specified)\n"
+  << "                                  (default: last committed)\n"
   << "  dump-keys                       dumps store keys to FILE\n"
   << "                                  (default: stdout)\n"
   << "  dump-paxos [-- options]         dump paxos transactions\n"
@@ -551,6 +557,7 @@ int main(int argc, char **argv) {
   } else if (cmd == "get") {
     unsigned v = 0;
     string outpath;
+    bool readable = false;
     string map_type;
     // visible options for this command
     po::options_description op_desc("Allowed 'get' options");
@@ -560,6 +567,8 @@ int main(int argc, char **argv) {
        "output file (default: stdout)")
       ("version,v", po::value<unsigned>(&v),
        "map version to obtain")
+      ("readable,r", po::value<bool>(&readable)->default_value(false),
+       "print the map infomation in human readable format")
       ;
     // this is going to be a positional argument; we don't want to show
     // it as an option during --help, but we do want to have it captured
@@ -587,7 +596,11 @@ int main(int argc, char **argv) {
     }
 
     if (v == 0) {
-      v = st.get(map_type, "last_committed");
+      if (map_type == "crushmap") {
+        v = st.get("osdmap", "last_committed");
+      } else {
+        v = st.get(map_type, "last_committed");
+      }
     }
 
     int fd = STDOUT_FILENO;
@@ -612,6 +625,18 @@ int main(int argc, char **argv) {
     r = 0;
     if (map_type == "osdmap") {
       r = st.get(map_type, st.combine_strings("full", v), bl);
+    } else if (map_type == "crushmap") {
+      // The crush map is taken from the full osdmap of the requested
+      // version: read that osdmap, decode it, re-encode only its crush part.
+      bufferlist tmp;
+      r = st.get("osdmap", st.combine_strings("full", v), tmp);
+      if (r >= 0) {
+        // Decode only after a successful read; on failure r is left for
+        // the error handling that follows this if/else chain.
+        OSDMap osdmap;
+        osdmap.decode(tmp);
+        osdmap.crush->encode(bl);
+      }
     } else {
       r = st.get(map_type, v, bl);
     }
@@ -620,7 +645,43 @@ int main(int argc, char **argv) {
       err = EINVAL;
       goto done;
     }
-    bl.write_fd(fd);
+
+    if (readable) {
+      // Decode the fetched map and write its printed form rather than the
+      // encoded bytes.
+      stringstream ss;
+      bufferlist out;
+      if (map_type == "monmap") {
+        MonMap monmap;
+        monmap.decode(bl);
+        monmap.print(ss);
+      } else if (map_type == "osdmap") {
+        OSDMap osdmap;
+        osdmap.decode(bl);
+        osdmap.print(ss);
+      } else if (map_type == "mdsmap") {
+        MDSMap mdsmap;
+        mdsmap.decode(bl);
+        mdsmap.print(ss);
+      } else if (map_type == "crushmap") {
+        CrushWrapper cw;
+        bufferlist::iterator it = bl.begin();
+        cw.decode(it);
+        CrushCompiler cc(cw, std::cerr, 0);
+        cc.decompile(ss);
+      } else {
+        std::cerr << "This type of readable map does not exist: " << map_type << std::endl
+                  << "You can only specify[osdmap|monmap|mdsmap|crushmap]" << std::endl;
+        // Nothing was rendered: fail here rather than write empty output
+        // and then print the "wrote ..." message below.
+        err = EINVAL;
+        goto done;
+      }
+      out.append(ss);
+      out.write_fd(fd);
+    } else {
+      bl.write_fd(fd);
+    }
 
     if (!outpath.empty()) {
       std::cout << "wrote " << map_type
diff --git a/src/tools/ceph_objectstore_tool.cc b/src/tools/ceph_objectstore_tool.cc
index 31ea7d4..23870cf 100644
--- a/src/tools/ceph_objectstore_tool.cc
+++ b/src/tools/ceph_objectstore_tool.cc
@@ -503,7 +503,7 @@ int write_info(ObjectStore::Transaction &t, epoch_t epoch, pg_info_t &info,
     info, coll,
     past_intervals,
     pgmeta_oid,
-    true);
+    true, true);
   if (ret) cerr << "Failed to write info" << std::endl;
   t.omap_setkeys(coll, pgmeta_oid, km);
   return ret;
@@ -518,7 +518,7 @@ int write_pg(ObjectStore::Transaction &t, epoch_t epoch, pg_info_t &info,
     return ret;
   coll_t coll(info.pgid);
   map<string,bufferlist> km;
-  PGLog::write_log(t, &km, log, coll, info.pgid.make_pgmeta_oid(), divergent_priors);
+  PGLog::write_log(t, &km, log, coll, info.pgid.make_pgmeta_oid(), divergent_priors, true);
   t.omap_setkeys(coll, info.pgid.make_pgmeta_oid(), km);
   return 0;
 }
diff --git a/src/tools/cephfs/DataScan.cc b/src/tools/cephfs/DataScan.cc
index ce083ce..965ab52 100644
--- a/src/tools/cephfs/DataScan.cc
+++ b/src/tools/cephfs/DataScan.cc
@@ -37,6 +37,8 @@ void DataScan::usage()
     << "    --force-corrupt: overrite apparently corrupt structures\n"
     << "    --force-init: write root inodes even if they exist\n"
     << "    --force-pool: use data pool even if it is not in MDSMap\n"
+    << "\n"
+    << "  cephfs-data-scan scan_frags [--force-corrupt]\n"
     << std::endl;
 
   generic_client_usage();
@@ -52,7 +54,7 @@ bool DataScan::parse_kwarg(
   }
 
   const std::string arg(*i);
-  const std::string val(*(++i));
+  const std::string val(*(i + 1));
 
   if (arg == std::string("--output-dir")) {
     if (driver != NULL) {
@@ -60,6 +62,7 @@ bool DataScan::parse_kwarg(
       *r = -EINVAL;
       return false;
     }
+    dout(4) << "Using local file output to '" << val << "'" << dendl;
     driver = new LocalFileDriver(val, data_io);
     return true;
   } else if (arg == std::string("-n")) {
@@ -80,6 +83,10 @@ bool DataScan::parse_kwarg(
       return false;
     }
     return true;
+  } else if (arg == std::string("--filter-tag")) {
+    filter_tag = val;
+    dout(10) << "Applying tag filter: '" << filter_tag << "'" << dendl;
+    return true;
   } else {
     return false;
   }
@@ -156,6 +163,7 @@ int DataScan::main(const std::vector<const char*> &args)
     driver = new MetadataDriver();
     driver->set_force_corrupt(force_corrupt);
     driver->set_force_init(force_init);
+    dout(4) << "Using metadata pool output" << dendl;
   }
 
   dout(4) << "connecting to RADOS..." << dendl;
@@ -199,11 +207,31 @@ int DataScan::main(const std::vector<const char*> &args)
     }
   }
 
+  if (command == "scan_frags") {
+    int const metadata_pool_id = mdsmap->get_metadata_pool();
+
+    dout(4) << "resolving metadata pool " << metadata_pool_id << dendl;
+    std::string metadata_pool_name;
+    int r = rados.pool_reverse_lookup(metadata_pool_id, &metadata_pool_name);
+    if (r < 0) {
+      std::cerr << "Pool " << metadata_pool_id
+        << " identified in MDS map not found in RADOS!" << std::endl;
+      return r;
+    }
+
+    r = rados.ioctx_create(metadata_pool_name.c_str(), metadata_io);
+    if (r != 0) {
+      return r;
+    }
+  }
+
   // Finally, dispatch command
   if (command == "scan_inodes") {
     return scan_inodes();
   } else if (command == "scan_extents") {
     return scan_extents();
+  } else if (command == "scan_frags") {
+    return scan_frags();
   } else if (command == "init") {
     return driver->init_roots(mdsmap->get_first_data_pool());
   } else {
@@ -447,7 +475,28 @@ int DataScan::scan_inodes()
   float progress = 0.0;
   librados::NObjectIterator i = data_io.nobjects_begin(n, m);
 #else
-  librados::NObjectIterator i = data_io.nobjects_begin();
+  librados::NObjectIterator i;
+  bool legacy_filtering = false;
+
+  bufferlist filter_bl;
+  ClsCephFSClient::build_tag_filter(filter_tag, &filter_bl);
+
+  // try/catch to deal with older OSDs that don't support
+  // the cephfs pgls filtering mode
+  try {
+    i = data_io.nobjects_begin(filter_bl);
+    dout(4) << "OSDs accepted cephfs object filtering" << dendl;
+  } catch (const std::runtime_error &e) {
+    // A little unfriendly, librados raises std::runtime_error
+    // on pretty much any unhandled I/O return value, such as
+    // the OSD saying -EINVAL because of our use of a filter
+    // mode that it doesn't know about.
+    std::cerr << "OSDs do not support cephfs object filtering: using "
+                 "(slower) fallback mode" << std::endl;
+    legacy_filtering = true;
+    i = data_io.nobjects_begin();
+  }
+
 #endif
   librados::NObjectIterator i_end = data_io.nobjects_end();
 
@@ -484,10 +533,38 @@ int DataScan::scan_inodes()
       continue;
     }
 
-    // We are only interested in 0th objects during this phase: we touched
-    // the other objects during scan_extents
-    if (obj_name_offset != 0) {
-      continue;
+    if (legacy_filtering) {
+      dout(20) << "Applying filter to " << oid << dendl;
+
+      // We are only interested in 0th objects during this phase: we touched
+      // the other objects during scan_extents
+      if (obj_name_offset != 0) {
+        dout(20) << "Non-zeroth object" << dendl;
+        continue;
+      }
+
+      bufferlist scrub_tag_bl;
+      int r = data_io.getxattr(oid, "scrub_tag", scrub_tag_bl);
+      if (r >= 0) {
+        std::string read_tag;
+        bufferlist::iterator q = scrub_tag_bl.begin();
+	try {
+	  ::decode(read_tag, q);
+	  if (read_tag == filter_tag) {
+	    dout(20) << "skipping " << oid << " because it has the filter_tag"
+		     << dendl;
+	    continue;
+	  }
+	} catch (const buffer::error &err) {
+	}
+	dout(20) << "read non-matching tag '" << read_tag << "'" << dendl;
+      } else {
+        dout(20) << "no tag read (" << r << ")" << dendl;
+      }
+
+    } else {
+      assert(obj_name_offset == 0);
+      dout(20) << "OSD matched oid " << oid << dendl;
     }
 
     AccumulateResult accum_res;
@@ -495,8 +572,12 @@ int DataScan::scan_inodes()
     ceph_file_layout loaded_layout = g_default_file_layout;
     int r = ClsCephFSClient::fetch_inode_accumulate_result(
         data_io, oid, &backtrace, &loaded_layout, &accum_res);
-    
-    if (r < 0) {
+
+    if (r == -EINVAL) {
+      dout(4) << "Accumulated metadata missing from '"
+              << oid << ", did you run scan_extents?" << dendl;
+      continue;
+    } else  if (r < 0) {
       dout(4) << "Unexpected error loading accumulated metadata from '"
               << oid << "': " << cpp_strerror(r) << dendl;
       // FIXME: this creates situation where if a client has a corrupt
@@ -508,7 +589,6 @@ int DataScan::scan_inodes()
 
     const time_t file_mtime = accum_res.max_mtime;
     uint64_t file_size = 0;
-    uint32_t chunk_size = g_default_file_layout.fl_object_size;
     bool have_backtrace = !(backtrace.ancestors.empty());
 
     // This is the layout we will use for injection, populated either
@@ -516,8 +596,9 @@ int DataScan::scan_inodes()
     ceph_file_layout guessed_layout;
     guessed_layout.fl_pg_pool = data_pool_id;
 
-    // Calculate file_size, guess chunk_size
+    // Calculate file_size, guess the layout
     if (accum_res.ceiling_obj_index > 0) {
+      uint32_t chunk_size = g_default_file_layout.fl_object_size;
       // When there are multiple objects, the largest object probably
       // indicates the chunk size.  But not necessarily, because files
       // can be sparse.  Only make this assumption if size seen
@@ -531,9 +612,10 @@ int DataScan::scan_inodes()
         guessed_layout.fl_object_size = chunk_size;
         guessed_layout.fl_stripe_unit = chunk_size;
         guessed_layout.fl_stripe_count = 1;
-      } else if (loaded_layout.fl_object_size < accum_res.max_obj_size) {
+      } else if (!ceph_file_layout_is_valid(&loaded_layout) ||
+          loaded_layout.fl_object_size < accum_res.max_obj_size) {
         // If the max size seen exceeds what the stashed layout claims, then
-        // disbelieve it.  Guess instead.
+        // disbelieve it.  Guess instead.  Same for invalid layouts on disk.
         dout(4) << "bogus xattr layout on 0x" << std::hex << obj_name_ino
                 << std::dec << ", ignoring in favour of best guess" << dendl;
         guessed_layout.fl_object_size = chunk_size;
@@ -616,8 +698,215 @@ int DataScan::scan_inodes()
       }
     } else {
       file_size = accum_res.ceiling_obj_size;
+      if (loaded_layout.fl_pg_pool == uint32_t(-1)
+          || loaded_layout.fl_object_size < accum_res.max_obj_size) {
+        // No layout loaded, or inconsistent layout, use default
+        guessed_layout = g_default_file_layout;
+        guessed_layout.fl_pg_pool = data_pool_id;
+      } else {
+        guessed_layout = loaded_layout;
+      }
+    }
+
+    // Santity checking backtrace ino against object name
+    if (have_backtrace && backtrace.ino != obj_name_ino) {
+      dout(4) << "Backtrace ino 0x" << std::hex << backtrace.ino
+        << " doesn't match object name ino 0x" << obj_name_ino
+        << std::dec << dendl;
+      have_backtrace = false;
+    }
+
+    InodeStore dentry;
+    build_file_dentry(obj_name_ino, file_size, file_mtime, guessed_layout, &dentry);
+
+    // Inject inode to the metadata pool
+    if (have_backtrace) {
+      inode_backpointer_t root_bp = *(backtrace.ancestors.rbegin());
+      if (MDS_INO_IS_MDSDIR(root_bp.dirino)) {
+        /* Special case for strays: even if we have a good backtrace,
+         * don't put it in the stray dir, because while that would technically
+         * give it linkage it would still be invisible to the user */
+        r = driver->inject_lost_and_found(obj_name_ino, dentry);
+        if (r < 0) {
+          dout(4) << "Error injecting 0x" << std::hex << backtrace.ino
+            << std::dec << " into lost+found: " << cpp_strerror(r) << dendl;
+          if (r == -EINVAL) {
+            dout(4) << "Use --force-corrupt to overwrite structures that "
+                       "appear to be corrupt" << dendl;
+          }
+        }
+      } else {
+        /* Happy case: we will inject a named dentry for this inode */
+        r = driver->inject_with_backtrace(backtrace, dentry);
+        if (r < 0) {
+          dout(4) << "Error injecting 0x" << std::hex << backtrace.ino
+            << std::dec << " with backtrace: " << cpp_strerror(r) << dendl;
+          if (r == -EINVAL) {
+            dout(4) << "Use --force-corrupt to overwrite structures that "
+                       "appear to be corrupt" << dendl;
+          }
+        }
+      }
+    } else {
+      /* Backtrace-less case: we will inject a lost+found dentry */
+      r = driver->inject_lost_and_found(
+          obj_name_ino, dentry);
+      if (r < 0) {
+        dout(4) << "Error injecting 0x" << std::hex << obj_name_ino
+          << std::dec << " into lost+found: " << cpp_strerror(r) << dendl;
+        if (r == -EINVAL) {
+          dout(4) << "Use --force-corrupt to overwrite structures that "
+                     "appear to be corrupt" << dendl;
+        }
+      }
+    }
+  }
+
+  return 0;
+}
+
+int DataScan::scan_frags()
+{
+  librados::NObjectIterator i;
+  bool legacy_filtering = false;
+
+  bufferlist filter_bl;
+  ClsCephFSClient::build_tag_filter(filter_tag, &filter_bl);
+
+  // try/catch to deal with older OSDs that don't support
+  // the cephfs pgls filtering mode
+  try {
+    i = metadata_io.nobjects_begin(filter_bl);
+    dout(4) << "OSDs accepted cephfs object filtering" << dendl;
+  } catch (const std::runtime_error &e) {
+    // A little unfriendly, librados raises std::runtime_error
+    // on pretty much any unhandled I/O return value, such as
+    // the OSD saying -EINVAL because of our use of a filter
+    // mode that it doesn't know about.
+    std::cerr << "OSDs do not support cephfs object filtering: using "
+                 "(slower) fallback mode" << std::endl;
+    legacy_filtering = true;
+    i = metadata_io.nobjects_begin();
+  }
+
+  librados::NObjectIterator i_end = metadata_io.nobjects_end();
+
+  bool roots_present;
+  int r = driver->check_roots(&roots_present);
+  if (r != 0) {
+    derr << "Unexpected error checking roots: '"
+      << cpp_strerror(r) << "'" << dendl;
+    return r;
+  }
+
+  if (!roots_present) {  // flag filled in by driver->check_roots() above
+    std::cerr << "Some or all system inodes are absent.  Run 'init' from "
+      "one node before running 'scan_frags'" << std::endl;
+    return -EIO;
+  }
+
+  for (; i != i_end; ++i) {
+    const std::string oid = i->get_oid();
+    uint64_t obj_name_ino = 0;
+    uint64_t obj_name_offset = 0;
+    r = parse_oid(oid, &obj_name_ino, &obj_name_offset);
+    if (r != 0) {
+      dout(4) << "Bad object name '" << oid << "', skipping" << dendl;
+      continue;
+    }
+
+    if (obj_name_ino < (1ULL << 40)) {
+      // FIXME: we're skipping stray dirs here: if they're
+      // orphaned then we should be resetting them some other
+      // way
+      dout(10) << "Skipping system ino " << obj_name_ino << dendl;
+      continue;
+    }
+
+    if (legacy_filtering) {
+      dout(20) << "Applying filter to " << oid << dendl;
+
+      // We are only interested in 0th objects during this phase: we touched
+      // the other objects during scan_extents
+      if (obj_name_offset != 0) {
+        dout(20) << "Non-zeroth object" << dendl;
+        continue;
+      }
+
+      bufferlist scrub_tag_bl;
+      int r = metadata_io.getxattr(oid, "scrub_tag", scrub_tag_bl);
+      if (r >= 0) {
+        std::string read_tag;
+        bufferlist::iterator q = scrub_tag_bl.begin();
+        ::decode(read_tag, q);
+        if (read_tag == filter_tag) {
+          dout(20) << "skipping " << oid << " because it has the filter_tag"
+                   << dendl;
+          continue;
+        } else {
+          dout(20) << "read non-matching tag '" << read_tag << "'" << dendl;
+        }
+      } else {
+        dout(20) << "no tag read (" << r << ")" << dendl;
+      }
+
+    } else {
+      assert(obj_name_offset == 0);
+      dout(20) << "OSD matched oid " << oid << dendl;
+    }
+
+    AccumulateResult accum_res;
+    inode_backtrace_t backtrace;
+
+    // Default to inherit layout (i.e. no explicit layout on dir) which is
+    // expressed as a zeroed layout struct (see inode_t::has_layout)
+    ceph_file_layout loaded_layout;
+    memset(&loaded_layout, 0, sizeof(loaded_layout));
+
+    int parent_r = 0;
+    bufferlist parent_bl;
+    int layout_r = 0;
+    bufferlist layout_bl;
+    bufferlist op_bl;
+
+    librados::ObjectReadOperation op;  // fetch both xattrs in one compound read
+    op.getxattr("parent", &parent_bl, &parent_r);
+    op.getxattr("layout", &layout_bl, &layout_r);
+    int r = metadata_io.operate(oid, &op, &op_bl);
+    if (r != 0 && r != -ENODATA) {  // -ENODATA is handled per xattr below
+      derr << "Unexpected error reading backtrace: " << cpp_strerror(r) << dendl;
+      continue;
+    }
+
+    if (parent_r != -ENODATA) {
+      try {
+        bufferlist::iterator q = parent_bl.begin();
+        backtrace.decode(q);
+      } catch (buffer::error &e) {
+        dout(4) << "Corrupt backtrace on '" << oid << "': " << e << dendl;
+        if (!force_corrupt) {
+          continue;
+        } else {
+          // Treat backtrace as absent: we'll inject into lost+found
+          backtrace = inode_backtrace_t();
+        }
+      }
+    }
+
+    if (layout_r != -ENODATA) {
+      try {
+        bufferlist::iterator q = layout_bl.begin();
+        ::decode(loaded_layout, q);
+      } catch (buffer::error &e) {
+        dout(4) << "Corrupt layout on '" << oid << "': " << e << dendl;
+        if (!force_corrupt) {
+          continue;
+        }
+      }
     }
 
+    bool have_backtrace = !(backtrace.ancestors.empty());
+
     // Santity checking backtrace ino against object name
     if (have_backtrace && backtrace.ino != obj_name_ino) {
       dout(4) << "Backtrace ino 0x" << std::hex << backtrace.ino
@@ -626,6 +915,24 @@ int DataScan::scan_inodes()
       have_backtrace = false;
     }
 
+    uint64_t fnode_version = 0;
+    fnode_t fnode;
+    r = read_fnode(obj_name_ino, frag_t(), &fnode, &fnode_version);
+    if (r == -EINVAL) {
+      derr << "Corrupt fnode on " << oid << dendl;
+      if (force_corrupt) {
+        fnode.fragstat.mtime = 0;
+        fnode.fragstat.nfiles = 1;
+        fnode.fragstat.nsubdirs = 0;
+      } else {
+        continue;
+      }
+    }
+
+    InodeStore dentry;
+    build_dir_dentry(obj_name_ino, fnode.fragstat.nfiles,
+        fnode.fragstat.mtime, loaded_layout, &dentry);
+
     // Inject inode to the metadata pool
     if (have_backtrace) {
       inode_backpointer_t root_bp = *(backtrace.ancestors.rbegin());
@@ -633,8 +940,7 @@ int DataScan::scan_inodes()
         /* Special case for strays: even if we have a good backtrace,
          * don't put it in the stray dir, because while that would technically
          * give it linkage it would still be invisible to the user */
-        r = driver->inject_lost_and_found(
-            obj_name_ino, file_size, file_mtime, guessed_layout);
+        r = driver->inject_lost_and_found(obj_name_ino, dentry);
         if (r < 0) {
           dout(4) << "Error injecting 0x" << std::hex << backtrace.ino
             << std::dec << " into lost+found: " << cpp_strerror(r) << dendl;
@@ -645,8 +951,7 @@ int DataScan::scan_inodes()
         }
       } else {
         /* Happy case: we will inject a named dentry for this inode */
-        r = driver->inject_with_backtrace(
-            backtrace, file_size, file_mtime, guessed_layout);
+        r = driver->inject_with_backtrace(backtrace, dentry);
         if (r < 0) {
           dout(4) << "Error injecting 0x" << std::hex << backtrace.ino
             << std::dec << " with backtrace: " << cpp_strerror(r) << dendl;
@@ -659,7 +964,7 @@ int DataScan::scan_inodes()
     } else {
       /* Backtrace-less case: we will inject a lost+found dentry */
       r = driver->inject_lost_and_found(
-          obj_name_ino, file_size, file_mtime, guessed_layout);
+          obj_name_ino, dentry);
       if (r < 0) {
         dout(4) << "Error injecting 0x" << std::hex << obj_name_ino
           << std::dec << " into lost+found: " << cpp_strerror(r) << dendl;
@@ -674,7 +979,7 @@ int DataScan::scan_inodes()
   return 0;
 }
 
-int MetadataDriver::read_fnode(
+int MetadataTool::read_fnode(
     inodeno_t ino, frag_t frag, fnode_t *fnode,
     uint64_t *last_version)
 {
@@ -698,7 +1003,7 @@ int MetadataDriver::read_fnode(
   return 0;
 }
 
-int MetadataDriver::read_dentry(inodeno_t parent_ino, frag_t frag,
+int MetadataTool::read_dentry(inodeno_t parent_ino, frag_t frag,
                 const std::string &dname, InodeStore *inode)
 {
   assert(inode != NULL);
@@ -749,8 +1054,8 @@ int MetadataDriver::read_dentry(inodeno_t parent_ino, frag_t frag,
   return 0;
 }
 
-int MetadataDriver::inject_lost_and_found(inodeno_t ino, uint64_t file_size,
-    time_t file_mtime, const ceph_file_layout &layout)
+int MetadataDriver::inject_lost_and_found(
+    inodeno_t ino, const InodeStore &dentry)
 {
   // Create lost+found if doesn't exist
   bool created = false;
@@ -764,19 +1069,16 @@ int MetadataDriver::inject_lost_and_found(inodeno_t ino, uint64_t file_size,
     if (r == -EINVAL && !force_corrupt) {
       return r;
     }
-    // Inject dentry
-    lf_ino.inode.mode = 0755 | S_IFDIR;
-    // Set nfiles to something non-zero, to fool any other code
-    // that tries to ignore 'empty' directories.  This won't be
-    // accurate, but it should avoid functional issues.
-    lf_ino.inode.dirstat.nfiles = 1;
-    lf_ino.inode.size = 1;
-    lf_ino.inode.nlink = 1;
-    lf_ino.inode.ino = CEPH_INO_LOST_AND_FOUND;
-    lf_ino.inode.version = 1;
-    lf_ino.inode.backtrace_version = 1;
-    lf_ino.inode.uid = g_conf->mds_root_ino_uid;
-    lf_ino.inode.gid = g_conf->mds_root_ino_gid;
+
+    // To have a directory not specify a layout, give it zeros (see
+    // inode_t::has_layout)
+    ceph_file_layout inherit_layout;
+    memset(&inherit_layout, 0, sizeof(inherit_layout));
+
+    // Construct LF inode
+    build_dir_dentry(CEPH_INO_LOST_AND_FOUND, 1, 0, inherit_layout, &lf_ino);
+
+    // Inject link to LF inode in the root dir
     r = inject_linkage(CEPH_INO_ROOT, "lost+found", frag_t(), lf_ino);
     if (r < 0) {
       return r;
@@ -796,31 +1098,12 @@ int MetadataDriver::inject_lost_and_found(inodeno_t ino, uint64_t file_size,
   }
 
   InodeStore recovered_ino;
-  recovered_ino.inode.mode = 0500 | S_IFREG;
-  recovered_ino.inode.size = file_size;
-  recovered_ino.inode.max_size_ever = file_size;
-  recovered_ino.inode.mtime.tv.tv_sec = file_mtime;
-  recovered_ino.inode.atime.tv.tv_sec = file_mtime;
-  recovered_ino.inode.ctime.tv.tv_sec = file_mtime;
-
-  recovered_ino.inode.layout = layout;
 
-  recovered_ino.inode.truncate_seq = 1;
-  recovered_ino.inode.truncate_size = -1ull;
-
-  recovered_ino.inode.inline_data.version = CEPH_INLINE_NONE;
-
-  recovered_ino.inode.nlink = 1;
-  recovered_ino.inode.ino = ino;
-  recovered_ino.inode.version = 1;
-  recovered_ino.inode.backtrace_version = 1;
-  recovered_ino.inode.uid = g_conf->mds_root_ino_uid;
-  recovered_ino.inode.gid = g_conf->mds_root_ino_gid;
 
   const std::string dname = lost_found_dname(ino);
 
   // Write dentry into lost+found dirfrag
-  return inject_linkage(lf_ino.inode.ino, dname, frag_t(), recovered_ino);
+  return inject_linkage(lf_ino.inode.ino, dname, frag_t(), dentry);
 }
 
 
@@ -932,8 +1215,7 @@ int MetadataDriver::get_frag_of(
 
 
 int MetadataDriver::inject_with_backtrace(
-    const inode_backtrace_t &backtrace, uint64_t file_size, time_t file_mtime,
-    const ceph_file_layout &layout)
+    const inode_backtrace_t &backtrace, const InodeStore &dentry)
     
 {
 
@@ -1042,53 +1324,42 @@ int MetadataDriver::inject_with_backtrace(
           << dname << " already exists but points to 0x"
           << std::hex << existing_dentry.inode.ino << std::dec << dendl;
         // Fall back to lost+found!
-        return inject_lost_and_found(backtrace.ino, file_size, file_mtime,
-            layout);
+        return inject_lost_and_found(backtrace.ino, dentry);
       }
     }
 
     // Inject linkage
     // ==============
+
     if (write_dentry) {
-      InodeStore dentry;
       if (i == backtrace.ancestors.begin()) {
-        // This is the linkage for a file
-        dentry.inode.mode = 0500 | S_IFREG;
+        // This is the linkage for the file of interest
         dout(10) << "Linking inode 0x" << std::hex << ino
           << " at 0x" << parent_ino << "/" << dname << std::dec
-          << " with size=" << file_size << " bytes" << dendl;
+          << " with size=" << dentry.inode.size << " bytes" << dendl;
 
-        // The file size and mtime we learned by scanning globally
-        dentry.inode.size = file_size;
-        dentry.inode.max_size_ever = file_size;
-        dentry.inode.mtime.tv.tv_sec = file_mtime;
-        dentry.inode.atime.tv.tv_sec = file_mtime;
-        dentry.inode.ctime.tv.tv_sec = file_mtime;
-
-        dentry.inode.layout = layout;
-
-        dentry.inode.truncate_seq = 1;
-        dentry.inode.truncate_size = -1ull;
-
-        dentry.inode.inline_data.version = CEPH_INLINE_NONE;
+        r = inject_linkage(parent_ino, dname, fragment, dentry);
       } else {
-        // This is the linkage for a directory
-        dentry.inode.mode = 0755 | S_IFDIR;
+        // This is the linkage for an ancestor directory
+        InodeStore ancestor_dentry;
+        ancestor_dentry.inode.mode = 0755 | S_IFDIR;
 
         // Set nfiles to something non-zero, to fool any other code
         // that tries to ignore 'empty' directories.  This won't be
         // accurate, but it should avoid functional issues.
-        dentry.inode.dirstat.nfiles = 1;
-        dentry.inode.size = 1;
 
+        ancestor_dentry.inode.dirstat.nfiles = 1;
+        ancestor_dentry.inode.size = 1;
+
+        ancestor_dentry.inode.nlink = 1;
+        ancestor_dentry.inode.ino = ino;
+        ancestor_dentry.inode.uid = g_conf->mds_root_ino_uid;
+        ancestor_dentry.inode.gid = g_conf->mds_root_ino_gid;
+        ancestor_dentry.inode.version = 1;
+        ancestor_dentry.inode.backtrace_version = 1;
+        r = inject_linkage(parent_ino, dname, fragment, ancestor_dentry);
       }
-      dentry.inode.nlink = 1;
-      dentry.inode.ino = ino;
-      dentry.inode.uid = g_conf->mds_root_ino_uid;
-      dentry.inode.gid = g_conf->mds_root_ino_gid;
-      dentry.inode.version = 1;
-      dentry.inode.backtrace_version = 1;
-      r = inject_linkage(parent_ino, dname, fragment, dentry);
+
       if (r < 0) {
         return r;
       }
@@ -1287,9 +1558,7 @@ int LocalFileDriver::inject_data(
 
 int LocalFileDriver::inject_with_backtrace(
     const inode_backtrace_t &bt,
-    uint64_t size,
-    time_t mtime,
-    const ceph_file_layout &layout)
+    const InodeStore &dentry)
 {
   std::string path_builder = path;
 
@@ -1307,7 +1576,7 @@ int LocalFileDriver::inject_with_backtrace(
     if (is_file) {
       // FIXME: inject_data won't cope with interesting (i.e. striped)
       // layouts (need a librados-compatible Filer to read these)
-      inject_data(path_builder, size, layout.fl_object_size, bt.ino);
+      inject_data(path_builder, dentry.inode.size, dentry.inode.layout.fl_object_size, bt.ino);
     } else {
       int r = mkdir(path_builder.c_str(), 0755);
       if (r != 0 && r != -EPERM) {
@@ -1323,9 +1592,7 @@ int LocalFileDriver::inject_with_backtrace(
 
 int LocalFileDriver::inject_lost_and_found(
     inodeno_t ino,
-    uint64_t size,
-    time_t mtime,
-    const ceph_file_layout &layout)
+    const InodeStore &dentry)
 {
   std::string lf_path = path + "/lost+found";
   int r = mkdir(lf_path.c_str(), 0755);
@@ -1336,7 +1603,7 @@ int LocalFileDriver::inject_lost_and_found(
   }
   
   std::string file_path = lf_path + "/" + lost_found_dname(ino);
-  return inject_data(file_path, size, layout.fl_object_size, ino);
+  return inject_data(file_path, dentry.inode.size, dentry.inode.layout.fl_object_size, ino);
 }
 
 int LocalFileDriver::init_roots(int64_t data_pool_id)
@@ -1374,3 +1641,60 @@ int LocalFileDriver::check_roots(bool *result)
   return 0;
 }
 
+void MetadataTool::build_file_dentry(
+    inodeno_t ino, uint64_t file_size, time_t file_mtime,
+    const ceph_file_layout &layout, InodeStore *out)
+{
+  assert(out != NULL);
+
+  out->inode.mode = 0500 | S_IFREG;
+  out->inode.size = file_size;
+  out->inode.max_size_ever = file_size;
+  out->inode.mtime.tv.tv_sec = file_mtime;
+  out->inode.atime.tv.tv_sec = file_mtime;
+  out->inode.ctime.tv.tv_sec = file_mtime;
+
+  out->inode.layout = layout;
+
+  out->inode.truncate_seq = 1;
+  out->inode.truncate_size = -1ull;
+
+  out->inode.inline_data.version = CEPH_INLINE_NONE;
+
+  out->inode.nlink = 1;
+  out->inode.ino = ino;
+  out->inode.version = 1;
+  out->inode.backtrace_version = 1;
+  out->inode.uid = g_conf->mds_root_ino_uid;
+  out->inode.gid = g_conf->mds_root_ino_gid;
+}
+
+void MetadataTool::build_dir_dentry(
+    inodeno_t ino, uint64_t nfiles,
+    time_t mtime, const ceph_file_layout &layout, InodeStore *out)
+{
+  assert(out != NULL);
+
+  out->inode.mode = 0755 | S_IFDIR;
+  out->inode.size = nfiles;
+  out->inode.dirstat.nfiles = nfiles;
+  out->inode.max_size_ever = nfiles;
+  out->inode.mtime.tv.tv_sec = mtime;
+  out->inode.atime.tv.tv_sec = mtime;
+  out->inode.ctime.tv.tv_sec = mtime;
+
+  out->inode.layout = layout;
+
+  out->inode.truncate_seq = 1;
+  out->inode.truncate_size = -1ull;
+
+  out->inode.inline_data.version = CEPH_INLINE_NONE;
+
+  out->inode.nlink = 1;
+  out->inode.ino = ino;
+  out->inode.version = 1;
+  out->inode.backtrace_version = 1;
+  out->inode.uid = g_conf->mds_root_ino_uid;
+  out->inode.gid = g_conf->mds_root_ino_gid;
+}
+
diff --git a/src/tools/cephfs/DataScan.h b/src/tools/cephfs/DataScan.h
index 252e6e3..cc9e39e 100644
--- a/src/tools/cephfs/DataScan.h
+++ b/src/tools/cephfs/DataScan.h
@@ -47,9 +47,7 @@ class RecoveryDriver {
      */
     virtual int inject_with_backtrace(
         const inode_backtrace_t &bt,
-        uint64_t size,
-        time_t mtime,
-        const ceph_file_layout &layout) = 0;
+        const InodeStore &dentry) = 0;
 
     /**
      * Inject an inode + dentry into the lost+found directory,
@@ -57,9 +55,7 @@ class RecoveryDriver {
      */
     virtual int inject_lost_and_found(
         inodeno_t ino,
-        uint64_t size,
-        time_t mtime,
-        const ceph_file_layout &layout) = 0;
+        const InodeStore &dentry) = 0;
 
     /**
      * Create any missing roots (i.e. mydir, strays, root inode)
@@ -120,15 +116,11 @@ class LocalFileDriver : public RecoveryDriver
 
     int inject_with_backtrace(
         const inode_backtrace_t &bt,
-        uint64_t size,
-        time_t mtime,
-        ceph_file_layout const &layout);
+        const InodeStore &dentry);
 
     int inject_lost_and_found(
         inodeno_t ino,
-        uint64_t size,
-        time_t mtime,
-        ceph_file_layout const &layout);
+        const InodeStore &dentry);
 
     int init_roots(int64_t data_pool_id);
 
@@ -136,14 +128,51 @@ class LocalFileDriver : public RecoveryDriver
 };
 
 /**
- * A class that knows how to manipulate CephFS metadata pools
+ * A class that knows how to work with objects in a CephFS
+ * metadata pool.
  */
-class MetadataDriver : public RecoveryDriver
+class MetadataTool
 {
   protected:
 
-    librados::IoCtx metadata_io;
+  librados::IoCtx metadata_io;
+
+  /**
+   * Construct a synthetic InodeStore for a normal file
+   */
+  void build_file_dentry(
+    inodeno_t ino, uint64_t file_size, time_t file_mtime,
+    const ceph_file_layout &layout,
+    InodeStore *out);
+
+  /**
+   * Construct a synthetic InodeStore for a directory
+   */
+  void build_dir_dentry(
+    inodeno_t ino, uint64_t nfiles,
+    time_t mtime,
+    const ceph_file_layout &layout,
+    InodeStore *out);
+
+  /**
+   * Try and read an fnode from a dirfrag
+   */
+  int read_fnode(inodeno_t ino, frag_t frag,
+                 fnode_t *fnode, uint64_t *read_version);
+
+  /**
+   * Try and read a dentry from a dirfrag
+   */
+  int read_dentry(inodeno_t parent_ino, frag_t frag,
+                  const std::string &dname, InodeStore *inode);
+};
 
+/**
+ * A class that knows how to manipulate CephFS metadata pools
+ */
+class MetadataDriver : public RecoveryDriver, public MetadataTool
+{
+  protected:
     /**
      * Create a .inode object, i.e. root or mydir
      */
@@ -154,19 +183,6 @@ class MetadataDriver : public RecoveryDriver
      * trying to go ahead and inject metadata.
      */
     int root_exists(inodeno_t ino, bool *result);
-
-    /**
-     * Try and read an fnode from a dirfrag
-     */
-    int read_fnode(inodeno_t ino, frag_t frag,
-                   fnode_t *fnode, uint64_t *read_version);
-
-    /**
-     * Try and read a dentry from a dirfrag
-     */
-    int read_dentry(inodeno_t parent_ino, frag_t frag,
-                    const std::string &dname, InodeStore *inode);
-
     int find_or_create_dirfrag(
         inodeno_t ino,
         frag_t fragment,
@@ -193,27 +209,23 @@ class MetadataDriver : public RecoveryDriver
 
     int inject_with_backtrace(
         const inode_backtrace_t &bt,
-        uint64_t size,
-        time_t mtime,
-        ceph_file_layout const &layout);
+        const InodeStore &dentry);
 
     int inject_lost_and_found(
         inodeno_t ino,
-        uint64_t size,
-        time_t mtime,
-        ceph_file_layout const &layout);
+        const InodeStore &dentry);
 
     int init_roots(int64_t data_pool_id);
 
     int check_roots(bool *result);
 };
 
-class DataScan : public MDSUtility
+class DataScan : public MDSUtility, public MetadataTool
 {
   protected:
     RecoveryDriver *driver;
 
-    // IoCtx for data pool (where we scrape backtraces from)
+    // IoCtx for data pool (where we scrape file backtraces from)
     librados::IoCtx data_io;
     // Remember the data pool ID for use in layouts
     int64_t data_pool_id;
@@ -231,12 +243,20 @@ class DataScan : public MDSUtility
      */
     int scan_extents();
 
+    /**
+     * Scan metadata pool for 0th dirfrags to link orphaned
+     * directory inodes.
+     */
+    int scan_frags();
+
     // Accept pools which are not in the MDSMap
     bool force_pool;
     // Respond to decode errors by overwriting
     bool force_corrupt;
     // Overwrite root objects even if they exist
     bool force_init;
+    // Only scan inodes without this scrub tag
+    string filter_tag;
 
     /**
      * @param r set to error on valid key with invalid value
@@ -254,13 +274,16 @@ class DataScan : public MDSUtility
       const std::vector<const char*> &arg,
       std::vector<const char *>::const_iterator &i);
 
+
+
   public:
     void usage();
     int main(const std::vector<const char *> &args);
 
     DataScan()
       : driver(NULL), data_pool_id(-1), n(0), m(1),
-        force_pool(false)
+        force_pool(false), force_corrupt(false),
+        force_init(false)
     {
     }
 
diff --git a/src/tools/cephfs/TableTool.cc b/src/tools/cephfs/TableTool.cc
index 99a0856..ebfcd0a 100644
--- a/src/tools/cephfs/TableTool.cc
+++ b/src/tools/cephfs/TableTool.cc
@@ -30,112 +30,20 @@ void TableTool::usage()
 {
   std::cout << "Usage: \n"
     << "  cephfs-table-tool <all|[mds rank]> <reset|show> <session|snap|inode>"
+    << "\n  cephfs-table-tool <all|[mds rank]> <take_inos> <max_ino>"
     << std::endl;
 
   generic_client_usage();
 }
 
 
-int TableTool::main(std::vector<const char*> &argv)
-{
-  int r;
-
-  dout(10) << __func__ << dendl;
-
-  // RADOS init
-  // ==========
-  r = rados.init_with_context(g_ceph_context);
-  if (r < 0) {
-    derr << "RADOS unavailable, cannot scan filesystem journal" << dendl;
-    return r;
-  }
-
-  dout(4) << "connecting to RADOS..." << dendl;
-  rados.connect();
- 
-  int const pool_id = mdsmap->get_metadata_pool();
-  dout(4) << "resolving pool " << pool_id << dendl;
-  std::string pool_name;
-  r = rados.pool_reverse_lookup(pool_id, &pool_name);
-  if (r < 0) {
-    derr << "Pool " << pool_id << " identified in MDS map not found in RADOS!" << dendl;
-    return r;
-  }
-
-  dout(4) << "creating IoCtx.." << dendl;
-  r = rados.ioctx_create(pool_name.c_str(), io);
-  assert(r == 0);
-
-  // Require at least 3 args <action> <table> <rank>
-  if (argv.size() < 3) {
-    usage();
-    return -EINVAL;
-  }
-
-  const std::string rank_str = std::string(argv[0]);
-  const std::string mode = std::string(argv[1]);
-  const std::string table = std::string(argv[2]);
-
-  if (rank_str == "all") {
-    rank = MDS_RANK_NONE;
-  } else {
-    std::string rank_err;
-    rank = strict_strtol(rank_str.c_str(), 10, &rank_err);
-    if (!rank_err.empty()) {
-      derr << "Bad rank '" << rank_str << "'" << dendl;
-      usage();
-    }
-  }
-
-  JSONFormatter jf(true);
-  if (mode == "reset") {
-    if (table == "session") {
-      r = apply_rank_fn(&TableTool::_reset_session_table, &jf);
-    } else if (table == "inode") {
-      r = apply_rank_fn(&TableTool::_reset_ino_table, &jf);
-    } else if (table == "snap") {
-      r = _reset_snap_table(&jf);
-    } else {
-      derr << "Invalid table '" << table << "'" << dendl;
-      usage();
-      return -EINVAL;
-    }
-  } else if (mode == "show") {
-    if (table == "session") {
-      r = apply_rank_fn(&TableTool::_show_session_table, &jf);
-    } else if (table == "inode") {
-      r = apply_rank_fn(&TableTool::_show_ino_table, &jf);
-    } else if (table == "snap") {
-      r = _show_snap_table(&jf);
-    } else {
-      derr << "Invalid table '" << table << "'" << dendl;
-      usage();
-      return -EINVAL;
-    }
-  } else {
-    derr << "Invalid mode '" << mode << "'" << dendl;
-    usage();
-    return -EINVAL;
-  }
-
-  // Subcommand should have written to formatter, flush it
-  jf.flush(std::cout);
-  std::cout << std::endl;
-  return r;
-}
-
-
-
-
-
-
 /**
  * For a function that takes an MDS rank as an argument and
  * returns an error code, execute it either on all ranks (if
  * this->rank is MDS_RANK_NONE), or on the rank specified
  * by this->rank.
  */
-int TableTool::apply_rank_fn(int (TableTool::*fptr) (mds_rank_t, Formatter*), Formatter *f)
+int TableTool::apply_rank_fn(std::function<int(mds_rank_t, Formatter *)> fptr, Formatter *f)
 {
   assert(f != NULL);
 
@@ -156,7 +64,7 @@ int TableTool::apply_rank_fn(int (TableTool::*fptr) (mds_rank_t, Formatter*), Fo
     f->open_object_section(rank_str.str().c_str());
 
     f->open_object_section("data");
-    int rank_r = (this->*fptr)(*rank_i, f);
+    int rank_r = fptr(*rank_i, f);
     f->close_section();
     r = r ? r : rank_r;
 
@@ -178,7 +86,7 @@ int TableTool::apply_rank_fn(int (TableTool::*fptr) (mds_rank_t, Formatter*), Fo
 template <typename A>
 class TableHandler
 {
-private:
+protected:
   // The RADOS object ID for the table
   std::string object_name;
 
@@ -238,10 +146,17 @@ public:
   int reset(librados::IoCtx *io)
   {
     A table_inst;
+    // Compose new (blank) table
     table_inst.set_rank(rank);
     table_inst.reset_state();
-    
-    // Compose new (blank) table
+    // Write the table out
+    return write(table_inst, io);
+  }
+
+protected:
+
+  int write(const A &table_inst, librados::IoCtx *io)
+  {
     bufferlist new_bl;
     if (mds_table) {
       version_t version = 1;
@@ -346,7 +261,6 @@ public:
     A table_inst;
     table_inst.set_rank(rank);
     table_inst.reset_state();
-
     bufferlist header_bl;
     table_inst.encode_header(&header_bl);
 
@@ -360,46 +274,147 @@ public:
   }
 };
 
-int TableTool::_show_session_table(mds_rank_t rank, Formatter *f)
+class InoTableHandler : public TableHandler<InoTable>
 {
-  return TableHandlerOmap<SessionMapStore>(rank, "sessionmap", false).load_and_dump(&io, f);
-}
+  public:
+  InoTableHandler(mds_rank_t r)
+    : TableHandler(r, "inotable", true)
+  {}
 
-int TableTool::_reset_session_table(mds_rank_t rank, Formatter *f)
-{
-  return TableHandlerOmap<SessionMapStore>(rank, "sessionmap", false).reset(&io);
-}
+  int take_inos(librados::IoCtx *io, inodeno_t max, Formatter *f)
+  {
+    InoTable inst;
+    inst.set_rank(rank);
+    inst.reset_state();
 
-int TableTool::_show_ino_table(mds_rank_t rank, Formatter *f)
-{
-  return TableHandler<InoTable>(rank, "inotable", true).load_and_dump(&io, f);;
-}
+    int r = 0;
+    if (inst.force_consume_to(max)) {
+      r = write(inst, io);
+    }
 
-int TableTool::_reset_ino_table(mds_rank_t rank, Formatter *f)
-{
-  return TableHandler<InoTable>(rank, "inotable", true).reset(&io);
-}
+    f->dump_int("version", inst.get_version());
+    inst.dump(f);
+
+    return r;
+  }
+};
 
-int TableTool::_show_snap_table(Formatter *f)
+
+int TableTool::main(std::vector<const char*> &argv)
 {
   int r;
 
-  f->open_object_section("show_snap_table");
-  {
-    r = TableHandler<SnapServer>(MDS_RANK_NONE, "snaptable", true).load_and_dump(&io, f);
-    f->dump_int("result", r);
+  dout(10) << __func__ << dendl;
+
+  // RADOS init
+  // ==========
+  r = rados.init_with_context(g_ceph_context);
+  if (r < 0) {
+    derr << "RADOS unavailable, cannot scan filesystem journal" << dendl;
+    return r;
   }
-  f->close_section();
 
-  return r;
-}
+  dout(4) << "connecting to RADOS..." << dendl;
+  rados.connect();
+ 
+  int const pool_id = mdsmap->get_metadata_pool();
+  dout(4) << "resolving pool " << pool_id << dendl;
+  std::string pool_name;
+  r = rados.pool_reverse_lookup(pool_id, &pool_name);
+  if (r < 0) {
+    derr << "Pool " << pool_id << " identified in MDS map not found in RADOS!" << dendl;
+    return r;
+  }
 
-int TableTool::_reset_snap_table(Formatter *f)
-{
-  int r = TableHandler<SnapServer>(MDS_RANK_NONE, "snaptable", true).reset(&io);
-  f->open_object_section("reset_snap_status");
-  f->dump_int("result", r);
-  f->close_section();
+  dout(4) << "creating IoCtx.." << dendl;
+  r = rados.ioctx_create(pool_name.c_str(), io);
+  assert(r == 0);
+
+  // Require at least 3 args <rank> <mode> <arg> [args...]
+  if (argv.size() < 3) {
+    usage();
+    return -EINVAL;
+  }
+
+  const std::string rank_str = std::string(argv[0]);
+  const std::string mode = std::string(argv[1]);
+
+  if (rank_str == "all") {
+    rank = MDS_RANK_NONE;
+  } else {
+    std::string rank_err;
+    rank = strict_strtol(rank_str.c_str(), 10, &rank_err);
+    if (!rank_err.empty()) {
+      derr << "Bad rank '" << rank_str << "'" << dendl;
+      usage();
+      return -EINVAL;  // do not run a table operation with an unparsed rank
+    }
+  }
+
+  JSONFormatter jf(true);
+  if (mode == "reset") {
+    const std::string table = std::string(argv[2]);
+    if (table == "session") {
+      r = apply_rank_fn([this](mds_rank_t rank, Formatter *f) -> int {
+            return TableHandlerOmap<SessionMapStore>(rank, "sessionmap", false).reset(&io);
+      }, &jf);
+    } else if (table == "inode") {
+      r = apply_rank_fn([this](mds_rank_t rank, Formatter *f) -> int {
+            return TableHandler<InoTable>(rank, "inotable", true).reset(&io);
+      }, &jf);
+    } else if (table == "snap") {
+      r = TableHandler<SnapServer>(MDS_RANK_NONE, "snaptable", true).reset(&io);
+      jf.open_object_section("reset_snap_status");
+      jf.dump_int("result", r);
+      jf.close_section();  // no early return: the status is flushed below
+    } else {
+      derr << "Invalid table '" << table << "'" << dendl;
+      usage();
+      return -EINVAL;
+    }
+  } else if (mode == "show") {
+    const std::string table = std::string(argv[2]);
+    if (table == "session") {
+      r = apply_rank_fn([this](mds_rank_t rank, Formatter *f) -> int {
+        return TableHandlerOmap<SessionMapStore>(rank, "sessionmap", false).load_and_dump(&io, f);
+      }, &jf);
+    } else if (table == "inode") {
+      r = apply_rank_fn([this](mds_rank_t rank, Formatter *f) -> int {
+        return TableHandler<InoTable>(rank, "inotable", true).load_and_dump(&io, f);;
+      }, &jf);
+    } else if (table == "snap") {
+      jf.open_object_section("show_snap_table");
+      {
+        r = TableHandler<SnapServer>(
+            MDS_RANK_NONE, "snaptable", true).load_and_dump(&io, &jf);
+        jf.dump_int("result", r);
+      }
+      jf.close_section();
+    } else {
+      derr << "Invalid table '" << table << "'" << dendl;
+      usage();
+      return -EINVAL;
+    }
+  } else if (mode == "take_inos") {
+    const std::string ino_str = std::string(argv[2]);
+    std::string ino_err;
+    inodeno_t ino = strict_strtoll(ino_str.c_str(), 10, &ino_err);
+    if (!ino_err.empty()) {
+      derr << "Bad ino '" << ino_str << "'" << dendl;
+      return -EINVAL;
+    }
+    r = apply_rank_fn([this, ino](mds_rank_t rank, Formatter *f) -> int {
+      return InoTableHandler(rank).take_inos(&io, ino, f);
+    }, &jf);
+  } else {
+    derr << "Invalid mode '" << mode << "'" << dendl;
+    usage();
+    return -EINVAL;
+  }
+
+  // Subcommand should have written to formatter, flush it
+  jf.flush(std::cout);
+  std::cout << std::endl;
   return r;
 }
 
diff --git a/src/tools/cephfs/TableTool.h b/src/tools/cephfs/TableTool.h
index 0f43a73..57705ef 100644
--- a/src/tools/cephfs/TableTool.h
+++ b/src/tools/cephfs/TableTool.h
@@ -30,16 +30,7 @@ class TableTool : public MDSUtility
     librados::Rados rados;
     librados::IoCtx io;
 
-    int apply_rank_fn(int (TableTool::*fptr) (mds_rank_t, Formatter *), Formatter *f);
-
-    int _reset_session_table(mds_rank_t rank, Formatter *f);
-    int _show_session_table(mds_rank_t rank, Formatter *f);
-
-    int _show_ino_table(mds_rank_t rank, Formatter *f);
-    int _reset_ino_table(mds_rank_t rank, Formatter *f);
-
-    int _show_snap_table(Formatter *f);
-    int _reset_snap_table(Formatter *f);
+    int apply_rank_fn(std::function<int(mds_rank_t, Formatter *)> fptr, Formatter *f);
 
   public:
     void usage();
diff --git a/src/tools/rados/RadosImport.h b/src/tools/rados/RadosImport.h
index 3ce3690..3a51663 100644
--- a/src/tools/rados/RadosImport.h
+++ b/src/tools/rados/RadosImport.h
@@ -18,7 +18,7 @@
 #include <string>
 
 #include "include/rados/librados.hpp"
-#include "include/buffer.h"
+#include "include/buffer_fwd.h"
 
 #include "tools/RadosDump.h"
 
diff --git a/src/tools/rados/rados.cc b/src/tools/rados/rados.cc
index 6f87c68..0506cfe 100644
--- a/src/tools/rados/rados.cc
+++ b/src/tools/rados/rados.cc
@@ -1638,9 +1638,9 @@ static int rados_tool_common(const std::map < std::string, std::string > &opts,
 
     if (wildcard)
       io_ctx.set_namespace(all_nspaces);
-    bool stdout = (nargs.size() < 2) || (strcmp(nargs[1], "-") == 0);
+    bool use_stdout = (nargs.size() < 2) || (strcmp(nargs[1], "-") == 0);
     ostream *outstream;
-    if(stdout)
+    if(use_stdout)
       outstream = &cout;
     else
       outstream = new ofstream(nargs[1]);
diff --git a/src/tools/rbd/ArgumentTypes.cc b/src/tools/rbd/ArgumentTypes.cc
index f18e88d..a403f01 100644
--- a/src/tools/rbd/ArgumentTypes.cc
+++ b/src/tools/rbd/ArgumentTypes.cc
@@ -124,6 +124,37 @@ void add_snap_option(po::options_description *opt,
     (name.c_str(), po::value<std::string>(), description.c_str());
 }
 
+void add_journal_option(po::options_description *opt,
+                      ArgumentModifier modifier,
+                      const std::string &desc_suffix) {
+  std::string name = JOURNAL_NAME;
+  std::string description = "journal name";
+  switch (modifier) {
+  case ARGUMENT_MODIFIER_NONE:
+    break;
+  case ARGUMENT_MODIFIER_SOURCE:
+    description = "source " + description;
+    break;
+  case ARGUMENT_MODIFIER_DEST:
+    name = DEST_JOURNAL_NAME;
+    description = "destination " + description;
+    break;
+  }
+  description += desc_suffix;
+
+  // TODO add validator
+  opt->add_options()
+    (name.c_str(), po::value<std::string>(), description.c_str());
+}
+
+void add_pool_options(boost::program_options::options_description *pos,
+                      boost::program_options::options_description *opt) {
+  pos->add_options()
+    ("pool-name", "pool name");
+  opt->add_options()
+    ((POOL_NAME + ",p").c_str(), po::value<std::string>(), "pool name");
+}
+
 void add_image_spec_options(po::options_description *pos,
                             po::options_description *opt,
                             ArgumentModifier modifier) {
@@ -159,6 +190,20 @@ void add_image_or_snap_spec_options(po::options_description *pos,
   add_snap_option(opt, modifier);
 }
 
+void add_journal_spec_options(po::options_description *pos,
+			      po::options_description *opt,
+			      ArgumentModifier modifier) {
+
+  pos->add_options()
+    ((get_name_prefix(modifier) + JOURNAL_SPEC).c_str(),
+     (get_description_prefix(modifier) + "journal specification\n" +
+      "(example: [<pool-name>/]<journal-name>)").c_str());
+  add_pool_option(opt, modifier);
+  add_image_option(opt, modifier);
+  add_journal_option(opt, modifier);
+}
+
+
 void add_create_image_options(po::options_description *opt,
                               bool include_format) {
   // TODO get default image format from conf
@@ -173,11 +218,25 @@ void add_create_image_options(po::options_description *opt,
   opt->add_options()
     (IMAGE_ORDER.c_str(), po::value<ImageOrder>(),
      "object order [12 <= order <= 25]")
+    (IMAGE_OBJECT_SIZE.c_str(), po::value<ImageObjectSize>(),
+     "object size in B/K/M [4K <= object size <= 32M]")
     (IMAGE_FEATURES.c_str(), po::value<ImageFeatures>()->composing(),
      ("image features\n" + get_short_features_help(true)).c_str())
     (IMAGE_SHARED.c_str(), po::bool_switch(), "shared image")
-    (IMAGE_STRIPE_UNIT.c_str(), po::value<uint32_t>(), "stripe unit")
-    (IMAGE_STRIPE_COUNT.c_str(), po::value<uint32_t>(), "stripe count");
+    (IMAGE_STRIPE_UNIT.c_str(), po::value<uint64_t>(), "stripe unit")
+    (IMAGE_STRIPE_COUNT.c_str(), po::value<uint64_t>(), "stripe count");
+
+  add_create_journal_options(opt);
+}
+
+void add_create_journal_options(po::options_description *opt) {
+  opt->add_options()
+    (JOURNAL_SPLAY_WIDTH.c_str(), po::value<uint64_t>(),
+     "number of active journal objects")
+    (JOURNAL_OBJECT_SIZE.c_str(), po::value<JournalObjectSize>(),
+     "size of journal objects")
+    (JOURNAL_POOL.c_str(), po::value<std::string>(),
+     "pool for journal objects");
 }
 
 void add_size_option(boost::program_options::options_description *opt) {
@@ -207,6 +266,16 @@ void add_format_options(boost::program_options::options_description *opt) {
      "pretty formatting (json and xml)");
 }
 
+void add_verbose_option(boost::program_options::options_description *opt) {
+  opt->add_options()
+    (VERBOSE.c_str(), po::bool_switch(), "be verbose");
+}
+
+void add_no_error_option(boost::program_options::options_description *opt) {
+  opt->add_options()
+    (NO_ERROR.c_str(), po::bool_switch(), "continue after error");
+}
+
 std::string get_short_features_help(bool append_suffix) {
   std::ostringstream oss;
   bool first_feature = true;
@@ -268,7 +337,7 @@ void validate(boost::any& v, const std::vector<std::string>& values,
   po::validators::check_first_occurrence(v);
   const std::string &s = po::validators::get_single_string(values);
   try {
-    uint32_t order = boost::lexical_cast<uint32_t>(s);
+    uint64_t order = boost::lexical_cast<uint64_t>(s);
     if (order >= 12 && order <= 25) {
       v = boost::any(order);
       return;
@@ -279,6 +348,19 @@ void validate(boost::any& v, const std::vector<std::string>& values,
 }
 
 void validate(boost::any& v, const std::vector<std::string>& values,
+              ImageObjectSize *target_type, int dummy) {
+  po::validators::check_first_occurrence(v);
+  const std::string &s = po::validators::get_single_string(values);
+  
+  std::string parse_error;
+  uint64_t objectsize = strict_sistrtoll(s.c_str(), &parse_error);
+  if (!parse_error.empty()) {
+    throw po::validation_error(po::validation_error::invalid_option_value);
+  }
+  v = boost::any(objectsize);
+}
+
+void validate(boost::any& v, const std::vector<std::string>& values,
               ImageFormat *target_type, int dummy) {
   po::validators::check_first_occurrence(v);
   const std::string &s = po::validators::get_single_string(values);
@@ -338,5 +420,19 @@ void validate(boost::any& v, const std::vector<std::string>& values,
   }
 }
 
+void validate(boost::any& v, const std::vector<std::string>& values,
+              JournalObjectSize *target_type, int) {
+  po::validators::check_first_occurrence(v);
+  const std::string &s = po::validators::get_single_string(values);
+
+  std::string parse_error;
+  uint64_t size = strict_sistrtoll(s.c_str(), &parse_error);
+  if (parse_error.empty() && (size >= (1 << 12))) {
+    v = boost::any(size);
+    return;
+  }
+  throw po::validation_error(po::validation_error::invalid_option_value);
+}
+
 } // namespace argument_types
 } // namespace rbd
diff --git a/src/tools/rbd/ArgumentTypes.h b/src/tools/rbd/ArgumentTypes.h
index 47ad55f..8313cf0 100644
--- a/src/tools/rbd/ArgumentTypes.h
+++ b/src/tools/rbd/ArgumentTypes.h
@@ -40,14 +40,18 @@ static const std::string POSITIONAL_ARGUMENTS("positional-arguments");
 static const std::string IMAGE_SPEC("image-spec");
 static const std::string SNAPSHOT_SPEC("snap-spec");
 static const std::string IMAGE_OR_SNAPSHOT_SPEC("image-or-snap-spec");
+static const std::string JOURNAL_SPEC("journal-spec");
 static const std::string PATH_NAME("path-name");
 
 // optional arguments
+static const std::string CONFIG_PATH("conf");
 static const std::string POOL_NAME("pool");
 static const std::string DEST_POOL_NAME("dest-pool");
 static const std::string IMAGE_NAME("image");
 static const std::string DEST_IMAGE_NAME("dest");
 static const std::string SNAPSHOT_NAME("snap");
+static const std::string JOURNAL_NAME("journal");
+static const std::string DEST_JOURNAL_NAME("dest-journal");
 static const std::string PATH("path");
 static const std::string FROM_SNAPSHOT_NAME("from-snap");
 static const std::string WHOLE_OBJECT("whole-object");
@@ -55,21 +59,29 @@ static const std::string WHOLE_OBJECT("whole-object");
 static const std::string IMAGE_FORMAT("image-format");
 static const std::string IMAGE_NEW_FORMAT("new-format");
 static const std::string IMAGE_ORDER("order");
+static const std::string IMAGE_OBJECT_SIZE("object-size");
 static const std::string IMAGE_FEATURES("image-feature");
 static const std::string IMAGE_SHARED("image-shared");
 static const std::string IMAGE_SIZE("size");
 static const std::string IMAGE_STRIPE_UNIT("stripe-unit");
 static const std::string IMAGE_STRIPE_COUNT("stripe-count");
 
+static const std::string JOURNAL_OBJECT_SIZE("journal-object-size");
+static const std::string JOURNAL_SPLAY_WIDTH("journal-splay-width");
+static const std::string JOURNAL_POOL("journal-pool");
+
 static const std::string NO_PROGRESS("no-progress");
 static const std::string FORMAT("format");
 static const std::string PRETTY_FORMAT("pretty-format");
+static const std::string VERBOSE("verbose");
+static const std::string NO_ERROR("no-error");
 
 static const std::set<std::string> SWITCH_ARGUMENTS = {
-  WHOLE_OBJECT, NO_PROGRESS, PRETTY_FORMAT};
+  WHOLE_OBJECT, NO_PROGRESS, PRETTY_FORMAT, VERBOSE, NO_ERROR};
 
 struct ImageSize {};
 struct ImageOrder {};
+struct ImageObjectSize {};
 struct ImageFormat {};
 struct ImageNewFormat {};
 
@@ -93,6 +105,8 @@ struct Format : public TypedValue<std::string> {
   Formatter create_formatter(bool pretty) const;
 };
 
+struct JournalObjectSize {};
+
 std::string get_name_prefix(ArgumentModifier modifier);
 std::string get_description_prefix(ArgumentModifier modifier);
 
@@ -107,6 +121,13 @@ void add_image_option(boost::program_options::options_description *opt,
 void add_snap_option(boost::program_options::options_description *opt,
                      ArgumentModifier modifier);
 
+void add_journal_option(boost::program_options::options_description *opt,
+                      ArgumentModifier modifier,
+                      const std::string &desc_suffix = "");
+
+void add_pool_options(boost::program_options::options_description *pos,
+                      boost::program_options::options_description *opt);
+
 void add_image_spec_options(boost::program_options::options_description *pos,
                             boost::program_options::options_description *opt,
                             ArgumentModifier modifier);
@@ -120,9 +141,17 @@ void add_image_or_snap_spec_options(
   boost::program_options::options_description *opt,
   ArgumentModifier modifier);
 
+void add_journal_spec_options(
+  boost::program_options::options_description *pos,
+  boost::program_options::options_description *opt,
+  ArgumentModifier modifier);
+
 void add_create_image_options(boost::program_options::options_description *opt,
                               bool include_format);
 
+void add_create_journal_options(
+  boost::program_options::options_description *opt);
+
 void add_size_option(boost::program_options::options_description *opt);
 
 void add_path_options(boost::program_options::options_description *pos,
@@ -133,6 +162,10 @@ void add_no_progress_option(boost::program_options::options_description *opt);
 
 void add_format_options(boost::program_options::options_description *opt);
 
+void add_verbose_option(boost::program_options::options_description *opt);
+
+void add_no_error_option(boost::program_options::options_description *opt);
+
 std::string get_short_features_help(bool append_suffix);
 std::string get_long_features_help();
 
@@ -141,6 +174,8 @@ void validate(boost::any& v, const std::vector<std::string>& values,
 void validate(boost::any& v, const std::vector<std::string>& values,
               ImageOrder *target_type, int);
 void validate(boost::any& v, const std::vector<std::string>& values,
+              ImageObjectSize *target_type, int);
+void validate(boost::any& v, const std::vector<std::string>& values,
               ImageFormat *target_type, int);
 void validate(boost::any& v, const std::vector<std::string>& values,
               ImageNewFormat *target_type, int);
@@ -148,6 +183,8 @@ void validate(boost::any& v, const std::vector<std::string>& values,
               ImageFeatures *target_type, int);
 void validate(boost::any& v, const std::vector<std::string>& values,
               Format *target_type, int);
+void validate(boost::any& v, const std::vector<std::string>& values,
+              JournalObjectSize *target_type, int);
 
 std::ostream &operator<<(std::ostream &os, const ImageFeatures &features);
 
diff --git a/src/tools/rbd/Shell.cc b/src/tools/rbd/Shell.cc
index 3e2987b..57af89a 100644
--- a/src/tools/rbd/Shell.cc
+++ b/src/tools/rbd/Shell.cc
@@ -234,7 +234,7 @@ Shell::Action *Shell::find_action(const CommandSpec &command_spec,
 
 void Shell::get_global_options(po::options_description *opts) {
   opts->add_options()
-    ("conf,c", po::value<std::string>(), "path to cluster configuration")
+    ((at::CONFIG_PATH + ",c").c_str(), po::value<std::string>(), "path to cluster configuration")
     ("cluster", po::value<std::string>(), "cluster name")
     ("id", po::value<std::string>(), "client id (without 'client.' prefix)")
     ("user", po::value<std::string>(), "client id (without 'client.' prefix)")
diff --git a/src/tools/rbd/Utils.cc b/src/tools/rbd/Utils.cc
index 02af9ef..3f6cc2a 100644
--- a/src/tools/rbd/Utils.cc
+++ b/src/tools/rbd/Utils.cc
@@ -114,6 +114,24 @@ std::string get_positional_argument(const po::variables_map &vm, size_t index) {
   return "";
 }
 
+std::string get_pool_name(const po::variables_map &vm,
+                          size_t *arg_index) {
+  std::string pool_name;
+  if (vm.count(at::POOL_NAME)) {
+    pool_name = vm[at::POOL_NAME].as<std::string>();
+  } else {
+    pool_name = get_positional_argument(vm, *arg_index);
+    if (!pool_name.empty()) {
+       ++(*arg_index);
+    }
+  }
+
+  if (pool_name.empty()) {
+    pool_name = at::DEFAULT_POOL_NAME;
+  }
+  return pool_name;
+}
+
 int get_pool_image_snapshot_names(const po::variables_map &vm,
                                   at::ArgumentModifier mod,
                                   size_t *spec_arg_index,
@@ -178,6 +196,97 @@ int get_pool_image_snapshot_names(const po::variables_map &vm,
   return 0;
 }
 
+int get_pool_journal_names(const po::variables_map &vm,
+			   at::ArgumentModifier mod,
+			   size_t *spec_arg_index,
+			   std::string *pool_name,
+			   std::string *journal_name) {
+  std::string pool_key = (mod == at::ARGUMENT_MODIFIER_DEST ?
+    at::DEST_POOL_NAME : at::POOL_NAME);
+  std::string image_key = (mod == at::ARGUMENT_MODIFIER_DEST ?
+    at::DEST_IMAGE_NAME : at::IMAGE_NAME);
+  std::string journal_key = (mod == at::ARGUMENT_MODIFIER_DEST ?
+    at::DEST_JOURNAL_NAME : at::JOURNAL_NAME);
+
+  if (vm.count(pool_key) && pool_name != nullptr) {
+    *pool_name = vm[pool_key].as<std::string>();
+  }
+  if (vm.count(journal_key) && journal_name != nullptr) {
+    *journal_name = vm[journal_key].as<std::string>();
+  }
+
+  std::string image_name;
+  if (vm.count(image_key)) {
+    image_name = vm[image_key].as<std::string>();
+  }
+
+  if (journal_name != nullptr && !journal_name->empty()) {
+    // despite the separate pool option,
+    // we can also specify them via the journal option
+    std::string journal_name_copy(*journal_name);
+    extract_spec(journal_name_copy, pool_name, journal_name, nullptr);
+  }
+
+  if (!image_name.empty()) {
+    // despite the separate pool option,
+    // we can also specify them via the image option
+    std::string image_name_copy(image_name);
+    extract_spec(image_name_copy, pool_name, &image_name, nullptr);
+  }
+
+  int r;
+  if (journal_name != nullptr && spec_arg_index != nullptr &&
+      journal_name->empty()) {
+    std::string spec = get_positional_argument(vm, (*spec_arg_index)++);
+    if (!spec.empty()) {
+      r = extract_spec(spec, pool_name, journal_name, nullptr);
+      if (r < 0) {
+        return r;
+      }
+    }
+  }
+
+  if (pool_name->empty()) {
+    *pool_name = at::DEFAULT_POOL_NAME;
+  }
+
+  if (journal_name != nullptr && journal_name->empty() && !image_name.empty()) {
+    // Try to get journal name from image info.
+    librados::Rados rados;
+    librados::IoCtx io_ctx;
+    librbd::Image image;
+    int r = init_and_open_image(*pool_name, image_name, "", true,
+				  &rados, &io_ctx, &image);
+    if (r < 0) {
+      std::cerr << "rbd: failed to open image " << image_name
+		<< " to get journal name: " << cpp_strerror(r) << std::endl;
+      return r;
+    }
+
+    uint64_t features;
+    r = image.features(&features);
+    if (r < 0) {
+      return r;
+    }
+    if ((features & RBD_FEATURE_JOURNALING) == 0) {
+      std::cerr << "rbd: journaling is not enabled for image " << image_name
+		<< std::endl;
+      return -EINVAL;
+    }
+    *journal_name = image_id(image);
+  }
+
+  if (journal_name != nullptr && journal_name->empty()) {
+    std::string prefix = at::get_description_prefix(mod);
+    std::cerr << "rbd: "
+              << (mod == at::ARGUMENT_MODIFIER_DEST ? prefix : std::string())
+              << "journal was not specified" << std::endl;
+    return -EINVAL;
+  }
+
+  return 0;
+}
+
 int validate_snapshot_name(at::ArgumentModifier mod,
                            const std::string &snap_name,
                            SnapshotPresence snapshot_presence) {
@@ -207,89 +316,143 @@ int validate_snapshot_name(at::ArgumentModifier mod,
 }
 
 int get_image_options(const boost::program_options::variables_map &vm,
-                      int *order, uint32_t *format, uint64_t *features,
-                      uint32_t *stripe_unit, uint32_t *stripe_count) {
+		      bool get_format, librbd::ImageOptions *opts) {
+  uint64_t order, features, stripe_unit, stripe_count, object_size;
+  bool features_specified = false;
+
   if (vm.count(at::IMAGE_ORDER)) {
-    *order = vm[at::IMAGE_ORDER].as<uint32_t>();
+    order = vm[at::IMAGE_ORDER].as<uint64_t>();
+    std::cerr << "rbd: --order is deprecated, use --object-size"
+	      << std::endl;
+  } else if (vm.count(at::IMAGE_OBJECT_SIZE)) {
+    object_size = vm[at::IMAGE_OBJECT_SIZE].as<uint64_t>();
+    order = std::round(std::log2(object_size)); 
   } else {
-    *order = 22;
+    order = 22;
   }
 
-  bool features_specified = false;
   if (vm.count(at::IMAGE_FEATURES)) {
-    *features = vm[at::IMAGE_FEATURES].as<uint64_t>();
+    features = vm[at::IMAGE_FEATURES].as<uint64_t>();
     features_specified = true;
   } else {
-    *features = g_conf->rbd_default_features;
+    features = g_conf->rbd_default_features;
   }
 
   if (vm.count(at::IMAGE_STRIPE_UNIT)) {
-    *stripe_unit = vm[at::IMAGE_STRIPE_UNIT].as<uint32_t>();
+    stripe_unit = vm[at::IMAGE_STRIPE_UNIT].as<uint64_t>();
   } else {
-    *stripe_unit = g_conf->rbd_default_stripe_unit;
+    stripe_unit = g_conf->rbd_default_stripe_unit;
   }
 
   if (vm.count(at::IMAGE_STRIPE_COUNT)) {
-    *stripe_count = vm[at::IMAGE_STRIPE_COUNT].as<uint32_t>();
+    stripe_count = vm[at::IMAGE_STRIPE_COUNT].as<uint64_t>();
   } else {
-    *stripe_count = g_conf->rbd_default_stripe_count;
+    stripe_count = g_conf->rbd_default_stripe_count;
   }
 
-  if ((*stripe_unit != 0 && *stripe_count == 0) ||
-      (*stripe_unit == 0 && *stripe_count != 0)) {
+  if ((stripe_unit != 0 && stripe_count == 0) ||
+      (stripe_unit == 0 && stripe_count != 0)) {
     std::cerr << "must specify both (or neither) of stripe-unit and stripe-count"
               << std::endl;
     return -EINVAL;
-  } else if ((*stripe_unit || *stripe_count) &&
-             (*stripe_unit != (1ll << *order) && *stripe_count != 1)) {
-    *features |= RBD_FEATURE_STRIPINGV2;
+  } else if ((stripe_unit || stripe_count) &&
+             (stripe_unit != (1ull << order) && stripe_count != 1)) {
+    features |= RBD_FEATURE_STRIPINGV2;
   } else {
-    *features &= ~RBD_FEATURE_STRIPINGV2;
+    features &= ~RBD_FEATURE_STRIPINGV2;
   }
 
   if (vm.count(at::IMAGE_SHARED) && vm[at::IMAGE_SHARED].as<bool>()) {
-    *features &= ~RBD_FEATURES_SINGLE_CLIENT;
+    features &= ~RBD_FEATURES_SINGLE_CLIENT;
   }
 
-  if (format != nullptr) {
+  if (get_format) {
+    uint64_t format;
     bool format_specified = false;
     if (vm.count(at::IMAGE_NEW_FORMAT)) {
-      *format = 2;
+      format = 2;
       format_specified = true;
     } else if (vm.count(at::IMAGE_FORMAT)) {
-      *format = vm[at::IMAGE_FORMAT].as<uint32_t>();
+      format = vm[at::IMAGE_FORMAT].as<uint32_t>();
       format_specified = true;
     } else {
-      *format = g_conf->rbd_default_format;
+      format = g_conf->rbd_default_format;
     }
 
-    if (features_specified && *features != 0) {
-      if (format_specified && *format == 1) {
+    if (features_specified && features != 0) {
+      if (format_specified && format == 1) {
         std::cerr << "rbd: features not allowed with format 1; "
                   << "use --image-format 2" << std::endl;
         return -EINVAL;
       } else {
-        *format = 2;
+        format = 2;
         format_specified = true;
       }
     }
 
-    if ((*stripe_unit || *stripe_count) &&
-        (*stripe_unit != (1ull << *order) && *stripe_count != 1)) {
-      if (format_specified && *format == 1) {
+    if ((stripe_unit || stripe_count) &&
+        (stripe_unit != (1ull << order) && stripe_count != 1)) {
+      if (format_specified && format == 1) {
         std::cerr << "rbd: non-default striping not allowed with format 1; "
                   << "use --image-format 2" << std::endl;
         return -EINVAL;
       } else {
-        *format = 2;
+        format = 2;
         format_specified = 2;
       }
     }
 
     if (format_specified) {
-      int r = g_conf->set_val("rbd_default_format", stringify(*format));
+      int r = g_conf->set_val("rbd_default_format", stringify(format));
       assert(r == 0);
     }
+
+    opts->set(RBD_IMAGE_OPTION_FORMAT, format);
+  }
+
+  opts->set(RBD_IMAGE_OPTION_ORDER, order);
+  opts->set(RBD_IMAGE_OPTION_FEATURES, features);
+  opts->set(RBD_IMAGE_OPTION_STRIPE_UNIT, stripe_unit);
+  opts->set(RBD_IMAGE_OPTION_STRIPE_COUNT, stripe_count);
+
+  int r = get_journal_options(vm, opts);
+  if (r < 0) {
+    return r;
+  }
+
+  return 0;
+}
+
+int get_journal_options(const boost::program_options::variables_map &vm,
+			librbd::ImageOptions *opts) {
+
+  if (vm.count(at::JOURNAL_OBJECT_SIZE)) {
+    uint64_t size = vm[at::JOURNAL_OBJECT_SIZE].as<uint64_t>();
+    uint64_t order = 12;
+    while ((1ULL << order) < size) {
+      order++;
+    }
+    opts->set(RBD_IMAGE_OPTION_JOURNAL_ORDER, order);
+
+    int r = g_conf->set_val("rbd_journal_order", stringify(order));
+    assert(r == 0);
+  }
+  if (vm.count(at::JOURNAL_SPLAY_WIDTH)) {
+    opts->set(RBD_IMAGE_OPTION_JOURNAL_SPLAY_WIDTH,
+	      vm[at::JOURNAL_SPLAY_WIDTH].as<uint64_t>());
+
+    int r = g_conf->set_val("rbd_journal_splay_width",
+			    stringify(
+			      vm[at::JOURNAL_SPLAY_WIDTH].as<uint64_t>()));
+    assert(r == 0);
+  }
+  if (vm.count(at::JOURNAL_POOL)) {
+    opts->set(RBD_IMAGE_OPTION_JOURNAL_POOL,
+	      vm[at::JOURNAL_POOL].as<std::string>());
+
+    int r = g_conf->set_val("rbd_journal_pool",
+			    vm[at::JOURNAL_POOL].as<std::string>());
+    assert(r == 0);
   }
 
   return 0;
@@ -427,5 +590,19 @@ int snap_set(librbd::Image &image, const std::string snap_name) {
   return 0;
 }
 
+std::string image_id(librbd::Image& image) {
+  librbd::image_info_t info;
+  int r = image.stat(info, sizeof(info));
+  if (r < 0) {
+    return string();
+  }
+
+  char prefix[RBD_MAX_BLOCK_NAME_SIZE + 1];
+  strncpy(prefix, info.block_name_prefix, RBD_MAX_BLOCK_NAME_SIZE);
+  prefix[RBD_MAX_BLOCK_NAME_SIZE] = '\0';
+
+  return string(prefix + strlen(RBD_DATA_PREFIX));
+}
+
 } // namespace utils
 } // namespace rbd
diff --git a/src/tools/rbd/Utils.h b/src/tools/rbd/Utils.h
index 0b7794e..0f290df 100644
--- a/src/tools/rbd/Utils.h
+++ b/src/tools/rbd/Utils.h
@@ -46,19 +46,29 @@ int extract_spec(const std::string &spec, std::string *pool_name,
 std::string get_positional_argument(
     const boost::program_options::variables_map &vm, size_t index);
 
+std::string get_pool_name(const boost::program_options::variables_map &vm,
+                          size_t *arg_index);
+
 int get_pool_image_snapshot_names(
     const boost::program_options::variables_map &vm,
     argument_types::ArgumentModifier mod, size_t *spec_arg_index,
     std::string *pool_name, std::string *image_name, std::string *snap_name,
     SnapshotPresence snapshot_presence, bool image_required = true);
 
+int get_pool_journal_names(
+    const boost::program_options::variables_map &vm,
+    argument_types::ArgumentModifier mod, size_t *spec_arg_index,
+    std::string *pool_name, std::string *journal_name);
+
 int validate_snapshot_name(argument_types::ArgumentModifier mod,
                            const std::string &snap_name,
                            SnapshotPresence snapshot_presence);
 
 int get_image_options(const boost::program_options::variables_map &vm,
-                      int *order, uint32_t *format, uint64_t *features,
-                      uint32_t *stripe_unit, uint32_t *stripe_count);
+                      bool get_format, librbd::ImageOptions* opts);
+
+int get_journal_options(const boost::program_options::variables_map &vm,
+			librbd::ImageOptions *opts);
 
 int get_image_size(const boost::program_options::variables_map &vm,
                    uint64_t *size);
@@ -88,6 +98,8 @@ int init_and_open_image(const std::string &pool_name,
 
 int snap_set(librbd::Image &image, const std::string snap_name);
 
+std::string image_id(librbd::Image& image);
+
 } // namespace utils
 } // namespace rbd
 
diff --git a/src/tools/rbd/action/Clone.cc b/src/tools/rbd/action/Clone.cc
index 6c98433..df24349 100644
--- a/src/tools/rbd/action/Clone.cc
+++ b/src/tools/rbd/action/Clone.cc
@@ -18,14 +18,16 @@ namespace po = boost::program_options;
 int do_clone(librbd::RBD &rbd, librados::IoCtx &p_ioctx,
              const char *p_name, const char *p_snapname,
              librados::IoCtx &c_ioctx, const char *c_name,
-             uint64_t features, int *c_order,
-             uint64_t stripe_unit, uint64_t stripe_count) {
+             librbd::ImageOptions& opts) {
+  uint64_t features;
+  int r = opts.get(RBD_IMAGE_OPTION_FEATURES, &features);
+  assert(r == 0);
+
   if ((features & RBD_FEATURE_LAYERING) != RBD_FEATURE_LAYERING) {
     return -EINVAL;
   }
 
-  return rbd.clone2(p_ioctx, p_name, p_snapname, c_ioctx, c_name, features,
-                    c_order, stripe_unit, stripe_count);
+  return rbd.clone3(p_ioctx, p_name, p_snapname, c_ioctx, c_name, opts);
 }
 
 void get_arguments(po::options_description *positional,
@@ -57,12 +59,8 @@ int execute(const po::variables_map &vm) {
     return r;
   }
 
-  int order;
-  uint64_t features;
-  uint32_t stripe_unit;
-  uint32_t stripe_count;
-  r = utils::get_image_options(vm, &order, nullptr, &features, &stripe_unit,
-                               &stripe_count);
+  librbd::ImageOptions opts;
+  r = utils::get_image_options(vm, false, &opts);
   if (r < 0) {
     return r;
   }
@@ -82,8 +80,7 @@ int execute(const po::variables_map &vm) {
 
   librbd::RBD rbd;
   r = do_clone(rbd, io_ctx, image_name.c_str(), snap_name.c_str(), dst_io_ctx,
-               dst_image_name.c_str(), features, &order, stripe_unit,
-               stripe_count);
+               dst_image_name.c_str(), opts);
   if (r < 0) {
     std::cerr << "rbd: clone error: " << cpp_strerror(r) << std::endl;
     return r;
diff --git a/src/tools/rbd/action/Copy.cc b/src/tools/rbd/action/Copy.cc
index 9275e4b..7ab53ae 100644
--- a/src/tools/rbd/action/Copy.cc
+++ b/src/tools/rbd/action/Copy.cc
@@ -16,10 +16,11 @@ namespace at = argument_types;
 namespace po = boost::program_options;
 
 static int do_copy(librbd::Image &src, librados::IoCtx& dest_pp,
-                   const char *destname, bool no_progress)
+		   const char *destname, librbd::ImageOptions& opts,
+		   bool no_progress)
 {
   utils::ProgressContext pc("Image copy", no_progress);
-  int r = src.copy_with_progress(dest_pp, destname, pc);
+  int r = src.copy_with_progress3(dest_pp, destname, opts, pc);
   if (r < 0){
     pc.fail();
     return r;
@@ -33,6 +34,7 @@ void get_arguments(po::options_description *positional,
   at::add_image_or_snap_spec_options(positional, options,
                                      at::ARGUMENT_MODIFIER_SOURCE);
   at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_DEST);
+  at::add_create_image_options(options, false);
   at::add_no_progress_option(options);
 }
 
@@ -58,6 +60,12 @@ int execute(const po::variables_map &vm) {
     return r;
   }
 
+  librbd::ImageOptions opts;
+  r = utils::get_image_options(vm, false, &opts);
+  if (r < 0) {
+    return r;
+  }
+
   librados::Rados rados;
   librados::IoCtx io_ctx;
   librbd::Image image;
@@ -73,7 +81,7 @@ int execute(const po::variables_map &vm) {
     return r;
   }
 
-  r = do_copy(image, dst_io_ctx, dst_image_name.c_str(),
+  r = do_copy(image, dst_io_ctx, dst_image_name.c_str(), opts,
               vm[at::NO_PROGRESS].as<bool>());
   if (r < 0) {
     std::cerr << "rbd: copy failed: " << cpp_strerror(r) << std::endl;
@@ -83,7 +91,8 @@ int execute(const po::variables_map &vm) {
 }
 
 Shell::Action action(
-  {"copy"}, {"cp"}, "Copy src image to dest.", "", &get_arguments, &execute);
+  {"copy"}, {"cp"}, "Copy src image to dest.", at::get_long_features_help(),
+  &get_arguments, &execute);
 
 } // namespace copy
 } // namespace action
diff --git a/src/tools/rbd/action/Create.cc b/src/tools/rbd/action/Create.cc
index 49eedb6..5891939 100644
--- a/src/tools/rbd/action/Create.cc
+++ b/src/tools/rbd/action/Create.cc
@@ -16,15 +16,20 @@ namespace at = argument_types;
 namespace po = boost::program_options;
 
 static int do_create(librbd::RBD &rbd, librados::IoCtx& io_ctx,
-                     const char *imgname, uint64_t size, int *order,
-                     int format, uint64_t features,
-                     uint64_t stripe_unit, uint64_t stripe_count) {
+                     const char *imgname, uint64_t size,
+		     librbd::ImageOptions& opts) {
   int r;
+  uint64_t format;
+  r = opts.get(RBD_IMAGE_OPTION_FORMAT, &format);
+  assert(r == 0);
   if (format == 1) {
-    r = rbd.create(io_ctx, imgname, size, order);
+    uint64_t order;
+    r = opts.get(RBD_IMAGE_OPTION_ORDER, &order);
+    assert(r == 0);
+    int order_ = order;
+    r = rbd.create(io_ctx, imgname, size, &order_);
   } else {
-    r = rbd.create3(io_ctx, imgname, size, features, order,
-                    stripe_unit, stripe_count);
+    r = rbd.create4(io_ctx, imgname, size, opts);
   }
   if (r < 0) {
     return r;
@@ -51,13 +56,8 @@ int execute(const po::variables_map &vm) {
     return r;
   }
 
-  int order;
-  uint32_t format;
-  uint64_t features;
-  uint32_t stripe_unit;
-  uint32_t stripe_count;
-  r = utils::get_image_options(vm, &order, &format, &features, &stripe_unit,
-                               &stripe_count);
+  librbd::ImageOptions opts;
+  r = utils::get_image_options(vm, true, &opts);
   if (r < 0) {
     return r;
   }
@@ -76,8 +76,7 @@ int execute(const po::variables_map &vm) {
   }
 
   librbd::RBD rbd;
-  r = do_create(rbd, io_ctx, image_name.c_str(), size, &order, format, features,
-                stripe_unit, stripe_count);
+  r = do_create(rbd, io_ctx, image_name.c_str(), size, opts);
   if (r < 0) {
     std::cerr << "rbd: create error: " << cpp_strerror(r) << std::endl;
     return r;
diff --git a/src/tools/rbd/action/DiskUsage.cc b/src/tools/rbd/action/DiskUsage.cc
index 8e59ffe..580192f 100644
--- a/src/tools/rbd/action/DiskUsage.cc
+++ b/src/tools/rbd/action/DiskUsage.cc
@@ -131,7 +131,7 @@ static int do_disk_usage(librbd::RBD &rbd, librados::IoCtx &io_ctx,
     if (r < 0) {
       std::cerr << "rbd: failed to retrieve image features: " << cpp_strerror(r)
                 << std::endl;
-      return r;
+      goto out;
     }
     if ((features & RBD_FEATURE_FAST_DIFF) == 0) {
       std::cerr << "warning: fast-diff map is not enabled for " << *name << ". "
@@ -140,7 +140,8 @@ static int do_disk_usage(librbd::RBD &rbd, librados::IoCtx &io_ctx,
 
     librbd::image_info_t info;
     if (image.stat(info, sizeof(info)) < 0) {
-      return -EINVAL;
+      r = -EINVAL;
+      goto out;
     }
 
     std::vector<librbd::snap_info_t> snap_list;
@@ -163,7 +164,7 @@ static int do_disk_usage(librbd::RBD &rbd, librados::IoCtx &io_ctx,
       if (r < 0) {
         std::cerr << "rbd: error opening snapshot " << *name << "@"
                   << snap->name << ": " << cpp_strerror(r) << std::endl;
-        return r;
+        goto out;
       }
 
       if (imgname == NULL || (snapname != NULL && snap->name == snapname)) {
@@ -171,7 +172,7 @@ static int do_disk_usage(librbd::RBD &rbd, librados::IoCtx &io_ctx,
                                      snap_image, snap->size, tbl, f,
                                      &used_size);
         if (r < 0) {
-          return r;
+          goto out;
         }
 
         if (snapname != NULL) {
@@ -186,13 +187,14 @@ static int do_disk_usage(librbd::RBD &rbd, librados::IoCtx &io_ctx,
       r = compute_image_disk_usage(*name, "", last_snap_name, image, info.size,
                                    tbl, f, &used_size);
       if (r < 0) {
-        return r;
+        goto out;
       }
       total_prov += info.size;
       total_used += used_size;
     }
   }
 
+out:
   if (f) {
     f->close_section();
     if (imgname == NULL) {
@@ -211,7 +213,7 @@ static int do_disk_usage(librbd::RBD &rbd, librados::IoCtx &io_ctx,
     std::cout << tbl;
   }
 
-  return 0;
+  return r < 0 ? r : 0;
 }
 
 void get_arguments(po::options_description *positional,
diff --git a/src/tools/rbd/action/Feature.cc b/src/tools/rbd/action/Feature.cc
index 4bd61a6..12d4dd8 100644
--- a/src/tools/rbd/action/Feature.cc
+++ b/src/tools/rbd/action/Feature.cc
@@ -4,8 +4,10 @@
 #include "tools/rbd/ArgumentTypes.h"
 #include "tools/rbd/Shell.h"
 #include "tools/rbd/Utils.h"
+#include "include/stringify.h"
 #include "common/errno.h"
 #include <iostream>
+#include <map>
 #include <boost/program_options.hpp>
 
 namespace rbd {
@@ -16,11 +18,24 @@ namespace at = argument_types;
 namespace po = boost::program_options;
 
 void get_arguments(po::options_description *positional,
-                   po::options_description *options) {
+                   po::options_description *options, bool enabled) {
   at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
   positional->add_options()
     ("features", po::value<at::ImageFeatures>()->multitoken(),
      ("image features\n" + at::get_short_features_help(false)).c_str());
+  if (enabled) {
+    at::add_create_journal_options(options);
+  }
+}
+
+void get_arguments_disable(po::options_description *positional,
+			   po::options_description *options) {
+  get_arguments(positional, options, false);
+}
+
+void get_arguments_enable(po::options_description *positional,
+			  po::options_description *options) {
+  get_arguments(positional, options, true);
 }
 
 int execute(const po::variables_map &vm, bool enabled) {
@@ -35,6 +50,12 @@ int execute(const po::variables_map &vm, bool enabled) {
     return r;
   }
 
+  librbd::ImageOptions opts;
+  r = utils::get_journal_options(vm, &opts);
+  if (r < 0) {
+    return r;
+  }
+
   const std::vector<std::string> &args = vm[at::POSITIONAL_ARGUMENTS]
     .as<std::vector<std::string> >();
   std::vector<std::string> feature_names(args.begin() + 1, args.end());
@@ -76,10 +97,10 @@ int execute_enable(const po::variables_map &vm) {
 
 Shell::Action action_disable(
   {"feature", "disable"}, {}, "Disable the specified image feature.", "",
-  &get_arguments, &execute_disable);
+  &get_arguments_disable, &execute_disable);
 Shell::Action action_enable(
   {"feature", "enable"}, {}, "Enable the specified image feature.", "",
-  &get_arguments, &execute_enable);
+  &get_arguments_enable, &execute_enable);
 
 } // namespace feature
 } // namespace action
diff --git a/src/tools/rbd/action/Import.cc b/src/tools/rbd/action/Import.cc
index bb7cb7d..e7bf4d1 100644
--- a/src/tools/rbd/action/Import.cc
+++ b/src/tools/rbd/action/Import.cc
@@ -64,10 +64,8 @@ private:
 };
 
 static int do_import(librbd::RBD &rbd, librados::IoCtx& io_ctx,
-                     const char *imgname, int *order, const char *path,
-                     int format, uint64_t features,
-                     uint64_t stripe_unit, uint64_t stripe_count,
-                     bool no_progress)
+                     const char *imgname, const char *path,
+		     librbd::ImageOptions& opts, bool no_progress)
 {
   int fd, r;
   struct stat stat_buf;
@@ -75,13 +73,13 @@ static int do_import(librbd::RBD &rbd, librados::IoCtx& io_ctx,
 
   assert(imgname);
 
-  // default order as usual
-  if (*order == 0)
-    *order = 22;
+  uint64_t order;
+  r = opts.get(RBD_IMAGE_OPTION_ORDER, &order);
+  assert(r == 0);
 
   // try to fill whole imgblklen blocks for sparsification
   uint64_t image_pos = 0;
-  size_t imgblklen = 1 << *order;
+  size_t imgblklen = 1 << order;
   char *p = new char[imgblklen];
   size_t reqlen = imgblklen;    // amount requested from read
   ssize_t readlen;              // amount received from one read
@@ -94,7 +92,7 @@ static int do_import(librbd::RBD &rbd, librados::IoCtx& io_ctx,
   if (from_stdin) {
     throttle.reset(new SimpleThrottle(1, false));
     fd = 0;
-    size = 1ULL << *order;
+    size = 1ULL << order;
   } else {
     throttle.reset(new SimpleThrottle(
       max(g_conf->rbd_concurrent_management_ops, 1), false));
@@ -132,18 +130,27 @@ static int do_import(librbd::RBD &rbd, librados::IoCtx& io_ctx,
     posix_fadvise(fd, 0, 0, POSIX_FADV_SEQUENTIAL);
   }
 
+  uint64_t format;
+  r = opts.get(RBD_IMAGE_OPTION_FORMAT, &format);
+  assert(r == 0);
   if (format == 1) {
+    uint64_t stripe_unit, stripe_count;
+    r = opts.get(RBD_IMAGE_OPTION_STRIPE_UNIT, &stripe_unit);
+    assert(r == 0);
+    r = opts.get(RBD_IMAGE_OPTION_STRIPE_COUNT, &stripe_count);
+    assert(r == 0);
+
     // weird striping not allowed with format 1!
     if ((stripe_unit || stripe_count) &&
-        (stripe_unit != (1ull << *order) && stripe_count != 1)) {
+        (stripe_unit != (1ull << order) && stripe_count != 1)) {
       std::cerr << "non-default striping not allowed with format 1; "
                 << "use --image-format 2" << std::endl;
       return -EINVAL;
     }
-    r = rbd.create(io_ctx, imgname, size, order);
+    int order_ = order;
+    r = rbd.create(io_ctx, imgname, size, &order_);
   } else {
-    r = rbd.create3(io_ctx, imgname, size, features, order,
-                    stripe_unit, stripe_count);
+    r = rbd.create4(io_ctx, imgname, size, opts);
   }
   if (r < 0) {
     std::cerr << "rbd: image creation failed" << std::endl;
@@ -280,13 +287,8 @@ int execute(const po::variables_map &vm) {
     image_name = deprecated_image_name;
   }
 
-  int order;
-  uint32_t format;
-  uint64_t features;
-  uint32_t stripe_unit;
-  uint32_t stripe_count;
-  r = utils::get_image_options(vm, &order, &format, &features, &stripe_unit,
-                               &stripe_count);
+  librbd::ImageOptions opts;
+  r = utils::get_image_options(vm, true, &opts);
   if (r < 0) {
     return r;
   }
@@ -299,9 +301,8 @@ int execute(const po::variables_map &vm) {
   }
 
   librbd::RBD rbd;
-  r = do_import(rbd, io_ctx, image_name.c_str(), &order, path.c_str(),
-                format, features, stripe_unit, stripe_count,
-                vm[at::NO_PROGRESS].as<bool>());
+  r = do_import(rbd, io_ctx, image_name.c_str(), path.c_str(),
+                opts, vm[at::NO_PROGRESS].as<bool>());
   if (r < 0) {
     std::cerr << "rbd: import failed: " << cpp_strerror(r) << std::endl;
     return r;
diff --git a/src/tools/rbd/action/Info.cc b/src/tools/rbd/action/Info.cc
index 76e3940..f3d81ac 100644
--- a/src/tools/rbd/action/Info.cc
+++ b/src/tools/rbd/action/Info.cc
@@ -171,6 +171,14 @@ static int do_show_info(const char *imgname, librbd::Image& image,
     }
   }
 
+  if (features & RBD_FEATURE_JOURNALING) {
+    if (f) {
+      f->dump_string("journal", utils::image_id(image));
+    } else {
+      std::cout << "\tjournal: " << utils::image_id(image) << std::endl;
+    }
+  }
+
   if (f) {
     f->close_section();
     f->flush(std::cout);
diff --git a/src/tools/rbd/action/Journal.cc b/src/tools/rbd/action/Journal.cc
new file mode 100644
index 0000000..e00665b
--- /dev/null
+++ b/src/tools/rbd/action/Journal.cc
@@ -0,0 +1,969 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "common/Cond.h"
+#include "common/Formatter.h"
+#include "common/ceph_json.h"
+#include "common/errno.h"
+#include "common/safe_io.h"
+#include "include/stringify.h"
+#include <fstream>
+#include <sstream>
+#include <boost/program_options.hpp>
+
+#include "cls/journal/cls_journal_types.h"
+#include "cls/journal/cls_journal_client.h"
+
+#include "journal/Journaler.h"
+#include "journal/ReplayEntry.h"
+#include "journal/ReplayHandler.h"
+//#include "librbd/Journal.h" // XXXMG: for librbd::Journal::reset()
+#include "librbd/JournalTypes.h"
+
+namespace rbd {
+namespace action {
+namespace journal {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+// Print the immutable metadata (order, splay width, object pool) of the
+// journal identified by journal_id, either through the formatter f or, when
+// f is NULL, as plain text on stdout.
+//
+// Returns 0 on success or the negative error code from the metadata fetch.
+// A failed pool-name lookup is reported on stderr but is not fatal; the
+// object pool is simply left out of the output.
+static int do_show_journal_info(librados::Rados& rados, librados::IoCtx& io_ctx,
+                                const std::string& journal_id, Formatter *f)
+{
+  C_SaferCond metadata_ctx;
+
+  std::string header_oid = ::journal::Journaler::header_oid(journal_id);
+  std::string object_oid_prefix =
+    ::journal::Journaler::object_oid_prefix(io_ctx.get_id(), journal_id);
+
+  uint8_t order;
+  uint8_t splay_width;
+  int64_t pool_id;
+  cls::journal::client::get_immutable_metadata(io_ctx, header_oid, &order,
+                                               &splay_width, &pool_id,
+                                               &metadata_ctx);
+  int r = metadata_ctx.wait();
+  if (r < 0) {
+    std::cerr << "failed to get journal metadata: "  << cpp_strerror(r)
+              << std::endl;
+    return r;
+  }
+
+  // A negative pool id is skipped here, so no object pool is reported for it.
+  std::string object_pool_name;
+  if (pool_id >= 0) {
+    r = rados.pool_reverse_lookup(pool_id, &object_pool_name);
+    if (r < 0) {
+      std::cerr << "error looking up pool name for pool_id=" << pool_id << ": "
+                << cpp_strerror(r) << std::endl;
+    }
+  }
+  const bool have_pool_name = !object_pool_name.empty();
+
+  if (!f) {
+    // Plain-text rendering.
+    std::cout << "rbd journal '" << journal_id << "':" << std::endl
+              << "\theader_oid: " << header_oid << std::endl
+              << "\tobject_oid_prefix: " << object_oid_prefix << std::endl
+              << "\torder: " << static_cast<int>(order) << " ("
+              << prettybyte_t(1ull << order) << " objects)"<< std::endl
+              << "\tsplay_width: " << static_cast<int>(splay_width)
+              << std::endl;
+    if (have_pool_name) {
+      std::cout << "\tobject_pool: " << object_pool_name << std::endl;
+    }
+    return 0;
+  }
+
+  // Structured rendering.
+  f->open_object_section("journal");
+  f->dump_string("journal_id", journal_id);
+  f->dump_string("header_oid", header_oid);
+  f->dump_string("object_oid_prefix", object_oid_prefix);
+  f->dump_int("order", order);
+  f->dump_int("splay_width", splay_width);
+  if (have_pool_name) {
+    f->dump_string("object_pool", object_pool_name);
+  }
+  f->close_section();
+  f->flush(std::cout);
+  return 0;
+}
+
+// Print the mutable metadata of a journal: the minimum and active object
+// sets and every registered client.
+//
+// Output goes through the formatter f when one is given, otherwise it is
+// written as plain text to stdout. Returns 0 on success or the negative
+// error code from the metadata fetch.
+static int do_show_journal_status(librados::IoCtx& io_ctx,
+                                  const std::string& journal_id, Formatter *f)
+{
+  int r;
+
+  C_SaferCond cond;
+  uint64_t minimum_set;
+  uint64_t active_set;
+  std::set<cls::journal::Client> registered_clients;
+  std::string oid = ::journal::Journaler::header_oid(journal_id);
+
+  cls::journal::client::get_mutable_metadata(io_ctx, oid, &minimum_set,
+                                            &active_set, &registered_clients,
+                                            &cond);
+  r = cond.wait();
+  if (r < 0) {
+    std::cerr << "warning: failed to get journal metadata" << std::endl;
+    return r;
+  }
+
+  if (f) {
+    f->open_object_section("status");
+    f->dump_unsigned("minimum_set", minimum_set);
+    f->dump_unsigned("active_set", active_set);
+    // The clients form a list, so emit an array with one object per client.
+    // Dumping every client into a single object section would repeat the
+    // same keys once per client and produce ambiguous structured output.
+    f->open_array_section("registered_clients");
+    for (std::set<cls::journal::Client>::const_iterator c =
+           registered_clients.begin(); c != registered_clients.end(); ++c) {
+      f->open_object_section("client");
+      c->dump(f);
+      f->close_section();
+    }
+    f->close_section();  // registered_clients
+    f->close_section();  // status
+    f->flush(std::cout);
+  } else {
+    std::cout << "minimum_set: " << minimum_set << std::endl;
+    std::cout << "active_set: " << active_set << std::endl;
+    std::cout << "registered clients: " << std::endl;
+    for (std::set<cls::journal::Client>::const_iterator c =
+           registered_clients.begin(); c != registered_clients.end(); ++c) {
+      std::cout << "\t" << *c << std::endl;
+    }
+  }
+  return 0;
+}
+
+// Reset a journal by removing it and re-creating it with the same order,
+// splay width and pool id, then registering a fresh client.
+//
+// Returns 0 on success or the negative error code of the first failing step.
+static int do_reset_journal(librados::IoCtx& io_ctx,
+                            const std::string& journal_id)
+{
+  // XXXMG: does not work due to a linking issue
+  //return librbd::Journal::reset(io_ctx, journal_id);
+
+  ::journal::Journaler journaler(io_ctx, journal_id, "", 5);
+
+  // The journal has to be initialized before its metadata can be read back.
+  C_SaferCond init_ctx;
+  journaler.init(&init_ctx);
+
+  int r = init_ctx.wait();
+  if (r < 0) {
+    std::cerr << "failed to initialize journal: " << cpp_strerror(r)
+              << std::endl;
+    return r;
+  }
+
+  // Remember the current layout so the new journal matches the old one.
+  uint8_t order;
+  uint8_t splay_width;
+  int64_t pool_id;
+  journaler.get_metadata(&order, &splay_width, &pool_id);
+
+  r = journaler.remove(true);
+  if (r < 0) {
+    std::cerr << "failed to reset journal: " << cpp_strerror(r) << std::endl;
+    return r;
+  }
+
+  r = journaler.create(order, splay_width, pool_id);
+  if (r < 0) {
+    std::cerr << "failed to create journal: " << cpp_strerror(r) << std::endl;
+    return r;
+  }
+
+  // XXXMG
+  const std::string CLIENT_DESCRIPTION = "master image";
+
+  r = journaler.register_client(CLIENT_DESCRIPTION);
+  if (r >= 0) {
+    return 0;
+  }
+  std::cerr << "failed to register client: " << cpp_strerror(r) << std::endl;
+  return r;
+}
+
+// Thin wrapper around ::journal::Journaler used by the journal actions. It
+// registers the tool as a journal client on init() and unregisters it again
+// on shutdown().
+class Journaler : public ::journal::Journaler {
+public:
+  Journaler(librados::IoCtx& io_ctx, const std::string& journal_id,
+            const std::string &client_id)
+    : ::journal::Journaler(io_ctx, journal_id, client_id, 5) {
+  }
+
+  // Register the client, then initialize the underlying journaler.
+  // If initialization fails the registration is rolled back (best effort).
+  // Returns 0 on success, a negative error code otherwise.
+  int init() {
+    int r = register_client("rbd journal");
+    if (r < 0) {
+      std::cerr << "failed to register client: " << cpp_strerror(r)
+                << std::endl;
+      return r;
+    }
+
+    C_SaferCond init_ctx;
+    ::journal::Journaler::init(&init_ctx);
+    r = init_ctx.wait();
+    if (r >= 0) {
+      return 0;
+    }
+
+    std::cerr << "failed to initialize journal: " << cpp_strerror(r)
+              << std::endl;
+    (void) unregister_client();
+    return r;
+  }
+
+  // Shut the underlying journaler down and unregister the client.
+  // Returns the result of the unregistration.
+  int shutdown() {
+    ::journal::Journaler::shutdown();
+
+    const int r = unregister_client();
+    if (r < 0) {
+      std::cerr << "rbd: failed to unregister journal client: "
+                << cpp_strerror(r) << std::endl;
+    }
+    return r;
+  }
+};
+
+// Base class for actions that replay a journal and look at every entry.
+// Subclasses implement process_entry(); exec() drives the replay and returns
+// the first error recorded in m_r.
+class JournalPlayer {
+public:
+  JournalPlayer(librados::IoCtx& io_ctx, const std::string& journal_id,
+		const std::string &client_id) :
+    m_journaler(io_ctx, journal_id, client_id),
+    m_cond(),
+    m_r(0) {
+  }
+
+  virtual ~JournalPlayer() {}
+
+  // Initialize the journaler, replay the journal and block until the replay
+  // handler signals completion, then shut the journaler down.
+  // Returns m_r: 0 when nothing failed, otherwise the first error kept from
+  // replay, from shutdown, or stored by a subclass.
+  virtual int exec() {
+    int r;
+
+    r = m_journaler.init();
+    if (r < 0) {
+      return r;
+    }
+
+    ReplayHandler replay_handler(this);
+
+    m_journaler.start_replay(&replay_handler);
+
+    // Completed by handle_replay_complete() with the replay result.
+    r = m_cond.wait();
+
+    if (r < 0) {
+      std::cerr << "rbd: failed to process journal: " << cpp_strerror(r)
+		<< std::endl;
+      // Keep an error already stored by process_entry(), if any.
+      if (m_r == 0) {
+       m_r = r;
+      }
+    }
+
+    r = m_journaler.shutdown();
+    if (r < 0 && m_r == 0) {
+      m_r = r;
+    }
+
+    return m_r;
+  }
+
+protected:
+  // Adapter that forwards journal replay callbacks to the owning player.
+  struct ReplayHandler : public ::journal::ReplayHandler {
+    JournalPlayer *journal;
+    ReplayHandler(JournalPlayer *_journal) : journal(_journal) {}
+
+    // Reference counting is a no-op: in exec() the handler is a local
+    // variable and is not heap-managed.
+    virtual void get() {}
+    virtual void put() {}
+
+    virtual void handle_entries_available() {
+      journal->handle_replay_ready();
+    }
+    virtual void handle_complete(int r) {
+      journal->handle_replay_complete(r);
+    }
+  };
+
+  // Drain all currently available entries, handing each to process_entry().
+  // Stops early when process_entry() fails. The failure code is not
+  // propagated from here; subclasses that want it reported store it in m_r.
+  void handle_replay_ready() {
+    int r = 0;
+    while (true) {
+      ::journal::ReplayEntry replay_entry;
+      std::string tag;
+      if (!m_journaler.try_pop_front(&replay_entry, &tag)) {
+	break;
+      }
+
+      r = process_entry(replay_entry, tag);
+      if (r < 0) {
+	break;
+      }
+    }
+  }
+
+  // Handle one replayed entry. A negative return value stops the current
+  // drain loop in handle_replay_ready().
+  virtual int process_entry(::journal::ReplayEntry replay_entry,
+			    std::string& tag) = 0;
+
+  // Stop the replay and wake up exec() with the replay result r.
+  void handle_replay_complete(int r) {
+    m_journaler.stop_replay();
+    m_cond.complete(r);
+  }
+
+  Journaler m_journaler;
+  C_SaferCond m_cond;	// completed once the replay has finished
+  int m_r;		// result returned by exec(); 0 until an error is stored
+};
+
+// Decode a journal entry payload into event_entry and, when verbose is set,
+// pretty-print the decoded event as JSON on stdout.
+//
+// Returns 0 on success or -EINVAL when the payload cannot be decoded.
+static int inspect_entry(bufferlist& data,
+                         librbd::journal::EventEntry& event_entry,
+                         bool verbose) {
+  try {
+    bufferlist::iterator pos = data.begin();
+    ::decode(event_entry, pos);
+  } catch (const buffer::error &err) {
+    std::cerr << "failed to decode event entry: " << err.what() << std::endl;
+    return -EINVAL;
+  }
+
+  if (!verbose) {
+    return 0;
+  }
+
+  JSONFormatter json(true);
+  json.open_object_section("event_entry");
+  event_entry.dump(&json);
+  json.close_section();
+  json.flush(std::cout);
+  return 0;
+}
+
+// Journal player that decodes every entry to check it for structural errors
+// and prints a summary once the replay is over.
+class JournalInspector : public JournalPlayer {
+public:
+  JournalInspector(librados::IoCtx& io_ctx, const std::string& journal_id,
+                   bool verbose)
+    : JournalPlayer(io_ctx, journal_id, "INSPECT"),
+      m_verbose(verbose),
+      m_s() {
+  }
+
+  // Run the replay, then print the summary whatever the result was.
+  int exec() {
+    const int result = JournalPlayer::exec();
+    m_s.print();
+    return result;
+  }
+
+private:
+  // Counters gathered while inspecting.
+  struct Stats {
+    Stats() : total(0), error(0) {}
+
+    void print() {
+      std::cout << "Summary:" << std::endl;
+      std::cout << "  " << total << " entries inspected, " << error
+                << " errors" << std::endl;
+    }
+
+    int total;
+    int error;
+  };
+
+  // Decode one entry. A decode failure is counted and stored in m_r, but 0
+  // is always returned so the remaining entries are still inspected.
+  int process_entry(::journal::ReplayEntry replay_entry,
+                    std::string& tag) {
+    ++m_s.total;
+    if (m_verbose) {
+      std::cout << "Entry: tag=" << tag << ", commit_tid="
+                << replay_entry.get_commit_tid() << std::endl;
+    }
+
+    bufferlist payload = replay_entry.get_data();
+    librbd::journal::EventEntry decoded;
+    const int rc = inspect_entry(payload, decoded, m_verbose);
+    if (rc < 0) {
+      m_r = rc;
+      ++m_s.error;
+    }
+    return 0;
+  }
+
+  bool m_verbose;
+  Stats m_s;
+};
+
+// Inspect every entry of the given journal and return the inspector's
+// result (0 on success, negative error code otherwise).
+static int do_inspect_journal(librados::IoCtx& io_ctx,
+                              const std::string& journal_id,
+                              bool verbose) {
+  JournalInspector inspector(io_ctx, journal_id, verbose);
+  return inspector.exec();
+}
+
+// One journal entry as written to and read from an export file: the entry
+// tag, its commit tid, the decoded event type and the raw entry payload.
+struct ExportEntry {
+  std::string tag;
+  uint64_t commit_tid;
+  int type;
+  bufferlist entry;
+
+  ExportEntry() : commit_tid(0), type(0) {}
+
+  ExportEntry(const std::string& tag_, uint64_t commit_tid_, int type_,
+              const bufferlist& entry_)
+    : tag(tag_), commit_tid(commit_tid_), type(type_), entry(entry_) {
+  }
+
+  // Serialize all four fields through the given formatter.
+  void dump(Formatter *f) const {
+    ::encode_json("tag", tag, f);
+    ::encode_json("commit_tid", commit_tid, f);
+    ::encode_json("type", type, f);
+    ::encode_json("entry", entry, f);
+  }
+
+  // Populate all four fields from a parsed JSON object.
+  void decode_json(JSONObj *obj) {
+    JSONDecoder::decode_json("tag", tag, obj);
+    JSONDecoder::decode_json("commit_tid", commit_tid, obj);
+    JSONDecoder::decode_json("type", type, obj);
+    JSONDecoder::decode_json("entry", entry, obj);
+  }
+};
+
+// Journal player that writes every replayed entry to a file descriptor.
+//
+// Output format: a "# journal_id: <id>" comment line followed by one line
+// per entry of the form "<size> <json>", where <size> is the length of the
+// JSON text in bytes.
+class JournalExporter : public JournalPlayer {
+public:
+  // fd       - descriptor the export is written to (not closed here)
+  // no_error - when true, entries that fail to decode are skipped instead
+  //            of stopping the export
+  // verbose  - when true, each decoded entry is also dumped to stdout
+  JournalExporter(librados::IoCtx& io_ctx, const std::string& journal_id,
+                  int fd, bool no_error, bool verbose) :
+    JournalPlayer(io_ctx, journal_id, "EXPORT"),
+    m_journal_id(journal_id),
+    m_fd(fd),
+    m_no_error(no_error),
+    m_verbose(verbose),
+    m_s() {
+  }
+
+  // Write the header line, replay the journal and print the statistics.
+  // Returns 0 on success or a negative error code.
+  int exec() {
+    std::string header("# journal_id: " + m_journal_id + "\n");
+    int r;
+    r = safe_write(m_fd, header.c_str(), header.size());
+    if (r < 0) {
+      std::cerr << "rbd: failed to write to export file: " << cpp_strerror(r)
+                << std::endl;
+      return r;
+    }
+    r = JournalPlayer::exec();
+    m_s.print();
+    return r;
+  }
+
+private:
+  // Counters gathered while exporting.
+  struct Stats {
+    Stats() : total(0), error(0) {}
+
+    void print() {
+      std::cout << total << " entries processed, " << error << " errors"
+                << std::endl;
+    }
+
+    int total;
+    int error;
+  };
+
+  // Decode one entry and append it to the export file.
+  // Returns 0 to continue, or a negative error code to stop draining.
+  int process_entry(::journal::ReplayEntry replay_entry,
+                    std::string& tag) {
+    m_s.total++;
+    int type = -1;
+    bufferlist entry = replay_entry.get_data();
+    librbd::journal::EventEntry event_entry;
+    int r = inspect_entry(entry, event_entry, m_verbose);
+    if (r < 0) {
+      // Undecodable entry: it is never written. Record the error, and only
+      // keep going when the caller asked to ignore errors.
+      m_s.error++;
+      m_r = r;
+      return m_no_error ? 0 : r;
+    } else {
+      type = event_entry.get_event_type();
+    }
+    ExportEntry export_entry(tag, replay_entry.get_commit_tid(), type, entry);
+    JSONFormatter f;
+    ::encode_json("event_entry", export_entry, &f);
+    std::ostringstream oss;
+    f.flush(oss);
+    std::string objstr = oss.str();
+    std::string header = stringify(objstr.size()) + " ";
+    r = safe_write(m_fd, header.c_str(), header.size());
+    if (r == 0) {
+      r = safe_write(m_fd, objstr.c_str(), objstr.size());
+    }
+    if (r == 0) {
+      r = safe_write(m_fd, "\n", 1);
+    }
+    if (r < 0) {
+      std::cerr << "rbd: failed to write to export file: " << cpp_strerror(r)
+                << std::endl;
+      m_s.error++;
+      // The replay loop drops the value returned from here, so the failure
+      // has to be stored in m_r. Otherwise exec() would report success for
+      // a truncated export file.
+      m_r = r;
+      return r;
+    }
+    return 0;
+  }
+
+  std::string m_journal_id;
+  int m_fd;
+  bool m_no_error;
+  bool m_verbose;
+  Stats m_s;
+};
+
+// Export the given journal to path, or to stdout when path is "-".
+// A regular output file must not exist yet (it is opened with O_EXCL).
+// Returns 0 on success or a negative error code.
+static int do_export_journal(librados::IoCtx& io_ctx,
+                             const std::string& journal_id,
+                             const std::string& path,
+                             bool no_error, bool verbose) {
+  const bool to_stdout = (path == "-");
+
+  int fd = STDOUT_FILENO;
+  if (!to_stdout) {
+    fd = open(path.c_str(), O_WRONLY | O_CREAT | O_EXCL, 0644);
+    if (fd < 0) {
+      const int err = -errno;
+      std::cerr << "rbd: error creating " << path << std::endl;
+      return err;
+    }
+    posix_fadvise(fd, 0, 0, POSIX_FADV_SEQUENTIAL);
+  }
+
+  const int r =
+    JournalExporter(io_ctx, journal_id, fd, no_error, verbose).exec();
+
+  // Only close descriptors opened here; stdout stays open.
+  if (!to_stdout) {
+    close(fd);
+  }
+  return r;
+}
+
+// Reads journal entries from a file descriptor (in the format produced by
+// the journal export action) and appends them to a journal.
+class JournalImporter {
+public:
+  // fd       - descriptor the entries are read from (not closed here)
+  // no_error - when true, malformed entries are skipped instead of
+  //            stopping the import
+  // verbose  - when true, each decoded entry is also dumped to stdout
+  JournalImporter(librados::IoCtx& io_ctx, const std::string& journal_id,
+                  int fd, bool no_error, bool verbose) :
+    m_journaler(io_ctx, journal_id, "IMPORT"),
+    m_fd(fd),
+    m_no_error(no_error),
+    m_verbose(verbose) {
+  }
+
+  // Read the next entry into bl.
+  // Returns true when an entry was read (r is set to 0). Returns false when
+  // no entry could be read: r is 0 when input ended between entries and a
+  // negative error code on a read or format error.
+  bool read_entry(bufferlist& bl, int& r) {
+    // Entries are stored in the file using the following format:
+    //
+    //   # Optional comments
+    //   NNN {json encoded entry}
+    //   ...
+    //
+    // Where NNN is the encoded entry size.
+    bl.clear();
+    char buf[80];
+    // Skip line feed and comments (lines started with #).
+    while ((r = safe_read_exact(m_fd, buf, 1)) == 0) {
+      if (buf[0] == '\n') {
+        continue;
+      } else if (buf[0] == '#') {
+        while ((r = safe_read_exact(m_fd, buf, 1)) == 0) {
+          if (buf[0] == '\n') {
+            break;
+          }
+        }
+      } else {
+        break;
+      }
+    }
+    if (r < 0) {
+      // NOTE(review): -EDOM is treated as a clean end of input here;
+      // presumably that is how safe_read_exact reports a short read.
+      if (r == -EDOM) {
+        r = 0;
+      }
+      return false;
+    }
+    // Read entry size to buf.
+    // isdigit() requires a value representable as unsigned char, hence the
+    // casts (plain char may be negative).
+    if (!isdigit(static_cast<unsigned char>(buf[0]))) {
+      r = -EINVAL;
+      std::cerr << "rbd: import data invalid format (digit expected)"
+                << std::endl;
+      return false;
+    }
+    bool terminated = false;
+    for (size_t i = 1; i < sizeof(buf); i++) {
+      r = safe_read_exact(m_fd, buf + i, 1);
+      if (r < 0) {
+        std::cerr << "rbd: error reading import data" << std::endl;
+        return false;
+      }
+      if (!isdigit(static_cast<unsigned char>(buf[i]))) {
+        if (buf[i] != ' ') {
+          r = -EINVAL;
+          std::cerr << "rbd: import data invalid format (space expected)"
+                    << std::endl;
+          return false;
+        }
+        buf[i] = '\0';
+        terminated = true;
+        break;
+      }
+    }
+    if (!terminated) {
+      // The buffer filled up with digits and was never NUL-terminated;
+      // parsing it would read past the end of buf.
+      r = -EINVAL;
+      std::cerr << "rbd: import data invalid format (entry size too long)"
+                << std::endl;
+      return false;
+    }
+    // Parse with strtol and check the range: atoi has undefined behaviour
+    // on overflow, and the size is compared against an int below.
+    errno = 0;
+    long parsed_size = strtol(buf, NULL, 10);
+    int entry_size = static_cast<int>(parsed_size);
+    if (errno == ERANGE || parsed_size < 0 ||
+        static_cast<long>(entry_size) != parsed_size) {
+      r = -EINVAL;
+      std::cerr << "rbd: import data invalid format (entry size out of range)"
+                << std::endl;
+      return false;
+    }
+    if (entry_size == 0) {
+      r = -EINVAL;
+      std::cerr << "rbd: import data invalid format (zero entry size)"
+                << std::endl;
+      return false;
+    }
+    assert(entry_size > 0);
+    // Read entry.
+    r = bl.read_fd(m_fd, entry_size);
+    if (r < 0) {
+      std::cerr << "rbd: error reading from stdin: " << cpp_strerror(r)
+                << std::endl;
+      return false;
+    }
+    if (r != entry_size) {
+      std::cerr << "rbd: error reading from stdin: truncated"
+                << std::endl;
+      r = -EINVAL;
+      return false;
+    }
+    r = 0;
+    return true;
+  }
+
+  // Import all entries from the input and append them to the journal.
+  // Returns 0 when every entry was imported, otherwise a negative error
+  // code (also when bad entries were skipped because of no_error).
+  int exec() {
+    int r = m_journaler.init();
+    if (r < 0) {
+      return r;
+    }
+    m_journaler.start_append(0, 0, 0);
+
+    int r1 = 0;
+    bufferlist bl;
+    int n = 0;
+    int error_count = 0;
+    while (read_entry(bl, r)) {
+      n++;
+      // Count the entry as failed up front; undone after a successful append.
+      error_count++;
+      JSONParser p;
+      if (!p.parse(bl.c_str(), bl.length())) {
+        std::cerr << "rbd: error parsing input (entry " << n << ")"
+                  << std::endl;
+        r = -EINVAL;
+        if (m_no_error) {
+          r1 = r;
+          continue;
+        } else {
+          break;
+        }
+      }
+      ExportEntry e;
+      try {
+        decode_json_obj(e, &p);
+      } catch (JSONDecoder::err& err) {
+        std::cerr << "rbd: error json decoding import data (entry " << n << "):"
+                  << err.message << std::endl;
+        r = -EINVAL;
+        if (m_no_error) {
+          r1 = r;
+          continue;
+        } else {
+          break;
+        }
+      }
+      librbd::journal::EventEntry event_entry;
+      r = inspect_entry(e.entry, event_entry, m_verbose);
+      if (r < 0) {
+        std::cerr << "rbd: corrupted entry " << n << ": tag=" << e.tag
+                  << ", commit_tid=" << e.commit_tid << std::endl;
+        if (m_no_error) {
+          r1 = r;
+          continue;
+        } else {
+          break;
+        }
+      }
+      m_journaler.append(e.tag, e.entry);
+      error_count--;
+    }
+
+    // r is negative here when reading failed or when the loop stopped on a
+    // bad entry. Save it before r is reused for the append result below,
+    // otherwise an aborted import would be reported as a success.
+    if (r < 0 && r1 == 0) {
+      r1 = r;
+    }
+
+    std::cout << n << " entries processed, " << error_count << " errors"  << std::endl;
+
+    std::cout << "Waiting for journal append to complete..."  << std::endl;
+
+    C_SaferCond cond;
+    m_journaler.stop_append(&cond);
+    r = cond.wait();
+
+    if (r < 0) {
+      std::cerr << "failed to append journal: " << cpp_strerror(r) << std::endl;
+    }
+
+    if (r1 < 0 && r == 0) {
+      r = r1;
+    }
+    r1 = m_journaler.shutdown();
+    if (r1 < 0 && r == 0) {
+      r = r1;
+    }
+    return r;
+  }
+
+private:
+  Journaler m_journaler;
+  int m_fd;
+  bool m_no_error;
+  bool m_verbose;
+};
+
+// Import journal entries from path, or from stdin when path is "-", into
+// the given journal. Returns 0 on success or a negative error code.
+static int do_import_journal(librados::IoCtx& io_ctx,
+                             const std::string& journal_id,
+                             const std::string& path,
+                             bool no_error, bool verbose) {
+  const bool from_stdin = (path == "-");
+
+  int fd = STDIN_FILENO;
+  if (!from_stdin) {
+    fd = open(path.c_str(), O_RDONLY);
+    if (fd < 0) {
+      const int err = -errno;
+      std::cerr << "rbd: error opening " << path << std::endl;
+      return err;
+    }
+    posix_fadvise(fd, 0, 0, POSIX_FADV_SEQUENTIAL);
+  }
+
+  const int r =
+    JournalImporter(io_ctx, journal_id, fd, no_error, verbose).exec();
+
+  // Only close descriptors opened here; stdin stays open.
+  if (!from_stdin) {
+    close(fd);
+  }
+  return r;
+}
+
+// Declare the command-line arguments accepted by "journal info":
+// the journal spec followed by the output format options.
+void get_info_arguments(po::options_description *positional,
+			po::options_description *options) {
+  at::add_journal_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+  at::add_format_options(options);
+}
+
+// Entry point of "journal info": resolve pool and journal names, set up the
+// formatter and the cluster connection, then show the journal metadata.
+// Returns 0 on success or a negative error code.
+int execute_info(const po::variables_map &vm) {
+  std::string pool_name;
+  std::string journal_name;
+  size_t arg_index = 0;
+  int r = utils::get_pool_journal_names(vm, at::ARGUMENT_MODIFIER_NONE,
+                                        &arg_index, &pool_name, &journal_name);
+  if (r < 0) {
+    return r;
+  }
+
+  at::Format::Formatter formatter;
+  r = utils::get_formatter(vm, &formatter);
+  if (r < 0) {
+    return r;
+  }
+
+  librados::Rados rados;
+  librados::IoCtx io_ctx;
+  r = utils::init(pool_name, &rados, &io_ctx);
+  if (r < 0) {
+    return r;
+  }
+
+  r = do_show_journal_info(rados, io_ctx, journal_name, formatter.get());
+  if (r >= 0) {
+    return 0;
+  }
+  std::cerr << "rbd: journal info: " << cpp_strerror(r) << std::endl;
+  return r;
+}
+
+// Declare the command-line arguments accepted by "journal status":
+// the journal spec followed by the output format options.
+void get_status_arguments(po::options_description *positional,
+			  po::options_description *options) {
+  at::add_journal_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+  at::add_format_options(options);
+}
+
+// Entry point of "journal status": resolve pool and journal names, set up
+// the formatter and the cluster connection, then show the journal status.
+// Returns 0 on success or a negative error code.
+int execute_status(const po::variables_map &vm) {
+  std::string pool_name;
+  std::string journal_name;
+  size_t arg_index = 0;
+  int r = utils::get_pool_journal_names(vm, at::ARGUMENT_MODIFIER_NONE,
+                                        &arg_index, &pool_name, &journal_name);
+  if (r < 0) {
+    return r;
+  }
+
+  at::Format::Formatter formatter;
+  r = utils::get_formatter(vm, &formatter);
+  if (r < 0) {
+    return r;
+  }
+
+  librados::Rados rados;
+  librados::IoCtx io_ctx;
+  r = utils::init(pool_name, &rados, &io_ctx);
+  if (r < 0) {
+    return r;
+  }
+
+  r = do_show_journal_status(io_ctx, journal_name, formatter.get());
+  if (r >= 0) {
+    return 0;
+  }
+  std::cerr << "rbd: journal status: " << cpp_strerror(r) << std::endl;
+  return r;
+}
+
+// Declare the command-line arguments accepted by "journal reset":
+// only the journal spec.
+void get_reset_arguments(po::options_description *positional,
+			 po::options_description *options) {
+  at::add_journal_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+}
+
+// Entry point of "journal reset": resolve pool and journal names, connect
+// to the cluster and reset the journal.
+// Returns 0 on success or a negative error code.
+int execute_reset(const po::variables_map &vm) {
+  std::string pool_name;
+  std::string journal_name;
+  size_t arg_index = 0;
+  int r = utils::get_pool_journal_names(vm, at::ARGUMENT_MODIFIER_NONE,
+                                        &arg_index, &pool_name, &journal_name);
+  if (r < 0) {
+    return r;
+  }
+
+  librados::Rados rados;
+  librados::IoCtx io_ctx;
+  r = utils::init(pool_name, &rados, &io_ctx);
+  if (r < 0) {
+    return r;
+  }
+
+  r = do_reset_journal(io_ctx, journal_name);
+  if (r >= 0) {
+    return 0;
+  }
+  std::cerr << "rbd: journal reset: " << cpp_strerror(r) << std::endl;
+  return r;
+}
+
+// Declare the command-line arguments accepted by "journal inspect":
+// the journal spec followed by the verbose option.
+void get_inspect_arguments(po::options_description *positional,
+			   po::options_description *options) {
+  at::add_journal_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+  at::add_verbose_option(options);
+}
+
+// Entry point of "journal inspect": resolve pool and journal names, connect
+// to the cluster and inspect the journal entries.
+// Returns 0 on success or a negative error code.
+int execute_inspect(const po::variables_map &vm) {
+  std::string pool_name;
+  std::string journal_name;
+  size_t arg_index = 0;
+  int r = utils::get_pool_journal_names(vm, at::ARGUMENT_MODIFIER_NONE,
+                                        &arg_index, &pool_name, &journal_name);
+  if (r < 0) {
+    return r;
+  }
+
+  librados::Rados rados;
+  librados::IoCtx io_ctx;
+  r = utils::init(pool_name, &rados, &io_ctx);
+  if (r < 0) {
+    return r;
+  }
+
+  const bool verbose = vm[at::VERBOSE].as<bool>();
+  r = do_inspect_journal(io_ctx, journal_name, verbose);
+  if (r >= 0) {
+    return 0;
+  }
+  std::cerr << "rbd: journal inspect: " << cpp_strerror(r) << std::endl;
+  return r;
+}
+
+// Declare the command-line arguments accepted by "journal export":
+// the source journal spec first, then the export path, followed by the
+// verbose and no-error options.
+void get_export_arguments(po::options_description *positional,
+			  po::options_description *options) {
+  at::add_journal_spec_options(positional, options,
+			       at::ARGUMENT_MODIFIER_SOURCE);
+  at::add_path_options(positional, options,
+                       "export file (or '-' for stdout)");
+  at::add_verbose_option(options);
+  at::add_no_error_option(options);
+}
+
+// Entry point of "journal export": resolve the source journal and the
+// destination path, connect to the cluster and export the journal.
+// Returns 0 on success or a negative error code.
+int execute_export(const po::variables_map &vm) {
+  std::string pool_name;
+  std::string journal_name;
+  size_t arg_index = 0;
+  int r = utils::get_pool_journal_names(vm, at::ARGUMENT_MODIFIER_SOURCE,
+                                        &arg_index, &pool_name, &journal_name);
+  if (r < 0) {
+    return r;
+  }
+
+  // The export path is taken from the second positional argument.
+  std::string path;
+  r = utils::get_path(vm, utils::get_positional_argument(vm, 1), &path);
+  if (r < 0) {
+    return r;
+  }
+
+  librados::Rados rados;
+  librados::IoCtx io_ctx;
+  r = utils::init(pool_name, &rados, &io_ctx);
+  if (r < 0) {
+    return r;
+  }
+
+  const bool no_error = vm[at::NO_ERROR].as<bool>();
+  const bool verbose = vm[at::VERBOSE].as<bool>();
+  r = do_export_journal(io_ctx, journal_name, path, no_error, verbose);
+  if (r >= 0) {
+    return 0;
+  }
+  std::cerr << "rbd: journal export: " << cpp_strerror(r) << std::endl;
+  return r;
+}
+
+// Declare the command-line arguments accepted by "journal import":
+// the import path first, then the destination journal spec, followed by
+// the verbose and no-error options.
+void get_import_arguments(po::options_description *positional,
+			  po::options_description *options) {
+  at::add_path_options(positional, options,
+                       "import file (or '-' for stdin)");
+  at::add_journal_spec_options(positional, options, at::ARGUMENT_MODIFIER_DEST);
+  at::add_verbose_option(options);
+  at::add_no_error_option(options);
+}
+
+// Entry point of "journal import": resolve the source path and the
+// destination journal, connect to the cluster and import the entries.
+// Returns 0 on success or a negative error code.
+int execute_import(const po::variables_map &vm) {
+  // The import path is the first positional argument.
+  std::string path;
+  int r = utils::get_path(vm, utils::get_positional_argument(vm, 0), &path);
+  if (r < 0) {
+    return r;
+  }
+
+  // The journal spec follows the path, so name lookup starts at index 1.
+  size_t arg_index = 1;
+  std::string pool_name;
+  std::string journal_name;
+  r = utils::get_pool_journal_names(vm, at::ARGUMENT_MODIFIER_DEST,
+                                    &arg_index, &pool_name, &journal_name);
+  if (r < 0) {
+    return r;
+  }
+
+  librados::Rados rados;
+  librados::IoCtx io_ctx;
+  r = utils::init(pool_name, &rados, &io_ctx);
+  if (r < 0) {
+    return r;
+  }
+
+  r = do_import_journal(io_ctx, journal_name, path, vm[at::NO_ERROR].as<bool>(),
+                        vm[at::VERBOSE].as<bool>());
+  if (r < 0) {
+    // Name the failing action correctly (this used to say "journal export").
+    std::cerr << "rbd: journal import: " << cpp_strerror(r) << std::endl;
+    return r;
+  }
+  return 0;
+}
+
+// Shell action definitions for the "journal" sub-commands. Each one ties a
+// command name to its help text, its argument-declaration function and its
+// execute function.
+// NOTE(review): presumably constructing a Shell::Action registers it with
+// the shell -- confirm in tools/rbd/Shell.h.
+Shell::Action action_info(
+  {"journal", "info"}, {}, "Show information about image journal.", "",
+  &get_info_arguments, &execute_info);
+
+Shell::Action action_status(
+  {"journal", "status"}, {}, "Show status of image journal.", "",
+  &get_status_arguments, &execute_status);
+
+Shell::Action action_reset(
+  {"journal", "reset"}, {}, "Reset image journal.", "",
+  &get_reset_arguments, &execute_reset);
+
+Shell::Action action_inspect(
+  {"journal", "inspect"}, {}, "Inspect image journal for structural errors.", "",
+  &get_inspect_arguments, &execute_inspect);
+
+Shell::Action action_export(
+  {"journal", "export"}, {}, "Export image journal.", "",
+  &get_export_arguments, &execute_export);
+
+Shell::Action action_import(
+  {"journal", "import"}, {}, "Import image journal.", "",
+  &get_import_arguments, &execute_import);
+
+} // namespace journal
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/List.cc b/src/tools/rbd/action/List.cc
index 6b2041a..45ca812 100644
--- a/src/tools/rbd/action/List.cc
+++ b/src/tools/rbd/action/List.cc
@@ -83,15 +83,17 @@ int do_list(librbd::RBD &rbd, librados::IoCtx& io_ctx, bool lflag,
     parent.clear();
     r = im.parent_info(&pool, &image, &snap);
     if (r < 0 && r != -ENOENT)
-      return r;
+      goto out;
     bool has_parent = false;
     if (r != -ENOENT) {
       parent = pool + "/" + image + "@" + snap;
       has_parent = true;
     }
 
-    if (im.stat(info, sizeof(info)) < 0)
-      return -EINVAL;
+    if (im.stat(info, sizeof(info)) < 0) {
+      r = -EINVAL;
+      goto out;
+    }
 
     uint8_t old_format;
     im.old_format(&old_format);
@@ -100,7 +102,7 @@ int do_list(librbd::RBD &rbd, librados::IoCtx& io_ctx, bool lflag,
     bool exclusive;
     r = im.list_lockers(&lockers, &exclusive, NULL);
     if (r < 0)
-      return r;
+      goto out;
     std::string lockstr;
     if (!lockers.empty()) {
       lockstr = (exclusive) ? "excl" : "shr";
@@ -141,7 +143,7 @@ int do_list(librbd::RBD &rbd, librados::IoCtx& io_ctx, bool lflag,
         im.snap_set(s->name.c_str());
         r = im.snap_is_protected(s->name.c_str(), &is_protected);
         if (r < 0)
-          return r;
+          goto out;
         if (im.parent_info(&pool, &image, &snap) >= 0) {
           parent = pool + "/" + image + "@" + snap;
           has_parent = true;
@@ -173,6 +175,8 @@ int do_list(librbd::RBD &rbd, librados::IoCtx& io_ctx, bool lflag,
       }
     }
   }
+
+out:
   if (f) {
     f->close_section();
     f->flush(std::cout);
@@ -180,28 +184,20 @@ int do_list(librbd::RBD &rbd, librados::IoCtx& io_ctx, bool lflag,
     std::cout << tbl;
   }
 
-  return 0;
+  return r < 0 ? r : 0;
 }
 
 void get_arguments(po::options_description *positional,
                    po::options_description *options) {
-  positional->add_options()
-    ("pool-name", "pool name");
   options->add_options()
-    ("long,l", po::bool_switch(), "long listing format")
-    ("pool,p", po::value<std::string>(), "pool name");
+    ("long,l", po::bool_switch(), "long listing format");
+  at::add_pool_options(positional, options);
   at::add_format_options(options);
 }
 
 int execute(const po::variables_map &vm) {
-  std::string pool_name = utils::get_positional_argument(vm, 0);
-  if (pool_name.empty() && vm.count("pool")) {
-    pool_name = vm["pool"].as<std::string>();
-  }
-
-  if (pool_name.empty()) {
-    pool_name = at::DEFAULT_POOL_NAME;
-  }
+  size_t arg_index = 0;
+  std::string pool_name = utils::get_pool_name(vm, &arg_index);
 
   at::Format::Formatter formatter;
   int r = utils::get_formatter(vm, &formatter);
diff --git a/src/tools/rbd/action/MergeDiff.cc b/src/tools/rbd/action/MergeDiff.cc
index 9e08a37..eb3f3a3 100644
--- a/src/tools/rbd/action/MergeDiff.cc
+++ b/src/tools/rbd/action/MergeDiff.cc
@@ -1,6 +1,10 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab
 
+#define _LARGEFILE64_SOURCE
+#include <sys/types.h>
+#include <unistd.h>
+
 #include "tools/rbd/ArgumentTypes.h"
 #include "tools/rbd/Shell.h"
 #include "tools/rbd/Utils.h"
@@ -336,7 +340,8 @@ static int do_merge_diff(const char *first, const char *second,
           bufferptr bp = buffer::create(delta);
           r = safe_read_exact(fd, bp.c_str(), delta);
         } else {
-          r = lseek(fd, delta, SEEK_CUR);
+          off64_t l = lseek64(fd, delta, SEEK_CUR);
+          r = l < 0 ? -errno : 0;
         }
         if (r < 0) {
           std::cerr << "rbd: failed to skip first diff data" << std::endl;
diff --git a/src/tools/rbd/action/MirrorPool.cc b/src/tools/rbd/action/MirrorPool.cc
new file mode 100644
index 0000000..4d37ec1
--- /dev/null
+++ b/src/tools/rbd/action/MirrorPool.cc
@@ -0,0 +1,421 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "include/stringify.h"
+#include "common/config.h"
+#include "common/errno.h"
+#include "common/Formatter.h"
+#include "common/TextTable.h"
+#include "global/global_context.h"
+#include <iostream>
+#include <boost/program_options.hpp>
+#include <boost/regex.hpp>
+
+namespace rbd {
+namespace action {
+namespace mirror_pool {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+namespace {
+
+int init_remote(const std::string &config_path, const std::string &client_name,
+                const std::string &cluster_name, const std::string &pool_name,
+                librados::Rados *rados, librados::IoCtx *io_ctx) {
+  int r = rados->init2(client_name.c_str(), cluster_name.c_str(), 0);
+  if (r < 0) {
+    std::cerr << "rbd: couldn't initialize remote rados!" << std::endl;
+    return r;
+  }
+
+  r = rados->conf_read_file(config_path.empty() ? nullptr :
+                                                  config_path.c_str());
+  if (r < 0) {
+    std::cerr << "rbd: couldn't read remote configuration" << std::endl;
+    return r;
+  }
+
+  r = rados->connect();
+  if (r < 0) {
+    std::cerr << "rbd: couldn't connect to the remote cluster!" << std::endl;
+    return r;
+  }
+
+  if (io_ctx != nullptr) {
+    r = utils::init_io_ctx(*rados, pool_name, io_ctx);
+    if (r < 0) {
+      return r;
+    }
+  }
+  return 0;
+}
+
+int validate_uuid(const std::string &uuid) {
+  boost::regex pattern("^[A-F0-9]{8}-[A-F0-9]{4}-[A-F0-9]{4}-[A-F0-9]{4}-[A-F0-9]{12}$",
+                       boost::regex::icase);
+  boost::smatch match;
+  if (!boost::regex_match(uuid, match, pattern)) {
+    std::cerr << "rbd: invalid uuid '" << uuid << "'" << std::endl;
+    return -EINVAL;
+  }
+  return 0;
+}
+
+void add_cluster_uuid_option(po::options_description *positional) {
+  positional->add_options()
+    ("cluster-uuid", po::value<std::string>(), "cluster UUID");
+}
+
+int get_cluster_uuid(const po::variables_map &vm, size_t arg_index,
+                     std::string *cluster_uuid) {
+  *cluster_uuid = utils::get_positional_argument(vm, arg_index);
+  if (cluster_uuid->empty()) {
+    std::cerr << "rbd: must specify cluster uuid" << std::endl;
+    return -EINVAL;
+  }
+  return validate_uuid(*cluster_uuid);
+}
+
+int get_remote_cluster_spec(const po::variables_map &vm,
+                            const std::string &spec,
+                            std::string *remote_client_name,
+                            std::string *remote_cluster,
+                            std::string *remote_cluster_uuid) {
+  if (vm.count("remote-client-name")) {
+    *remote_client_name = vm["remote-client-name"].as<std::string>();
+  }
+  if (vm.count("remote-cluster")) {
+    *remote_cluster = vm["remote-cluster"].as<std::string>();
+  }
+  if (vm.count("remote-cluster-uuid")) {
+    *remote_cluster_uuid = vm["remote-cluster-uuid"].as<std::string>();
+    int r = validate_uuid(*remote_cluster_uuid);
+    if (r < 0) {
+      return r;
+    }
+  }
+
+  if (!spec.empty()) {
+    boost::regex pattern("^(?:(client\\.[^@]+)@)?([^/@]+)$");
+    boost::smatch match;
+    if (!boost::regex_match(spec, match, pattern)) {
+      std::cerr << "rbd: invalid spec '" << spec << "'" << std::endl;
+      return -EINVAL;
+    }
+    if (match[1].matched) {
+      *remote_client_name = match[1];
+    }
+    *remote_cluster = match[2];
+  }
+
+  if (remote_cluster->empty()) {
+    std::cerr << "rbd: remote cluster was not specified" << std::endl;
+    return -EINVAL;
+  }
+  return 0;
+}
+
+void format_mirror_peers(const std::string &config_path,
+                         at::Format::Formatter formatter,
+                         const std::vector<librbd::mirror_peer_t> &peers) {
+  if (formatter != nullptr) {
+    formatter->open_array_section("peers");
+    for (auto &peer : peers) {
+      formatter->open_object_section("peer");
+      formatter->dump_string("cluster_uuid", peer.cluster_uuid);
+      formatter->dump_string("cluster_name", peer.cluster_name);
+      formatter->dump_string("client_name", peer.client_name);
+      formatter->close_section();
+    }
+    formatter->close_section();
+  } else {
+    std::cout << "Peers: ";
+    if (peers.empty()) {
+      std::cout << "none" << std::endl;
+    } else {
+      TextTable tbl;
+      tbl.define_column("", TextTable::LEFT, TextTable::LEFT);
+      tbl.define_column("UUID", TextTable::LEFT, TextTable::LEFT);
+      tbl.define_column("NAME", TextTable::LEFT, TextTable::LEFT);
+      tbl.define_column("CLIENT", TextTable::LEFT, TextTable::LEFT);
+      for (auto &peer : peers) {
+        tbl << " "
+            << peer.cluster_uuid
+            << peer.cluster_name
+            << peer.client_name
+            << TextTable::endrow;
+      }
+      std::cout << std::endl << tbl;
+    }
+  }
+}
+
+} // anonymous namespace
+
+void get_peer_add_arguments(po::options_description *positional,
+                            po::options_description *options) {
+  at::add_pool_options(positional, options);
+  positional->add_options()
+    ("remote-cluster-spec", "remote cluster spec\n"
+     "(example: [<client name>@]<cluster name>");
+  options->add_options()
+    ("remote-client-name", po::value<std::string>(), "remote client name")
+    ("remote-cluster", po::value<std::string>(), "remote cluster name")
+    ("remote-cluster-uuid", po::value<std::string>(), "remote cluster uuid");
+}
+
+int execute_peer_add(const po::variables_map &vm) {
+  size_t arg_index = 0;
+  std::string pool_name = utils::get_pool_name(vm, &arg_index);
+
+  std::string remote_client_name = g_ceph_context->_conf->name.to_str();
+  std::string remote_cluster;
+  std::string remote_cluster_uuid;
+  int r = get_remote_cluster_spec(
+    vm, utils::get_positional_argument(vm, arg_index),
+    &remote_client_name, &remote_cluster, &remote_cluster_uuid);
+  if (r < 0) {
+    return r;
+  }
+
+  std::string config_path;
+  if (vm.count(at::CONFIG_PATH)) {
+    config_path = vm[at::CONFIG_PATH].as<std::string>();
+  }
+
+  librados::Rados rados;
+  librados::IoCtx io_ctx;
+  r = utils::init(pool_name, &rados, &io_ctx);
+  if (r < 0) {
+    return r;
+  }
+
+  if (remote_cluster_uuid.empty()) {
+    librados::Rados remote_rados;
+    librados::IoCtx remote_io_ctx;
+    r = init_remote(config_path, remote_client_name, remote_cluster,
+                    pool_name, &remote_rados, &remote_io_ctx);
+    if (r < 0) {
+      return r;
+    }
+
+    r = remote_rados.cluster_fsid(&remote_cluster_uuid);
+    if (r < 0) {
+      std::cerr << "rbd: error retrieving remote cluster id" << std::endl;
+      return r;
+    }
+  }
+
+  librbd::RBD rbd;
+  r = rbd.mirror_peer_add(io_ctx, remote_cluster_uuid, remote_cluster,
+                          remote_client_name);
+  if (r < 0) {
+    std::cerr << "rbd: error adding mirror peer" << std::endl;
+    return r;
+  }
+  return 0;
+}
+
+void get_peer_remove_arguments(po::options_description *positional,
+                               po::options_description *options) {
+  at::add_pool_options(positional, options);
+  add_cluster_uuid_option(positional);
+}
+
+int execute_peer_remove(const po::variables_map &vm) {
+  size_t arg_index = 0;
+  std::string pool_name = utils::get_pool_name(vm, &arg_index);
+
+  std::string cluster_uuid;
+  int r = get_cluster_uuid(vm, arg_index, &cluster_uuid);
+  if (r < 0) {
+    return r;
+  }
+
+  librados::Rados rados;
+  librados::IoCtx io_ctx;
+  r = utils::init(pool_name, &rados, &io_ctx);
+  if (r < 0) {
+    return r;
+  }
+
+  librbd::RBD rbd;
+  r = rbd.mirror_peer_remove(io_ctx, cluster_uuid);
+  if (r < 0) {
+    std::cerr << "rbd: error removing mirror peer" << std::endl;
+    return r;
+  }
+  return 0;
+}
+
+void get_peer_set_arguments(po::options_description *positional,
+                            po::options_description *options) {
+  at::add_pool_options(positional, options);
+  add_cluster_uuid_option(positional);
+  positional->add_options()
+    ("key", "peer parameter [client or cluster]")
+    ("value", "new client or cluster name");
+}
+
+int execute_peer_set(const po::variables_map &vm) {
+  size_t arg_index = 0;
+  std::string pool_name = utils::get_pool_name(vm, &arg_index);
+
+  std::string cluster_uuid;
+  int r = get_cluster_uuid(vm, arg_index++, &cluster_uuid);
+  if (r < 0) {
+    return r;
+  }
+
+  std::string key = utils::get_positional_argument(vm, arg_index++);
+  if (key != "client" && key != "cluster") {
+    std::cerr << "rbd: must specify 'client' or 'cluster' key." << std::endl;
+    return -EINVAL;
+  }
+
+  std::string value = utils::get_positional_argument(vm, arg_index++);
+  if (value.empty()) {
+    std::cerr << "rbd: must specify new " << key << " value." << std::endl;
+  }
+
+  librados::Rados rados;
+  librados::IoCtx io_ctx;
+  r = utils::init(pool_name, &rados, &io_ctx);
+  if (r < 0) {
+    return r;
+  }
+
+  librbd::RBD rbd;
+  if (key == "client") {
+    r = rbd.mirror_peer_set_client(io_ctx, cluster_uuid.c_str(), value.c_str());
+  } else {
+    r = rbd.mirror_peer_set_cluster(io_ctx, cluster_uuid.c_str(),
+                                    value.c_str());
+  }
+  if (r < 0) {
+    return r;
+  }
+  return 0;
+}
+
+void get_enable_disable_arguments(po::options_description *positional,
+                                  po::options_description *options) {
+  at::add_pool_options(positional, options);
+}
+
+int execute_enable_disable(const po::variables_map &vm, bool enabled) {
+  size_t arg_index = 0;
+  std::string pool_name = utils::get_pool_name(vm, &arg_index);
+
+  librados::Rados rados;
+  librados::IoCtx io_ctx;
+  int r = utils::init(pool_name, &rados, &io_ctx);
+  if (r < 0) {
+    return r;
+  }
+
+  librbd::RBD rbd;
+  r = rbd.mirror_set_enabled(io_ctx, enabled);
+  if (r < 0) {
+    return r;
+  }
+  return 0;
+}
+
+int execute_disable(const po::variables_map &vm) {
+  return execute_enable_disable(vm, false);
+}
+
+int execute_enable(const po::variables_map &vm) {
+  return execute_enable_disable(vm, true);
+}
+
+void get_info_arguments(po::options_description *positional,
+                        po::options_description *options) {
+  at::add_pool_options(positional, options);
+  at::add_format_options(options);
+}
+
+int execute_info(const po::variables_map &vm) {
+  size_t arg_index = 0;
+  std::string pool_name = utils::get_pool_name(vm, &arg_index);
+
+  at::Format::Formatter formatter;
+  int r = utils::get_formatter(vm, &formatter);
+  if (r < 0) {
+    return r;
+  }
+
+  std::string config_path;
+  if (vm.count(at::CONFIG_PATH)) {
+    config_path = vm[at::CONFIG_PATH].as<std::string>();
+  }
+
+  librados::Rados rados;
+  librados::IoCtx io_ctx;
+  r = utils::init(pool_name, &rados, &io_ctx);
+  if (r < 0) {
+    return r;
+  }
+
+  librbd::RBD rbd;
+  bool enabled;
+  r = rbd.mirror_is_enabled(io_ctx, &enabled);
+  if (r < 0) {
+    return r;
+  }
+
+  std::vector<librbd::mirror_peer_t> mirror_peers;
+  r = rbd.mirror_peer_list(io_ctx, &mirror_peers);
+  if (r < 0) {
+    return r;
+  }
+
+  if (formatter != nullptr) {
+    formatter->open_object_section("mirror");
+    formatter->dump_bool("enabled", enabled);
+  } else {
+    std::cout << "Enabled: " << (enabled ? "true" : "false") << std::endl;
+  }
+
+  format_mirror_peers(config_path, formatter, mirror_peers);
+  if (formatter != nullptr) {
+    formatter->close_section();
+    formatter->flush(std::cout);
+  }
+  return 0;
+}
+
+Shell::Action action_add(
+  {"mirror", "pool", "peer", "add"}, {},
+  "Add a mirroring peer to a pool.", "",
+  &get_peer_add_arguments, &execute_peer_add);
+Shell::Action action_remove(
+  {"mirror", "pool", "peer", "remove"}, {},
+  "Remove a mirroring peer from a pool.", "",
+  &get_peer_remove_arguments, &execute_peer_remove);
+Shell::Action action_set(
+  {"mirror", "pool", "peer", "set"}, {},
+  "Update mirroring peer settings.", "",
+  &get_peer_set_arguments, &execute_peer_set);
+
+Shell::Action action_disable(
+  {"mirror", "pool", "disable"}, {},
+  "Disable RBD mirroring by default within a pool.", "",
+  &get_enable_disable_arguments, &execute_disable);
+Shell::Action action_enable(
+  {"mirror", "pool", "enable"}, {},
+  "Enable RBD mirroring by default within a pool.", "",
+  &get_enable_disable_arguments, &execute_enable);
+Shell::Action action_info(
+  {"mirror", "pool", "info"}, {},
+  "Show information about the pool mirroring configuration.", {},
+  &get_info_arguments, &execute_info);
+
+} // namespace mirror_pool
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/Nbd.cc b/src/tools/rbd/action/Nbd.cc
new file mode 100644
index 0000000..be42173
--- /dev/null
+++ b/src/tools/rbd/action/Nbd.cc
@@ -0,0 +1,186 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "include/stringify.h"
+#include "common/SubProcess.h"
+#include <iostream>
+#include <boost/algorithm/string/predicate.hpp>
+#include <boost/scope_exit.hpp>
+#include <boost/program_options.hpp>
+
+namespace rbd {
+namespace action {
+namespace nbd {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+static int call_nbd_cmd(const po::variables_map &vm,
+                        const std::vector<const char*> &args)
+{
+  char exe_path[PATH_MAX];
+  ssize_t exe_path_bytes = readlink("/proc/self/exe", exe_path,
+				    sizeof(exe_path) - 1);
+  if (exe_path_bytes < 0) {
+    strcpy(exe_path, "rbd-nbd");
+  } else {
+    if (snprintf(exe_path + exe_path_bytes,
+                 sizeof(exe_path) - exe_path_bytes,
+                 "-nbd") < 0) {
+      return -EOVERFLOW;
+    }
+  }
+
+  SubProcess process(exe_path, SubProcess::KEEP, SubProcess::KEEP, SubProcess::KEEP);
+
+  if (vm.count("conf")) {
+    process.add_cmd_arg("--conf");
+    process.add_cmd_arg(vm["conf"].as<std::string>().c_str());
+  }
+  if (vm.count("cluster")) {
+    process.add_cmd_arg("--cluster");
+    process.add_cmd_arg(vm["cluster"].as<std::string>().c_str());
+  }
+  if (vm.count("id")) {
+    process.add_cmd_arg("--id");
+    process.add_cmd_arg(vm["id"].as<std::string>().c_str());
+  }
+  if (vm.count("name")) {
+    process.add_cmd_arg("--name");
+    process.add_cmd_arg(vm["name"].as<std::string>().c_str());
+  }
+  if (vm.count("mon_host")) {
+    process.add_cmd_arg("--mon_host");
+    process.add_cmd_arg(vm["mon_host"].as<std::string>().c_str());
+  }
+  if (vm.count("keyfile")) {
+    process.add_cmd_arg("--keyfile");
+    process.add_cmd_arg(vm["keyfile"].as<std::string>().c_str());
+  }
+  if (vm.count("keyring")) {
+    process.add_cmd_arg("--keyring");
+    process.add_cmd_arg(vm["keyring"].as<std::string>().c_str());
+  }
+
+  for (std::vector<const char*>::const_iterator p = args.begin();
+       p != args.end(); p++)
+    process.add_cmd_arg(*p);
+
+  if (process.spawn()) {
+    std::cerr << "rbd: failed to run rbd-nbd: " << process.err() << std::endl;
+    return -EINVAL;
+  } else if (process.join()) {
+    std::cerr << "rbd: rbd-nbd failed with error: " << process.err() << std::endl;
+    return -EINVAL;
+  }
+
+  return 0;
+}
+
+void get_show_arguments(po::options_description *positional,
+                        po::options_description *options)
+{ }
+
+int execute_show(const po::variables_map &vm)
+{
+  std::vector<const char*> args;
+
+  args.push_back("list-mapped");
+
+  return call_nbd_cmd(vm, args);
+}
+
+void get_map_arguments(po::options_description *positional,
+                       po::options_description *options)
+{
+  at::add_image_or_snap_spec_options(positional, options,
+                                     at::ARGUMENT_MODIFIER_NONE);
+  options->add_options()
+    ("read-only", po::bool_switch(), "mount read-only")
+    ("device", po::value<std::string>(), "specify nbd device");
+}
+
+int execute_map(const po::variables_map &vm)
+{
+  size_t arg_index = 0;
+  std::string pool_name;
+  std::string image_name;
+  std::string snap_name;
+  int r = utils::get_pool_image_snapshot_names(
+    vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &image_name,
+    &snap_name, utils::SNAPSHOT_PRESENCE_PERMITTED);
+  if (r < 0) {
+    return r;
+  }
+
+  std::vector<const char*> args;
+
+  args.push_back("map");
+  std::string img;
+  img.append(pool_name);
+  img.append("/");
+  img.append(image_name);
+  if (!snap_name.empty()) {
+    img.append("@");
+    img.append(snap_name);
+  }
+  args.push_back(img.c_str());
+
+  if (vm["read-only"].as<bool>())
+    args.push_back("--read-only");
+
+  if (vm.count("device")) {
+    args.push_back("--device");
+    args.push_back(vm["device"].as<std::string>().c_str());
+  }
+
+  return call_nbd_cmd(vm, args);
+}
+
+void get_unmap_arguments(po::options_description *positional,
+                   po::options_description *options)
+{
+  positional->add_options()
+    ("device-spec", "specify nbd device");
+}
+
+int execute_unmap(const po::variables_map &vm)
+{
+  std::string device_name = utils::get_positional_argument(vm, 0);
+  if (!boost::starts_with(device_name, "/dev/")) {
+    device_name.clear();
+  }
+
+  if (device_name.empty()) {
+    std::cerr << "rbd: nbd unmap requires device path" << std::endl;
+    return -EINVAL;
+  }
+
+  std::vector<const char*> args;
+
+  args.push_back("unmap");
+  args.push_back(device_name.c_str());
+
+  return call_nbd_cmd(vm, args);
+}
+
+Shell::SwitchArguments switched_arguments({"read-only"});
+
+Shell::Action action_show(
+  {"nbd", "list"}, {"nbd", "ls"}, "List the nbd devices already used.", "",
+  &get_show_arguments, &execute_show);
+
+Shell::Action action_map(
+  {"nbd", "map"}, {}, "Map image to a nbd device.", "",
+  &get_map_arguments, &execute_map);
+
+Shell::Action action_unmap(
+  {"nbd", "unmap"}, {}, "Unmap a nbd device.", "",
+  &get_unmap_arguments, &execute_unmap);
+
+} // namespace nbd
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd_nbd/rbd-nbd.cc b/src/tools/rbd_nbd/rbd-nbd.cc
new file mode 100644
index 0000000..b86bceb
--- /dev/null
+++ b/src/tools/rbd_nbd/rbd-nbd.cc
@@ -0,0 +1,739 @@
+#include "include/int_types.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <string.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <assert.h>
+
+#include <linux/nbd.h>
+#include <linux/fs.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+
+#include <iostream>
+#include <boost/regex.hpp>
+
+#include "mon/MonClient.h"
+#include "common/config.h"
+
+#include "common/errno.h"
+#include "common/module.h"
+#include "common/safe_io.h"
+#include "common/ceph_argparse.h"
+#include "common/Preforker.h"
+#include "global/global_init.h"
+
+#include "include/rados/librados.hpp"
+#include "include/rbd/librbd.hpp"
+
+static void usage()
+{
+  std::cout << "Usage: rbd-nbd [options] map <image-or-snap-spec>  Map a image to nbd device\n"
+            << "               unmap <device path>                 Unmap nbd device\n"
+            << "               list-mapped                         List mapped nbd devices\n"
+            << "Options: --device <device path>                    Specify nbd device path\n"
+            << "         --read-only                               Map readonly\n"
+            << "         --nbds_max <limit>                        Override for module param\n"
+            << std::endl;
+}
+
+static Preforker forker;
+static std::string devpath, poolname("rbd"), imgname, snapname;
+static bool readonly = false;
+static int nbds_max = 0;
+
+#ifdef CEPH_BIG_ENDIAN
+#define ntohll(a) (a)
+#elif defined(CEPH_LITTLE_ENDIAN)
+#define ntohll(a) swab64(a)
+#else
+#error "Could not determine endianess"
+#endif
+#define htonll(a) ntohll(a)
+
+class NBDServer
+{
+private:
+  int fd;
+  librbd::Image &image;
+
+public:
+  NBDServer(int _fd, librbd::Image& _image)
+    : fd(_fd)
+    , image(_image)
+    , terminated(false)
+    , lock("NBDServer::Locker")
+    , reader_thread(*this, &NBDServer::reader_entry)
+    , writer_thread(*this, &NBDServer::writer_entry)
+    , started(false)
+  {}
+
+private:
+  atomic_t terminated;
+
+  void shutdown()
+  {
+    if (terminated.compare_and_swap(false, true)) {
+      ::shutdown(fd, SHUT_RDWR);
+
+      Mutex::Locker l(lock);
+      cond.Signal();
+    }
+  }
+
+  struct IOContext
+  {
+    xlist<IOContext*>::item item;
+    NBDServer *server;
+    struct nbd_request request;
+    struct nbd_reply reply;
+    bufferlist data;
+    int command;
+
+    IOContext()
+      : item(this)
+    {}
+  };
+
+  Mutex lock;
+  Cond cond;
+  xlist<IOContext*> io_pending;
+  xlist<IOContext*> io_finished;
+
+  void io_start(IOContext *ctx)
+  {
+    Mutex::Locker l(lock);
+    io_pending.push_back(&ctx->item);
+  }
+
+  void io_finish(IOContext *ctx)
+  {
+    Mutex::Locker l(lock);
+    assert(ctx->item.is_on_list());
+    ctx->item.remove_myself();
+    io_finished.push_back(&ctx->item);
+    cond.Signal();
+  }
+
+  IOContext *wait_io_finish()
+  {
+    Mutex::Locker l(lock);
+    while(io_finished.empty() && !terminated.read())
+      cond.Wait(lock);
+
+    if (io_finished.empty())
+      return NULL;
+
+    IOContext *ret = io_finished.front();
+    io_finished.pop_front();
+
+    return ret;
+  }
+
+  void wait_clean()
+  {
+    assert(!reader_thread.is_started());
+    Mutex::Locker l(lock);
+    while(!io_pending.empty())
+      cond.Wait(lock);
+
+    while(!io_finished.empty()) {
+      ceph::unique_ptr<IOContext> free_ctx(io_finished.front());
+      io_finished.pop_front();
+    }
+  }
+
+  static void aio_callback(librbd::completion_t cb, void *arg)
+  {
+    librbd::RBD::AioCompletion *aio_completion =
+    reinterpret_cast<librbd::RBD::AioCompletion*>(cb);
+
+    IOContext *ctx = reinterpret_cast<IOContext *>(arg);
+    int ret = aio_completion->get_return_value();
+    if (ret > 0)
+      ret = 0;
+    ctx->reply.error = htonl(ret);
+    ctx->server->io_finish(ctx);
+
+    aio_completion->release();
+  }
+
+  void reader_entry()
+  {
+    while (!terminated.read()) {
+      ceph::unique_ptr<IOContext> ctx(new IOContext());
+      ctx->server = this;
+      if (safe_read_exact(fd, &ctx->request, sizeof(struct nbd_request)) < 0)
+        return;
+
+      if (ctx->request.magic != htonl(NBD_REQUEST_MAGIC))
+        return;
+
+      ctx->request.from = ntohll(ctx->request.from);
+      ctx->request.type = ntohl(ctx->request.type);
+      ctx->request.len = ntohl(ctx->request.len);
+
+      ctx->reply.magic = htonl(NBD_REPLY_MAGIC);
+      memcpy(ctx->reply.handle, ctx->request.handle, sizeof(ctx->reply.handle));
+
+      ctx->command = ctx->request.type & 0x0000ffff;
+
+      switch (ctx->command)
+      {
+        case NBD_CMD_DISC:
+          return;
+        case NBD_CMD_WRITE:
+          bufferptr ptr(ctx->request.len);
+          if (safe_read_exact(fd, ptr.c_str(), ctx->request.len) < 0)
+            return;
+          ctx->data.push_back(ptr);
+          break;
+      }
+
+      IOContext *pctx = ctx.release();
+      io_start(pctx);
+      librbd::RBD::AioCompletion *c = new librbd::RBD::AioCompletion(pctx, aio_callback);
+      switch (pctx->command)
+      {
+        case NBD_CMD_WRITE:
+          image.aio_write(pctx->request.from, pctx->request.len, pctx->data, c);
+          break;
+        case NBD_CMD_READ:
+          image.aio_read(pctx->request.from, pctx->request.len, pctx->data, c);
+          break;
+        case NBD_CMD_FLUSH:
+          image.aio_flush(c);
+          break;
+        case NBD_CMD_TRIM:
+          image.aio_discard(pctx->request.from, pctx->request.len, c);
+          break;
+        default:
+          return;
+      }
+    }
+  }
+
+  void writer_entry()
+  {
+    while (!terminated.read()) {
+      ceph::unique_ptr<IOContext> ctx(wait_io_finish());
+      if (!ctx)
+        return;
+
+      if (safe_write(fd, &ctx->reply, sizeof(struct nbd_reply)) < 0)
+        return;
+      if (ctx->command == NBD_CMD_READ && ctx->reply.error == htonl(0)) {
+        if (ctx->data.write_fd(fd) < 0)
+          return;
+      }
+    }
+  }
+
+  class ThreadHelper : public Thread
+  {
+  public:
+    typedef void (NBDServer::*entry_func)();
+  private:
+    NBDServer &server;
+    entry_func func;
+  public:
+    ThreadHelper(NBDServer &_server, entry_func _func)
+      :server(_server)
+      ,func(_func)
+    {}
+  protected:
+    virtual void* entry()
+    {
+      (server.*func)();
+      server.shutdown();
+      return NULL;
+    }
+  } reader_thread, writer_thread;
+
+  bool started;
+public:
+  void start()
+  {
+    if (!started) {
+      started = true;
+
+      reader_thread.create();
+      writer_thread.create();
+    }
+  }
+
+  void stop()
+  {
+    if (started) {
+      shutdown();
+
+      reader_thread.join();
+      writer_thread.join();
+
+      wait_clean();
+
+      started = false;
+    }
+  }
+
+  ~NBDServer()
+  {
+    stop();
+  }
+};
+
+
+class NBDWatchCtx : public librados::WatchCtx2
+{
+private:
+  int fd;
+  librados::IoCtx &io_ctx;
+  librbd::Image &image;
+  std::string header_oid;
+  unsigned long size;
+public:
+  NBDWatchCtx(int _fd,
+              librados::IoCtx &_io_ctx,
+              librbd::Image &_image,
+              std::string &_header_oid,
+              unsigned long _size)
+    : fd(_fd)
+    , io_ctx(_io_ctx)
+    , image(_image)
+    , header_oid(_header_oid)
+    , size(_size)
+  { }
+
+  virtual ~NBDWatchCtx() {}
+
+  virtual void handle_notify(uint64_t notify_id,
+                             uint64_t cookie,
+                             uint64_t notifier_id,
+                             bufferlist& bl)
+  {
+    librbd::image_info_t info;
+    if (image.stat(info, sizeof(info)) == 0) {
+      unsigned long new_size = info.size;
+
+      if (new_size != size) {
+        if (ioctl(fd, BLKFLSBUF, NULL) < 0)
+          std::cerr << "rbd-nbd: invalidate page cache failed status: " << cpp_strerror(errno) << std::endl;
+        if (ioctl(fd, NBD_SET_SIZE, new_size) < 0)
+          std::cerr << "rbd-nbd: resize failed status: " << cpp_strerror(errno) << std::endl;
+        if (image.invalidate_cache() < 0)
+          std::cerr << "rbd-nbd: invalidate rbd cache failed" << std::endl;
+        size = new_size;
+      }
+    }
+
+    bufferlist reply;
+    io_ctx.notify_ack(header_oid, notify_id, cookie, reply);
+  }
+
+  virtual void handle_error(uint64_t cookie, int err)
+  {
+    //ignore
+  }
+};
+
+static int open_device(const char* path, bool try_load_moudle = false)
+{
+  int nbd = open(path, O_RDWR);
+  if (nbd < 0 && try_load_moudle && access("/sys/module/nbd", F_OK) != 0) {
+    int r;
+    if (nbds_max) {
+      ostringstream param;
+      param << "nbds_max=" << nbds_max;
+      r = module_load("nbd", param.str().c_str());
+    } else {
+      r = module_load("nbd", NULL);
+    }
+    if (r < 0) {
+      cerr << "rbd-nbd: failed to load nbd kernel module: " << cpp_strerror(-r) << std::endl;
+      return r;
+    }
+    nbd = open(path, O_RDWR);
+  }
+  return nbd;
+}
+
+static int do_map()
+{
+  int r;
+
+  librados::Rados rados;
+  librbd::RBD rbd;
+  librados::IoCtx io_ctx;
+  librbd::Image image;
+
+  int read_only;
+  unsigned long flags;
+  unsigned long size;
+
+  int fd[2];
+  int nbd;
+  int null_fd = -1;
+
+  uint8_t old_format;
+  librbd::image_info_t info;
+
+  if (socketpair(AF_UNIX, SOCK_STREAM, 0, fd) == -1) {
+    r = -errno;
+    goto close_ret;
+  }
+
+  if (devpath.empty()) {
+    char dev[64];
+    int index = 0;
+    while (true) {
+      snprintf(dev, sizeof(dev), "/dev/nbd%d", index);
+
+      nbd = open_device(dev, true);
+      if (nbd < 0) {
+        r = nbd;
+        cerr << "rbd-nbd: failed to find unused device" << std::endl;
+        goto close_fd;
+      }
+
+      r = ioctl(nbd, NBD_SET_SOCK, fd[0]);
+      if (r < 0) {
+        close(nbd);
+        ++index;
+        continue;
+      }
+
+      devpath = dev;
+      break;
+    }
+  } else {
+    nbd = open_device(devpath.c_str(), true);
+    if (nbd < 0) {
+      r = nbd;
+      cerr << "rbd-nbd: failed to open device: " << devpath << std::endl;
+      goto close_fd;
+    }
+
+    r = ioctl(nbd, NBD_SET_SOCK, fd[0]);
+    if (r < 0) {
+      r = -errno;
+      cerr << "rbd-nbd: the device " << devpath << " is busy" << std::endl;
+      close(nbd);
+      goto close_fd;
+    }
+  }
+
+  flags = NBD_FLAG_SEND_FLUSH | NBD_FLAG_SEND_TRIM | NBD_FLAG_HAS_FLAGS;
+  if (!snapname.empty() || readonly)
+    flags |= NBD_FLAG_READ_ONLY;
+
+  r = rados.init_with_context(g_ceph_context);
+  if (r < 0)
+    goto close_nbd;
+
+  r = rados.connect();
+  if (r < 0)
+    goto close_nbd;
+
+  r = rados.ioctx_create(poolname.c_str(), io_ctx);
+  if (r < 0)
+    goto close_nbd;
+
+  r = rbd.open(io_ctx, image, imgname.c_str());
+  if (r < 0)
+    goto close_nbd;
+
+  if (!snapname.empty()) {
+    r = image.snap_set(snapname.c_str());
+    if (r < 0)
+      goto close_nbd;
+  }
+
+  r = image.stat(info, sizeof(info));
+  if (r < 0)
+    goto close_nbd;
+
+  r = ioctl(nbd, NBD_SET_BLKSIZE, 512UL);
+  if (r < 0) {
+    r = -errno;
+    goto close_nbd;
+  }
+
+  size = info.size;
+  r = ioctl(nbd, NBD_SET_SIZE, size);
+  if (r < 0) {
+    r = -errno;
+    goto close_nbd;
+  }
+
+  ioctl(nbd, NBD_SET_FLAGS, flags);
+
+  read_only = snapname.empty() ? 0 : 1;
+  r = ioctl(nbd, BLKROSET, (unsigned long) &read_only);
+  if (r < 0) {
+    r = -errno;
+    goto close_nbd;
+  }
+
+  r = image.old_format(&old_format);
+  if (r < 0)
+    goto close_nbd;
+
+  {
+    string header_oid;
+    uint64_t watcher;
+
+    if (old_format != 0) {
+      header_oid = imgname + RBD_SUFFIX;
+    } else {
+      char prefix[RBD_MAX_BLOCK_NAME_SIZE + 1];
+      strncpy(prefix, info.block_name_prefix, RBD_MAX_BLOCK_NAME_SIZE);
+      prefix[RBD_MAX_BLOCK_NAME_SIZE] = '\0';
+
+      std::string image_id(prefix + strlen(RBD_DATA_PREFIX));
+      header_oid = RBD_HEADER_PREFIX + image_id;
+    }
+
+    NBDWatchCtx watch_ctx(nbd, io_ctx, image, header_oid, info.size);
+    r = io_ctx.watch2(header_oid, &watcher, &watch_ctx);
+    if (r < 0)
+      goto close_nbd;
+
+    if (g_conf->daemonize) {
+      r = open("/dev/null", O_RDWR);
+      if (r < 0)
+        goto close_watcher;
+      null_fd = r;
+    }
+
+    cout << devpath << std::endl;
+
+    if (g_conf->daemonize) {
+      forker.daemonize();
+
+      ::dup2(null_fd, STDIN_FILENO);
+      ::dup2(null_fd, STDOUT_FILENO);
+      ::dup2(null_fd, STDERR_FILENO);
+      close(null_fd);
+    }
+
+    {
+      NBDServer server(fd[1], image);
+
+      server.start();
+      ioctl(nbd, NBD_DO_IT);
+      server.stop();
+    }
+
+close_watcher:
+    io_ctx.unwatch2(watcher);
+  }
+
+close_nbd:
+  if (r < 0) {
+    ioctl(nbd, NBD_CLEAR_SOCK);
+    cerr << "rbd-nbd: failed to map, status: " << cpp_strerror(-r) << std::endl;
+  }
+  close(nbd);
+close_fd:
+  close(fd[0]);
+  close(fd[1]);
+close_ret:
+  image.close();
+  io_ctx.close();
+  rados.shutdown();
+  return r;
+}
+
+static int do_unmap()
+{
+  int nbd = open_device(devpath.c_str());
+  if (nbd < 0) {
+    cerr << "rbd-nbd: failed to open device: " << devpath << std::endl;
+    return nbd;
+  }
+
+  if (ioctl(nbd, NBD_DISCONNECT) < 0)
+    cerr << "rbd-nbd: the device is not used" << std::endl;
+  ioctl(nbd, NBD_CLEAR_SOCK);
+  close(nbd);
+
+  return 0;
+}
+
+static int parse_imgpath(const std::string &imgpath)
+{
+  boost::regex pattern("^(?:([^/@]+)/)?([^/@]+)(?:@([^/@]+))?$");
+  boost::smatch match;
+  if (!boost::regex_match(imgpath, match, pattern)) {
+    std::cerr << "rbd-nbd: invalid spec '" << imgpath << "'" << std::endl;
+    return -EINVAL;
+  }
+
+  if (match[1].matched)
+    poolname = match[1];
+
+  imgname = match[2];
+
+  if (match[3].matched)
+    snapname = match[3];
+
+  return 0;
+}
+
+static void list_mapped_devices()
+{
+  char path[64];
+  int m = 0;
+  int fd[2];
+
+  if (socketpair(AF_UNIX, SOCK_STREAM, 0, fd) == -1)
+    return;
+
+  while (true) {
+    snprintf(path, sizeof(path), "/dev/nbd%d", m);
+    int nbd = open_device(path);
+    if (nbd < 0)
+      break;
+    if (ioctl(nbd, NBD_SET_SOCK, fd[0]) != 0)
+      cout << path << std::endl;
+    else
+      ioctl(nbd, NBD_CLEAR_SOCK);
+    close(nbd);
+    m++;
+  }
+
+  close(fd[0]);
+  close(fd[1]);
+}
+
+static int rbd_nbd(int argc, const char *argv[])
+{
+  int r;
+  enum {
+    None,
+    Connect,
+    Disconnect,
+    List
+  } cmd = None;
+
+  vector<const char*> args;
+
+  argv_to_vec(argc, argv, args);
+  env_to_vec(args);
+  global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, CODE_ENVIRONMENT_DAEMON,
+              CINIT_FLAG_UNPRIVILEGED_DAEMON_DEFAULTS);
+
+  std::vector<const char*>::iterator i;
+
+  for (i = args.begin(); i != args.end(); ) {
+    if (ceph_argparse_flag(args, i, "-h", "--help", (char*)NULL)) {
+      usage();
+      return 0;
+    } else if (ceph_argparse_witharg(args, i, &devpath, "--device", (char *)NULL)) {
+    } else if (ceph_argparse_witharg(args, i, &nbds_max, cerr, "--nbds_max", (char *)NULL)) {
+    } else if (ceph_argparse_flag(args, i, "--read-only", (char *)NULL)) {
+      readonly = true;
+    } else {
+      ++i;
+    }
+  }
+
+  if (args.begin() != args.end()) {
+    if (strcmp(*args.begin(), "map") == 0) {
+      cmd = Connect;
+    } else if (strcmp(*args.begin(), "unmap") == 0) {
+      cmd = Disconnect;
+    } else if (strcmp(*args.begin(), "list-mapped") == 0) {
+      cmd = List;
+    } else {
+      cerr << "rbd-nbd: unknown command: " << *args.begin() << std::endl;
+      return EXIT_FAILURE;
+    }
+    args.erase(args.begin());
+  }
+
+  if (cmd == None) {
+    cerr << "rbd-nbd: must specify command" << std::endl;
+    return EXIT_FAILURE;
+  }
+
+  switch (cmd) {
+    case Connect:
+      if (args.begin() == args.end()) {
+        cerr << "rbd-nbd: must specify image-or-snap-spec" << std::endl;
+        return EXIT_FAILURE;
+      }
+      if (parse_imgpath(string(*args.begin())) < 0)
+        return EXIT_FAILURE;
+      args.erase(args.begin());
+      break;
+    case Disconnect:
+      if (args.begin() == args.end()) {
+        cerr << "rbd-nbd: must specify nbd device path" << std::endl;
+        return EXIT_FAILURE;
+      }
+      devpath = *args.begin();
+      args.erase(args.begin());
+      break;
+    default:
+      //shut up gcc;
+      break;
+  }
+
+  if (args.begin() != args.end()) {
+    cerr << "rbd-nbd: unknown args: " << *args.begin() << std::endl;
+    return EXIT_FAILURE;
+  }
+
+  switch (cmd) {
+    case Connect:
+      common_init_finish(g_ceph_context);
+
+      if (imgname.empty()) {
+        cerr << "rbd-nbd: image name was not specified" << std::endl;
+        return EXIT_FAILURE;
+      }
+
+      r = do_map();
+      if (r < 0)
+        return EXIT_FAILURE;
+      break;
+    case Disconnect:
+      r = do_unmap();
+      if (r < 0)
+        return EXIT_FAILURE;
+      break;
+    case List:
+      list_mapped_devices();
+      break;
+    default:
+      usage();
+      return EXIT_FAILURE;
+  }
+
+  return 0;
+}
+
+int main(int argc, const char *argv[])
+{
+  std::string err;
+
+  if (forker.prefork(err) < 0) {
+    cerr << err << std::endl;
+    return EXIT_FAILURE;
+  }
+
+  if (forker.is_child()) {
+    forker.exit(rbd_nbd(argc, argv));
+  } else if (forker.parent_wait(err) < 0) {
+    cerr << err << std::endl;
+    return EXIT_FAILURE;
+  } else {
+    return 0;
+  }
+}
diff --git a/src/tools/scratchtool.c b/src/tools/scratchtool.c
index 17d5c6b..19d5e01 100644
--- a/src/tools/scratchtool.c
+++ b/src/tools/scratchtool.c
@@ -80,7 +80,7 @@ static int do_rados_getxattrs(rados_ioctx_t io_ctx, const char *oid,
 		for (i = 0; i < nval; ++i) {
 			if (strcmp(exkeys[i], key))
 				continue;
-			if ((len == strlen(exvals[i]) + 1) && (!strcmp(exvals[i], val))) {
+			if ((len == strlen(exvals[i]) + 1) && (val != NULL) && (!strcmp(exvals[i], val))) {
 				nfound++;
 				break;
 			}
diff --git a/src/tracing/librbd.tp b/src/tracing/librbd.tp
index f6e1cb1..146e06e 100644
--- a/src/tracing/librbd.tp
+++ b/src/tracing/librbd.tp
@@ -504,6 +504,23 @@ TRACEPOINT_EVENT(librbd, invalidate_cache_exit,
     )
 )
 
+TRACEPOINT_EVENT(librbd, poll_io_events_enter,
+    TP_ARGS(
+        void*, imagectx,
+        int, numcomp),
+    TP_FIELDS(
+        ctf_integer_hex(void*, imagectx, imagectx)
+        ctf_integer(int, numcomp, numcomp)
+    )
+)
+
+TRACEPOINT_EVENT(librbd, poll_io_events_exit,
+    TP_ARGS(
+        int, retval),
+    TP_FIELDS(
+        ctf_integer(int, retval, retval)
+    )
+)
 TRACEPOINT_EVENT(librbd, metadata_get_enter,
     TP_ARGS(
         void*, imagectx,
@@ -1634,6 +1651,28 @@ TRACEPOINT_EVENT(librbd, get_flags_exit,
     )
 )
 
+TRACEPOINT_EVENT(librbd, set_image_notification_enter,
+    TP_ARGS(
+        void*, imagectx,
+        int, fd,
+        int, type),
+    TP_FIELDS(
+      ctf_integer_hex(void*, imagectx, imagectx)
+      ctf_integer(int, fd, fd)
+      ctf_integer(int, type, type)
+    )
+)
+
+TRACEPOINT_EVENT(librbd, set_image_notification_exit,
+    TP_ARGS(
+        void*, imagectx,
+        int, retval),
+    TP_FIELDS(
+      ctf_integer_hex(void*, imagectx, imagectx)
+      ctf_integer(int, retval, retval)
+    )
+)
+
 TRACEPOINT_EVENT(librbd, is_exclusive_lock_owner_enter,
     TP_ARGS(
         void*, imagectx),
diff --git a/src/vstart.sh b/src/vstart.sh
index 7e32047..daf4d7d 100755
--- a/src/vstart.sh
+++ b/src/vstart.sh
@@ -3,6 +3,18 @@
 # abort on failure
 set -e
 
+if [ -n "$VSTART_DEST" ]; then
+  SRC_PATH=`dirname $0`
+  SRC_PATH=`(cd $SRC_PATH; pwd)`
+
+  CEPH_DIR=$SRC_PATH
+  CEPH_BIN=$SRC_PATH
+  CEPH_LIB=$SRC_PATH/.libs
+  CEPH_CONF_PATH=$VSTART_DEST
+  CEPH_DEV_DIR=$VSTART_DEST/dev
+  CEPH_OUT_DIR=$VSTART_DEST/out
+fi
+
 if [ -e CMakeCache.txt ]; then
   # Out of tree build, learn source location from CMakeCache.txt
   SRC_ROOT=`grep Ceph_SOURCE_DIR CMakeCache.txt | cut -d "=" -f 2`
@@ -59,6 +71,7 @@ export DYLD_LIBRARY_PATH=$CEPH_LIB:$DYLD_LIBRARY_PATH
 [ -z "$CEPH_DEV_DIR" ] && CEPH_DEV_DIR="$CEPH_DIR/dev"
 [ -z "$CEPH_OUT_DIR" ] && CEPH_OUT_DIR="$CEPH_DIR/out"
 [ -z "$CEPH_RGW_PORT" ] && CEPH_RGW_PORT=8000
+[ -z "$CEPH_CONF_PATH" ] && CEPH_CONF_PATH=$CEPH_DIR
 
 extra_conf=""
 new=0
@@ -83,8 +96,8 @@ journal=1
 
 MON_ADDR=""
 
-conf_fn="$CEPH_DIR/ceph.conf"
-keyring_fn="$CEPH_DIR/keyring"
+conf_fn="$CEPH_CONF_PATH/ceph.conf"
+keyring_fn="$CEPH_CONF_PATH/keyring"
 osdmap_fn="/tmp/ceph_osdmap.$$"
 monmap_fn="/tmp/ceph_monmap.$$"
 
@@ -409,6 +422,7 @@ if [ "$start_mon" -eq 1 ]; then
         osd crush chooseleaf type = 0
         osd pool default min size = 1
         osd failsafe full ratio = .99
+        mon osd reporter subtree level = osd
         mon osd full ratio = .99
         mon data avail warn = 10
         mon data avail crit = 1
@@ -460,7 +474,7 @@ $DAEMONOPTS
         osd journal size = 100
         osd class tmp = out
         osd class dir = $OBJCLASS_PATH
-        osd scrub load threshold = 5.0
+        osd scrub load threshold = 2000.0
         osd debug op order = true
         filestore wbthrottle xfs ios start flusher = 10
         filestore wbthrottle xfs ios hard limit = 20
diff --git a/systemd/Makefile.am b/systemd/Makefile.am
index 1a43415..02dee06 100644
--- a/systemd/Makefile.am
+++ b/systemd/Makefile.am
@@ -1,5 +1,9 @@
 unitfiles = \
 	ceph.target \
+        ceph-osd.target \
+        ceph-mon.target \
+        ceph-mds.target \
+        ceph-radosgw.target \
 	ceph-mds@.service \
 	ceph-mon@.service \
 	ceph-create-keys@.service \
diff --git a/systemd/Makefile.in b/systemd/Makefile.in
index 9d26136..17a6821 100644
--- a/systemd/Makefile.in
+++ b/systemd/Makefile.in
@@ -186,6 +186,7 @@ CXXCPP = @CXXCPP@
 CXXDEPMODE = @CXXDEPMODE@
 CXXFLAGS = @CXXFLAGS@
 CYGPATH_W = @CYGPATH_W@
+CYTHON_CHECK = @CYTHON_CHECK@
 DEFS = @DEFS@
 DEPDIR = @DEPDIR@
 DLLTOOL = @DLLTOOL@
@@ -272,6 +273,7 @@ PYTHON_PLATFORM = @PYTHON_PLATFORM@
 PYTHON_PREFIX = @PYTHON_PREFIX@
 PYTHON_VERSION = @PYTHON_VERSION@
 RANLIB = @RANLIB@
+RDYNAMIC_FLAG = @RDYNAMIC_FLAG@
 RESOLV_LIBS = @RESOLV_LIBS@
 RPM_RELEASE = @RPM_RELEASE@
 SED = @SED@
@@ -354,6 +356,10 @@ top_srcdir = @top_srcdir@
 user_rgw = @user_rgw@
 unitfiles = \
 	ceph.target \
+        ceph-osd.target \
+        ceph-mon.target \
+        ceph-mds.target \
+        ceph-radosgw.target \
 	ceph-mds@.service \
 	ceph-mon@.service \
 	ceph-create-keys@.service \
diff --git a/systemd/ceph-disk@.service b/systemd/ceph-disk@.service
index cff7e9f..8b18ba4 100644
--- a/systemd/ceph-disk@.service
+++ b/systemd/ceph-disk@.service
@@ -4,5 +4,5 @@ Description=Ceph disk activation: %f
 [Service]
 Type=oneshot
 KillMode=none
-ExecStart=/bin/flock /var/lock/ceph-disk -c '/usr/sbin/ceph-disk --verbose --log-stdout trigger --sync %f'
+ExecStart=/bin/sh -c 'flock /var/lock/ceph-disk /usr/sbin/ceph-disk --verbose --log-stdout trigger --sync %f'
 TimeoutSec=0
diff --git a/systemd/ceph-mds.target b/systemd/ceph-mds.target
new file mode 100644
index 0000000..fbf1ba1
--- /dev/null
+++ b/systemd/ceph-mds.target
@@ -0,0 +1,5 @@
+[Unit]
+Description=ceph target allowing to start/stop all ceph-mds@.service instances at once
+PartOf=ceph.target
+[Install]
+WantedBy=multi-user.target ceph.target
diff --git a/systemd/ceph-mds@.service b/systemd/ceph-mds@.service
index f86f4ee..708f42c 100644
--- a/systemd/ceph-mds@.service
+++ b/systemd/ceph-mds@.service
@@ -2,7 +2,7 @@
 Description=Ceph metadata server daemon
 After=network-online.target local-fs.target
 Wants=network-online.target local-fs.target
-PartOf=ceph.target
+PartOf=ceph-mds.target
 
 [Service]
 LimitNOFILE=1048576
@@ -13,4 +13,4 @@ ExecStart=/usr/bin/ceph-mds -f --cluster ${CLUSTER} --id %i --setuser ceph --set
 ExecReload=/bin/kill -HUP $MAINPID
 
 [Install]
-WantedBy=ceph.target
+WantedBy=ceph-mds.target
diff --git a/systemd/ceph-mon.target b/systemd/ceph-mon.target
new file mode 100644
index 0000000..87b585f
--- /dev/null
+++ b/systemd/ceph-mon.target
@@ -0,0 +1,5 @@
+[Unit]
+Description=ceph target allowing to start/stop all ceph-mon@.service instances at once
+PartOf=ceph.target
+[Install]
+WantedBy=multi-user.target ceph.target
diff --git a/systemd/ceph-mon@.service b/systemd/ceph-mon@.service
index a0eeff8..03a9b6c 100644
--- a/systemd/ceph-mon@.service
+++ b/systemd/ceph-mon@.service
@@ -8,7 +8,7 @@ Description=Ceph cluster monitor daemon
 After=network-online.target local-fs.target ceph-create-keys@%i.service
 Wants=network-online.target local-fs.target ceph-create-keys@%i.service
 
-PartOf=ceph.target
+PartOf=ceph-mon.target
 
 [Service]
 LimitNOFILE=1048576
@@ -19,4 +19,4 @@ ExecStart=/usr/bin/ceph-mon -f --cluster ${CLUSTER} --id %i --setuser ceph --set
 ExecReload=/bin/kill -HUP $MAINPID
 
 [Install]
-WantedBy=ceph.target
+WantedBy=ceph-mon.target
diff --git a/systemd/ceph-osd.target b/systemd/ceph-osd.target
new file mode 100644
index 0000000..ed55fc2
--- /dev/null
+++ b/systemd/ceph-osd.target
@@ -0,0 +1,5 @@
+[Unit]
+Description=ceph target allowing to start/stop all ceph-osd@.service instances at once
+PartOf=ceph.target
+[Install]
+WantedBy=multi-user.target ceph.target
diff --git a/systemd/ceph-osd@.service b/systemd/ceph-osd@.service
index 5a9314e..82dabdf 100644
--- a/systemd/ceph-osd@.service
+++ b/systemd/ceph-osd@.service
@@ -2,7 +2,7 @@
 Description=Ceph object storage daemon
 After=network-online.target local-fs.target
 Wants=network-online.target local-fs.target
-PartOf=ceph.target
+PartOf=ceph-osd.target
 
 [Service]
 LimitNOFILE=1048576
@@ -14,4 +14,4 @@ ExecStartPre=/usr/libexec/ceph/ceph-osd-prestart.sh --cluster ${CLUSTER} --id %i
 ExecReload=/bin/kill -HUP $MAINPID
 
 [Install]
-WantedBy=ceph.target
+WantedBy=ceph-osd.target
diff --git a/systemd/ceph-radosgw.target b/systemd/ceph-radosgw.target
new file mode 100644
index 0000000..959eb51
--- /dev/null
+++ b/systemd/ceph-radosgw.target
@@ -0,0 +1,5 @@
+[Unit]
+Description=ceph target allowing to start/stop all ceph-radosgw@.service instances at once
+PartOf=ceph.target
+[Install]
+WantedBy=multi-user.target ceph.target
diff --git a/systemd/ceph-radosgw@.service b/systemd/ceph-radosgw@.service
index fccd011..fb09e19 100644
--- a/systemd/ceph-radosgw@.service
+++ b/systemd/ceph-radosgw@.service
@@ -2,7 +2,7 @@
 Description=Ceph rados gateway
 After=network-online.target local-fs.target
 Wants=network-online.target local-fs.target
-PartOf=ceph.target
+PartOf=ceph-radosgw.target
 
 [Service]
 LimitNOFILE=1048576
@@ -12,4 +12,4 @@ Environment=CLUSTER=ceph
 ExecStart=/usr/bin/radosgw -f --cluster ${CLUSTER} --name client.%i --setuser ceph --setgroup ceph
 
 [Install]
-WantedBy=ceph.target
+WantedBy=ceph-radosgw.target

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-ceph/ceph.git



More information about the Pkg-ceph-commits mailing list