[Pkg-ceph-commits] [ceph] 02/05: Imported Upstream version 10.1.2

James Downing Page jamespage at moszumanska.debian.org
Thu Apr 14 09:10:53 UTC 2016


This is an automated email from the git hooks/post-receive script.

jamespage pushed a commit to branch ubuntu-xenial
in repository ceph.

commit e35cc1756373df95a7f0e30dbdff6fccdef24385
Author: James Page <james.page at ubuntu.com>
Date:   Thu Apr 14 10:04:52 2016 +0100

    Imported Upstream version 10.1.2
---
 AUTHORS                                    |   7 +-
 ChangeLog                                  | 124 +++++++++++-
 ceph.spec                                  |   2 +-
 configure                                  |  20 +-
 configure.ac                               |   2 +-
 doc/man/8/ceph-authtool.rst                |  40 +++-
 doc/man/8/radosgw-admin.rst                |   2 +-
 man/ceph-authtool.8                        |  46 ++++-
 man/ceph-clsinfo.8                         |   2 +-
 man/ceph-conf.8                            |   2 +-
 man/ceph-create-keys.8                     |   2 +-
 man/ceph-debugpack.8                       |   2 +-
 man/ceph-dencoder.8                        |   2 +-
 man/ceph-deploy.8                          |   2 +-
 man/ceph-detect-init.8                     |   2 +-
 man/ceph-disk.8                            |   2 +-
 man/ceph-fuse.8                            |   2 +-
 man/ceph-mds.8                             |   2 +-
 man/ceph-mon.8                             |   2 +-
 man/ceph-osd.8                             |   2 +-
 man/ceph-post-file.8                       |   2 +-
 man/ceph-rbdnamer.8                        |   2 +-
 man/ceph-rest-api.8                        |   2 +-
 man/ceph-run.8                             |   2 +-
 man/ceph-syn.8                             |   2 +-
 man/ceph.8                                 |   2 +-
 man/cephfs.8                               |   2 +-
 man/crushtool.8                            |   2 +-
 man/librados-config.8                      |   2 +-
 man/monmaptool.8                           |   2 +-
 man/mount.ceph.8                           |   2 +-
 man/osdmaptool.8                           |   2 +-
 man/rados.8                                |   2 +-
 man/radosgw-admin.8                        |   4 +-
 man/radosgw.8                              |   2 +-
 man/rbd-fuse.8                             |   2 +-
 man/rbd-mirror.8                           |   2 +-
 man/rbd-nbd.8                              |   2 +-
 man/rbd-replay-many.8                      |   2 +-
 man/rbd-replay-prep.8                      |   2 +-
 man/rbd-replay.8                           |   2 +-
 man/rbd.8                                  |   2 +-
 man/rbdmap.8                               |   2 +-
 src/.git_version                           |   4 +-
 src/Makefile-env.am                        |   4 +-
 src/Makefile.in                            |   8 +-
 src/ceph-disk/ceph_disk/main.py            |  18 +-
 src/ceph_fuse.cc                           |   7 +
 src/ceph_osd.cc                            |   3 +-
 src/client/Client.cc                       |  22 ++-
 src/client/Client.h                        |   2 +-
 src/cls/rgw/cls_rgw_ops.h                  |   6 +-
 src/common/Thread.cc                       |   5 +-
 src/common/buffer.cc                       |  10 +-
 src/common/ceph_time.h                     |   7 +-
 src/common/config_opts.h                   |  14 ++
 src/common/fs_types.cc                     |  16 +-
 src/crush/CrushWrapper.cc                  |   2 +-
 src/include/ceph_fs.h                      |   5 +
 src/include/fs_types.h                     |   2 +
 src/include/rados.h                        |   5 -
 src/journal/FutureImpl.cc                  |  14 +-
 src/journal/FutureImpl.h                   |   6 +-
 src/journal/JournalRecorder.cc             |   3 +-
 src/journal/JournalTrimmer.cc              |   3 +-
 src/journal/ObjectPlayer.cc                |  15 +-
 src/librados/IoCtxImpl.cc                  | 109 +++++++----
 src/librbd/ImageCtx.cc                     |   2 +-
 src/librbd/Journal.cc                      |  10 +-
 src/librbd/internal.cc                     |  16 +-
 src/logrotate.conf                         |   2 +-
 src/mds/Beacon.cc                          |   7 +
 src/mds/CInode.cc                          |  47 ++---
 src/mds/CInode.h                           |   9 +
 src/mds/FSMap.cc                           |  50 ++++-
 src/mds/FSMap.h                            |   9 +-
 src/mds/Locker.cc                          |   1 +
 src/mds/MDBalancer.cc                      |   2 +
 src/mds/MDCache.cc                         |   5 +
 src/mds/MDSDaemon.cc                       |  26 ++-
 src/mds/MDSMap.cc                          |  32 +++-
 src/mds/MDSMap.h                           |  30 ++-
 src/mds/MDSRank.cc                         |  48 +++--
 src/mds/events/ESessions.h                 |   2 +-
 src/mds/journal.cc                         |  19 ++
 src/messages/MMDSBeacon.h                  |   3 +-
 src/mon/MDSMonitor.cc                      | 101 +++++++++-
 src/mon/MonCommands.h                      |  11 +-
 src/mon/OSDMonitor.cc                      |  13 +-
 src/os/ObjectStore.cc                      |  11 +-
 src/os/ObjectStore.h                       |  16 +-
 src/os/bluestore/BlueFS.cc                 | 244 +++++++++++++++++-------
 src/os/bluestore/BlueFS.h                  |  59 ++++--
 src/os/bluestore/BlueStore.cc              |  63 ++++---
 src/os/bluestore/BlueStore.h               |   4 +-
 src/os/filestore/FileStore.cc              |  53 +++++-
 src/os/filestore/FileStore.h               |   8 +-
 src/os/filestore/IndexManager.cc           |   5 +-
 src/os/filestore/LFNIndex.cc               | 211 +++++++++++----------
 src/os/filestore/LFNIndex.h                |  25 ++-
 src/os/filestore/chain_xattr.cc            | 110 ++++-------
 src/os/filestore/chain_xattr.h             |  99 +++++++++-
 src/os/kstore/KStore.h                     |   5 +-
 src/os/memstore/MemStore.h                 |   4 +-
 src/osd/OSD.cc                             |  24 +--
 src/osd/OSD.h                              |   8 +-
 src/osd/PG.cc                              |  19 +-
 src/osd/ReplicatedPG.cc                    |  70 +++++--
 src/osdc/Objecter.cc                       |  26 +--
 src/osdc/Objecter.h                        | 290 ++++++++++++++++++++---------
 src/pybind/ceph_rest_api.py                |   9 -
 src/rgw/librgw.cc                          |  15 +-
 src/rgw/rgw_admin.cc                       |  17 +-
 src/rgw/rgw_common.h                       |   1 +
 src/rgw/rgw_ldap.h                         |   2 +
 src/rgw/rgw_op.cc                          |   6 +-
 src/rgw/rgw_op.h                           |   2 +
 src/rgw/rgw_rest.cc                        |   5 +
 src/rgw/rgw_rest_conn.cc                   |   2 +-
 src/rgw/rgw_rest_s3.cc                     |  18 +-
 src/rgw/rgw_sync.cc                        |  25 ++-
 src/test/cli/ceph-authtool/help.t          |  11 +-
 src/test/cli/ceph-authtool/manpage.t       |  11 +-
 src/test/cli/ceph-authtool/simple.t        |  11 +-
 src/test/encoding/check-generated.sh       |   2 +-
 src/test/encoding/types.h                  |   4 +
 src/test/journal/test_FutureImpl.cc        |  43 +++--
 src/test/journal/test_ObjectRecorder.cc    |  55 ++----
 src/test/librados/misc.cc                  | 108 ++++++++++-
 src/test/librbd/test_mock_Journal.cc       |   3 +
 src/test/librgw_file_nfsns.cc              |  15 ++
 src/test/objectstore/chain_xattr.cc        | 114 ++++++++++++
 src/test/objectstore/test_bluefs.cc        |  20 +-
 src/test/os/TestLFNIndex.cc                |   2 +-
 src/test/pybind/test_ceph_argparse.py      |   5 +
 src/test/rbd_mirror/test_ClusterWatcher.cc |   6 +-
 src/test/rbd_mirror/test_ImageReplayer.cc  |  11 +-
 src/test/rbd_mirror/test_ImageSync.cc      |   5 +
 src/test/rbd_mirror/test_PoolWatcher.cc    |   6 +-
 src/test/rbd_mirror/test_fixture.cc        |   6 +-
 src/tools/Makefile-client.am               |   1 +
 src/tools/ceph_authtool.cc                 |  15 +-
 src/tools/rbd/action/Journal.cc            |  42 ++---
 src/tools/rbd_mirror/Replayer.cc           |  56 ++++--
 src/vstart.sh                              |   2 +-
 systemd/ceph-mds@.service                  |   1 +
 systemd/ceph-mon@.service                  |   1 +
 systemd/ceph-osd@.service                  |   1 +
 systemd/ceph-radosgw@.service              |   1 +
 systemd/ceph-rbd-mirror@.service           |   1 +
 150 files changed, 2165 insertions(+), 900 deletions(-)

diff --git a/AUTHORS b/AUTHORS
index f0a01f6..eba7075 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -2,6 +2,7 @@ Aaron Bassett <abassett at gmail.com>
 Abhishek Dixit <dixitabhi at gmail.com>
 Abhishek Lekshmanan <abhishek.lekshmanan at ril.com>
 Abhishek Lekshmanan <abhishek at suse.com>
+Abhishek Lekshmanan <alekshmanan at suse.com>
 Abhishek Varshney <abhishek.varshney at flipkart.com>
 Accela Zhao <accelazh at gmail.com>
 Adam C. Emerson <aemerson at linuxbox.com>
@@ -228,7 +229,7 @@ Jean-Rémi Deveaux <jeanremi.deveaux at gmail.com>
 Jeff Epstein <jepst79 at gmail.com>
 Jeffrey Lu <lzhng2000 at aliyun.com>
 Jeff Weber <jweber at cofront.net>
-Jenkins Build Slave User <jenkins-build at trusty-small-unique--5c6e9c4e-81af-43d3-957d-c650c692c441.localdomain>
+Jenkins Build Slave User <jenkins-build at trusty-small-unique--a7f82f5f-8832-433e-a632-928924f47e04.localdomain>
 Jenkins <jenkins at ceph.com>
 Jens-Christian Fischer <jens-christian.fischer at switch.ch>
 Jeremy Qian <vanpire110 at 163.com>
@@ -289,6 +290,7 @@ Kevin Jones <k.j.jonez at gmail.com>
 Kim Vandry <vandry at TZoNE.ORG>
 Kiseleva Alyona <akiselyova at mirantis.com>
 Kongming Wu <wu.kongming at h3c.com>
+Kris Jurka <kjurka at locatortechnologies.com>
 Krzysztof Kosiński <krzysztof.kosinski at intel.com>
 Kuan Kai Chiu <big.chiu at bigtera.com>
 Kun Huang <academicgareth at gmail.com>
@@ -417,6 +419,7 @@ Ross Turk <rturk at redhat.com>
 Ruben Kerkhof <ruben at rubenkerkhof.com>
 Ruifeng Yang <yangruifeng.09209 at h3c.com>
 runsisi <runsisi at hust.edu.cn>
+runsisi <runsisi at zte.com.cn>
 Rust Shen <rustinpeace at 163.com>
 Rutger ter Borg <rutger at terborg.net>
 Sage Weil <sage at inktank.com>
@@ -430,7 +433,6 @@ Sandon Van Ness <sandon at inktank.com>
 Sandon Van Ness <svanness at redhat.com>
 Sangdi Xu <xu.sangdi at h3c.com>
 Sarthak Munshi <sarthakmunshi at gmail.com>
-scienceluo <luo.kexue at zte.com.cn>
 Scott A. Brandt <scott at cs.ucsc.edu>
 Scott Devoid <devoid at anl.gov>
 Sean Channel <pentabular at gmail.com>
@@ -516,6 +518,7 @@ Weijun Duan <duanweijun at h3c.com>
 Wei Luo <luowei at yahoo-inc.com>
 Wei Luo <weilluo at tencent.com>
 Wei Qian <weiq at dtdream.com>
+weiqiaomiao <wei.qiaomiao at zte.com.cn>
 Wenjun Huang <wenjunhuang at tencent.com>
 Wesley Spikes <wesley.spikes at dreamhost.com>
 Wido den Hollander <wido at 42on.com>
diff --git a/ChangeLog b/ChangeLog
index 4c75c68..55473c6 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,107 @@
-ce50389 (HEAD, tag: v10.1.1, origin/jewel) 10.1.1
+4a2a6f7 (HEAD, tag: v10.1.2, origin/jewel) 10.1.2
+8b98556 PG: set epoch_created and parent_split_bits for child pg
+bd1c548 test: fix ut test failure caused by lfn change
+45219e0 Fixed ceph-common install.
+fd2f455 mds: take standby_for_fscid into account in FSMap::find_unused()
+b6d8c32 librbd: Fixed bug in disabling non-primary image mirroring
+14a66f6 ceph-disk: fix PrepareData.set_type() args should be class member.
+4c203b3 ceph-disk: fix spelling mistake geattr to getattr.
+5b098ca ceph-disk: fix lockbox set_or_create_partition()
+b7708da radosgw-admin: fix name checking
+770846b radosgw-admin: allow setting zone when there is no realm
+49886d5 check-generated.sh: can't source bash from sh
+8bc8085 tests: add Ubuntu 16.04 xenial dockerfile
+0e4a92e crush: fix typo
+0a622e6 doc: rgw admin uses "region list" not "regions list"
+3c77292 journal: fix final result for JournalTrimmer::C_RemoveSet
+3b54d5d test/rados/misc.cc: add long locator key and namespace tests
+cb03d4d LFNIndex: use chain_getxattr_buf
+c7db303 chain_xattr: add chain_getxattr_buf
+755c685 LFNIndex::lfn_parse_object_name: return int rather than bool
+e4916f6 LFNIndex::list_objects: lfn_translate does not set errno
+25f937e FileStore::set_xattr_limits_via_conf: add warning if max xattr size smaller than max name
+18b9f95 test/objectstore/chain_xattr.cc: add test for ensure_single_attr
+73778f1 LFNIndex: ensure that lfn and directory attrs are written atomically
+ac750ce chain_[f]getxattr: always use size, no reaon to consider CHAIN_XATTR_MAX_BLOCK_LEN
+8dc0330 chain_xattr: s/onechunk/skip_chain_cleanup, add ensure_single_attr
+21487fd os/,osd/: restructure the rados name length check
+8770043 ceph-disk: fix set_data_partition() when data is partition.
+a330078 rgw-admin: fix period delete error message
+3320f8f rgw-admin: remove unused iterator
+64a8a6a rbd-mirror: fixed bug that caused infinite loop when disabling image mirroring
+a651598 mailmap: Luo Kexue name normalization
+c36c5d4 mailmap: Ning Yao affiliation
+eb536bb mailmap: Vitja Makarov affiliation
+323276e release-notes: v10.1.1 release notes
+18d06ef release-notes: v10.1.1 release notes (draft)
+1be8dba mailmap: sunspot affiliation
+c256a9c mailmap: Qinghua Jin affiliation
+c312b52 mailmap: Richard W.M. Jones affiliation
+b441722 mailmap: Ira Cooper affiliation
+f4f9d91 mailmap: Star Guo affiliation
+ceb2a72 mailmap: Xiaoxi Chen affiliation
+885db95 mailmap: Jenkins name normalization
+fb63721 releases: firefly was EOL december 2015
+fc29dc2 releases: update the v10.1.0 release link
+d82c497 doc: add a page for CephFS' experimental features describing lockout rules
+6974ed4 doc: update cephfs administration page for enable_multiple safety check
+4b3802e mdsmonitor: unify experimental warnings
+ea1b276 test: update tests for new flag requirements
+4bc4e74 MDSMap: lock out multimds clusters and directory fragmentation by default
+15d1fd4 MDSMap: switch from booleans to flags for feature enablement logging
+cda1c1a FSMap: add output for enabled_multiple flags
+3130132 MDSMonitor: make enabling multiple FSes a lot scarier
+3b9371a FSMap: print out compat instead of erroneously repeating enable_multiple
+2429463 ceph-dencoder: add FSMap
+15cabdc rgw_ldap: make ldap.h inclusion conditional
+d58e5fe rgw: fix problem deleting objects begining with double underscore
+492a572 test: fix memory leaks in rbd-mirror test cases
+acfc2b1 test: avoid leaking librados connections when creating pools
+88e244c rbd: journal reset should disable/re-enable journaling feature
+2fa4147 osd/ReplicatedPG: clean up temp object if copy-from fails
+a0bb575 ceph_test_rados_api_misc: make CopyFrom omap test be a big object
+134416a Revert "rados: Add new field flags for ceph_osd_op.copy_get."
+017d830 rbd-mirror: workaround for intermingled lockdep singletons
+98744fd logrotate.conf: poke ceph-fuse after log rotation
+91e0be0 ceph-fuse: reopen log file on SIGHUP
+60679fc librbd: restore out-of-band future callbacks to avoid lock cycles
+e98d046 MDS: unregister command add in clean_up_admin_socket
+2f4bc84 rgw: aws4 subdomain calling bugfix
+dab0b55 rgw: the map 'headers' is assigned a wrong value
+f01261f authtool: fix test output validation & minor whitespace.
+408964e journal: fix context memory leak when shutting down live replay
+f931066 journal: Future does not require metadata shared pointer
+d3dbd85 mon: warn if 'sortbitwise' flag is not set and no legacy OSDs are present
+d544e44 mds: validate file layouts during replay
+9414bef debian/rules: include ceph-mds-*.conf upstart files in ceph-mds
+45a0bc1 mds: add operator<< for file_layout_t
+693f46a mds: skip size recovery on non-file inodes
+0f09642 mds: tidy up backtrace pool handling
+4ddcf41 mds: health metric for being read only
+79b19a6 osd: cancel scrub if noscrub is set for pool or all
+4d3aef7 osd: reset tp handle when search for boundary of chunky-scrub
+25d8007 os/bluestore: use short, relative paths with bluefs
+ce50389 (tag: v10.1.1) 10.1.1
+6c0ab75 librbd: disallow unsafe rbd_op_threads values
+bb4c2ca librgw/rgw_file:  correctly handle object permissions
+6851822 rgw_file: print DIRS1 read parameters at verbose
+d84f55f rgw_file:  fix attributes for "special" test cases
+1bd1ffd rgw_file unit tests: validate Unix owners in DIRS1
+8e2c804 authtool: update --help and manpage to match code.
+dffd867 build: Respect TMPDIR for virtualenv.
+048251b common/fs_types: dump pool_id signed
+cd41ca2 mds: fix legacy layout decode with pool 0
+b8e0458 rgw: retry read_log_info() while master is down
+05cafcf Drop any systemd imposed process/thread limits
 02ab8a2 mrun: update path to cmake binaries
+67f8f1f os/bluestore/BlueFS: add some perfcounters
+75ddd73 os/bluestore/BlueFS: revamp bdev ids
+a5564a6 os/ObjectStore: make device uuid probe output something friendly
+4f6523d rgw: aws4 uri encoding bugfix
+bc9607b mon/OSDMonitor: fix off-by-one for osd_map_message_max
+81cc288 osd: improve full map requests
+2e22f54 osd: create rerequest_full_maps() helper
+961a46f client: fix pool permisson check
 d248128 config: fix setuser_match_path typo
 d5ec33f tests: Removing one ceph-dencoder call in check-generated.sh
 4af1aa6 tests: Fixing python statement in ceph_objectstore_tool.py
@@ -18,10 +120,24 @@ d66c852 tests: Adding parallelism for sequential ceph-dencoder calls
 8b6be11 tests: Adding parallelism to encoding/readable.sh
 db31cc6 tests: Adding parallelism helpers in ceph-helpers.sh
 93ace63 cmake: fix the build of test_rados_api_list
+524f8e6 mds: unregister newly added commands correctly
+da27c33 mds: avoid key renew storm on clock skew
+7dae094 mds: drop message reference on error exit
+1f54f73 mds: fix message leak during handle_core_message()
 b7a5f8b test: TestMirroringWatcher test cases were not closing images
+ae39517 rgw: fix a typo in error message
 8231208 global/global_init: expand metavariables in setuser_match_path
+1a6c686 mds: Add cmapv to ESessions default constructor initializer list
 dd167cf crush: fix error log
+3562323 librados: use Objecter::prepare_*_op helpers to set c->tid safely
+cd167c0 osdc/Objecter: create more prepare_foo_op() helpers
 f47e06b tests: Fixing broken test/cephtool-test-mon.sh test
+8f9e7b0 rgw: Do not try to encode or decode time_t
+778506e time: Change temporary variable types in time decode
+4a88a7f librados: fix narrow race with tid return value assignment
+b7eb86f osdc/Objecter: fix narrow race with tid assignment
+1de73d7 qa/workunits/rest/test.py: fs flag set enable_multiple true
+42e692a ceph-rest-api: do not include single-option CephChoices in prefix
 9565a50 set 128MB tcmalloc cache size by bytes
 ff9843b Striper: reduce assemble_result log level
 f812199 qa/workunits/rbd: qemu tests need to wait for image to be created
@@ -36,6 +152,7 @@ a92fa83 osdmap: rm nonused variable
 c432691 os/ObjectStore: add noexcept to ensure move ctor is used
 1c2831a common/Cycles: Do not initialize Cycles globally.
 ec79b64 unittest_erasure_code_plugin: fix deadlock caused by locked mutex in cancelled thread
+554d1b4 ReplicatedPG::_rollback_to: update the OMAP flag
 aedc529 test: Fix test to run with btrfs which has snap_### dirs
 3dd5249 librbd: avoid throwing error if mirroring is unsupported
 280b8a1 rgw: add exclusive flag to set_as_default()
@@ -44,7 +161,9 @@ aedc529 test: Fix test to run with btrfs which has snap_### dirs
 c4efef5 rgw: add a few missing cmdline switches in help
 09b5356 cls_journal: fix -EEXIST checking
 2c0f03a rgw_admin: remove unused parent_period arg
+85229f6 mds: fix potential null pointer access
 a29b96a debian/rules: put init-ceph in /etc/init.d/ceph, not ceph-base
+09c4195 MDSMonitor: introduce command 'fs set_default <fs_name>'
 602425a configure: Add -D_LARGEFILE64_SOURCE to Linux build.
 639f158 mon: remove unnecessary comment for update_from_paxos
 f5ef4d4 cmake: add missing librbd/MirrorWatcher.cc and librd/ObjectWatcher.cc
@@ -161,6 +280,7 @@ f3ebe46 rbd: rbd-mirroring: Updated rbd mirroring unit tests to reflect the new
 bc254c8 rbd: rbd-mirroring: Enabling image mirroring depends on pool mirroring mode
 668c8f9 script: subscription-manager support (part 3)
 97b74bd osd/ClassHandler: only dlclose() the classes not missing
+dad3b84 client: pass 'newly issued caps' to Client::check_cap_issue()
 349c81f ceph_test_rados_api_pool: fix command for readonly cache-mode
 ad2e6f4 ceph.in: update for cmake path changes
 5da6ae8 vstart: update for cmake build path changes
@@ -592,6 +712,8 @@ a0a8dcc rgw: free components on shutdown
 bdcff15 mds: fix FSMap upgrade on mixed mon versions
 38fd3f1 rgw: LDAP pass-through authentication
 e52f7b4 mds: fix FSMap upgrade with daemons in the map
+7b33156 common: thread: allow set_affinity() to return a error code
+e10c6e4 common: buffer: put a guard for stat() syscall during read_file
 1ea1735 osd: fix wrong counter for batch objects removal during remove_dir()
 12d151f osd: initialize last_recalibrate field at construction
 f1a4490 ceph.spec.in: disable lttng and babeltrace explicitly
diff --git a/ceph.spec b/ceph.spec
index 5190979..265a7c9 100644
--- a/ceph.spec
+++ b/ceph.spec
@@ -74,7 +74,7 @@ restorecon -R /var/log/radosgw > /dev/null 2>&1;
 # common
 #################################################################################
 Name:		ceph
-Version:	10.1.1
+Version:	10.1.2
 Release:	0%{?dist}
 Epoch:		1
 Summary:	User space components of the Ceph file system
diff --git a/configure b/configure
index 01072fd..1a373da 100755
--- a/configure
+++ b/configure
@@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.69 for ceph 10.1.1.
+# Generated by GNU Autoconf 2.69 for ceph 10.1.2.
 #
 # Report bugs to <ceph-devel at vger.kernel.org>.
 #
@@ -590,8 +590,8 @@ MAKEFLAGS=
 # Identity of this package.
 PACKAGE_NAME='ceph'
 PACKAGE_TARNAME='ceph'
-PACKAGE_VERSION='10.1.1'
-PACKAGE_STRING='ceph 10.1.1'
+PACKAGE_VERSION='10.1.2'
+PACKAGE_STRING='ceph 10.1.2'
 PACKAGE_BUGREPORT='ceph-devel at vger.kernel.org'
 PACKAGE_URL=''
 
@@ -1582,7 +1582,7 @@ if test "$ac_init_help" = "long"; then
   # Omit some internal or obsolete options to make the list less imposing.
   # This message is too long to be a string in the A/UX 3.1 sh.
   cat <<_ACEOF
-\`configure' configures ceph 10.1.1 to adapt to many kinds of systems.
+\`configure' configures ceph 10.1.2 to adapt to many kinds of systems.
 
 Usage: $0 [OPTION]... [VAR=VALUE]...
 
@@ -1653,7 +1653,7 @@ fi
 
 if test -n "$ac_init_help"; then
   case $ac_init_help in
-     short | recursive ) echo "Configuration of ceph 10.1.1:";;
+     short | recursive ) echo "Configuration of ceph 10.1.2:";;
    esac
   cat <<\_ACEOF
 
@@ -1837,7 +1837,7 @@ fi
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
   cat <<\_ACEOF
-ceph configure 10.1.1
+ceph configure 10.1.2
 generated by GNU Autoconf 2.69
 
 Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2913,7 +2913,7 @@ cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.
 
-It was created by ceph $as_me 10.1.1, which was
+It was created by ceph $as_me 10.1.2, which was
 generated by GNU Autoconf 2.69.  Invocation command line was
 
   $ $0 $@
@@ -16408,7 +16408,7 @@ fi
 
 # Define the identity of the package.
  PACKAGE='ceph'
- VERSION='10.1.1'
+ VERSION='10.1.2'
 
 
 cat >>confdefs.h <<_ACEOF
@@ -26100,7 +26100,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by ceph $as_me 10.1.1, which was
+This file was extended by ceph $as_me 10.1.2, which was
 generated by GNU Autoconf 2.69.  Invocation command line was
 
   CONFIG_FILES    = $CONFIG_FILES
@@ -26166,7 +26166,7 @@ _ACEOF
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
 ac_cs_version="\\
-ceph config.status 10.1.1
+ceph config.status 10.1.2
 configured by $0, generated by GNU Autoconf 2.69,
   with options \\"\$ac_cs_config\\"
 
diff --git a/configure.ac b/configure.ac
index efd760a..0d6427f 100644
--- a/configure.ac
+++ b/configure.ac
@@ -8,7 +8,7 @@ AC_PREREQ(2.59)
 # VERSION define is not used by the code.  It gets a version string
 # from 'git describe'; see src/ceph_ver.[ch]
 
-AC_INIT([ceph], [10.1.1], [ceph-devel at vger.kernel.org])
+AC_INIT([ceph], [10.1.2], [ceph-devel at vger.kernel.org])
 
 AX_CXX_COMPILE_STDCXX_11(, mandatory)
 
diff --git a/doc/man/8/ceph-authtool.rst b/doc/man/8/ceph-authtool.rst
index 8565eba..0187d89 100644
--- a/doc/man/8/ceph-authtool.rst
+++ b/doc/man/8/ceph-authtool.rst
@@ -9,9 +9,18 @@
 Synopsis
 ========
 
-| **ceph-authtool** *keyringfile* [ -l | --list ] [ -C | --create-keyring
-  ] [ -p | --print ] [ -n | --name *entityname* ] [ --gen-key ] [ -a |
-  --add-key *base64_key* ] [ --caps *capfile* ]
+| **ceph-authtool** *keyringfile*
+  [ -l | --list ]
+  [ -p | --print ]
+  [ -C | --create-keyring ]
+  [ -g | --gen-key ]
+  [ --gen-print-key ]
+  [ --import-keyring *otherkeyringfile* ]
+  [ -n | --name *entityname* ]
+  [ -u | --set-uid *auid* ]
+  [ -a | --add-key *base64_key* ]
+  [ --cap *subsystem* *capability* ]
+  [ --caps *capfile* ]
 
 
 Description
@@ -45,19 +54,36 @@ Options
 
    will create a new keyring, overwriting any existing keyringfile
 
-.. option:: --gen-key
+.. option:: -g, --gen-key
 
    will generate a new secret key for the specified entityname
 
-.. option:: --add-key
+.. option:: --gen-print-key
+
+   will generate a new secret key for the specified entityname,
+   without altering the keyringfile, printing the secret to stdout
+
+.. option:: --import-keyring *secondkeyringfile*
+
+   will import the content of a given keyring to the keyringfile
+
+.. option:: -n, --name *name*
+
+   specify entityname to operate on
+
+.. option:: -u, --set-uid *auid*
+
+   sets the auid (authenticated user id) for the specified entityname
+
+.. option:: -a, --add-key *base64_key*
 
    will add an encoded key to the keyring
 
-.. option:: --cap subsystem capability
+.. option:: --cap *subsystem* *capability*
 
    will set the capability for given subsystem
 
-.. option:: --caps capsfile
+.. option:: --caps *capsfile*
 
    will set all of capabilities associated with a given key, for all subsystems
 
diff --git a/doc/man/8/radosgw-admin.rst b/doc/man/8/radosgw-admin.rst
index 54d690e..b4d75ff 100644
--- a/doc/man/8/radosgw-admin.rst
+++ b/doc/man/8/radosgw-admin.rst
@@ -107,7 +107,7 @@ which are as follows:
 :command:`region get`
   Show region info.
 
-:command:`regions list`
+:command:`region list`
   List all regions set on this cluster.
 
 :command:`region set`
diff --git a/man/ceph-authtool.8 b/man/ceph-authtool.8
index 075efe3..44c7610 100644
--- a/man/ceph-authtool.8
+++ b/man/ceph-authtool.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CEPH-AUTHTOOL" "8" "April 06, 2016" "dev" "Ceph"
+.TH "CEPH-AUTHTOOL" "8" "April 12, 2016" "dev" "Ceph"
 .SH NAME
 ceph-authtool \- ceph keyring manipulation tool
 .
@@ -32,9 +32,18 @@ level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
 ..
 .SH SYNOPSIS
 .nf
-\fBceph\-authtool\fP \fIkeyringfile\fP [ \-l | \-\-list ] [ \-C | \-\-create\-keyring
-] [ \-p | \-\-print ] [ \-n | \-\-name \fIentityname\fP ] [ \-\-gen\-key ] [ \-a |
-\-\-add\-key \fIbase64_key\fP ] [ \-\-caps \fIcapfile\fP ]
+\fBceph\-authtool\fP \fIkeyringfile\fP
+[ \-l | \-\-list ]
+[ \-p | \-\-print ]
+[ \-C | \-\-create\-keyring ]
+[ \-g | \-\-gen\-key ]
+[ \-\-gen\-print\-key ]
+[ \-\-import\-keyring \fIotherkeyringfile\fP ]
+[ \-n | \-\-name \fIentityname\fP ]
+[ \-u | \-\-set\-uid \fIauid\fP ]
+[ \-a | \-\-add\-key \fIbase64_key\fP ]
+[ \-\-cap \fIsubsystem\fP \fIcapability\fP ]
+[ \-\-caps \fIcapfile\fP ]
 .fi
 .sp
 .SH DESCRIPTION
@@ -69,22 +78,43 @@ will create a new keyring, overwriting any existing keyringfile
 .UNINDENT
 .INDENT 0.0
 .TP
-.B \-\-gen\-key
+.B \-g, \-\-gen\-key
 will generate a new secret key for the specified entityname
 .UNINDENT
 .INDENT 0.0
 .TP
-.B \-\-add\-key
+.B \-\-gen\-print\-key
+will generate a new secret key for the specified entityname,
+without altering the keyringfile, printing the secret to stdout
+.UNINDENT
+.INDENT 0.0
+.TP
+.B \-\-import\-keyring *secondkeyringfile*
+will import the content of a given keyring to the keyringfile
+.UNINDENT
+.INDENT 0.0
+.TP
+.B \-n, \-\-name *name*
+specify entityname to operate on
+.UNINDENT
+.INDENT 0.0
+.TP
+.B \-u, \-\-set\-uid *auid*
+sets the auid (authenticated user id) for the specified entityname
+.UNINDENT
+.INDENT 0.0
+.TP
+.B \-a, \-\-add\-key *base64_key*
 will add an encoded key to the keyring
 .UNINDENT
 .INDENT 0.0
 .TP
-.B \-\-cap subsystem capability
+.B \-\-cap *subsystem* *capability*
 will set the capability for given subsystem
 .UNINDENT
 .INDENT 0.0
 .TP
-.B \-\-caps capsfile
+.B \-\-caps *capsfile*
 will set all of capabilities associated with a given key, for all subsystems
 .UNINDENT
 .SH CAPABILITIES
diff --git a/man/ceph-clsinfo.8 b/man/ceph-clsinfo.8
index d1974b0..62dfccb 100644
--- a/man/ceph-clsinfo.8
+++ b/man/ceph-clsinfo.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CEPH-CLSINFO" "8" "April 06, 2016" "dev" "Ceph"
+.TH "CEPH-CLSINFO" "8" "April 12, 2016" "dev" "Ceph"
 .SH NAME
 ceph-clsinfo \- show class object information
 .
diff --git a/man/ceph-conf.8 b/man/ceph-conf.8
index 73ea29c..ec78d67 100644
--- a/man/ceph-conf.8
+++ b/man/ceph-conf.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CEPH-CONF" "8" "April 06, 2016" "dev" "Ceph"
+.TH "CEPH-CONF" "8" "April 12, 2016" "dev" "Ceph"
 .SH NAME
 ceph-conf \- ceph conf file tool
 .
diff --git a/man/ceph-create-keys.8 b/man/ceph-create-keys.8
index d9c5ea4..04d8248 100644
--- a/man/ceph-create-keys.8
+++ b/man/ceph-create-keys.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CEPH-CREATE-KEYS" "8" "April 06, 2016" "dev" "Ceph"
+.TH "CEPH-CREATE-KEYS" "8" "April 12, 2016" "dev" "Ceph"
 .SH NAME
 ceph-create-keys \- ceph keyring generate tool
 .
diff --git a/man/ceph-debugpack.8 b/man/ceph-debugpack.8
index f6efc37..b130dc6 100644
--- a/man/ceph-debugpack.8
+++ b/man/ceph-debugpack.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CEPH-DEBUGPACK" "8" "April 06, 2016" "dev" "Ceph"
+.TH "CEPH-DEBUGPACK" "8" "April 12, 2016" "dev" "Ceph"
 .SH NAME
 ceph-debugpack \- ceph debug packer utility
 .
diff --git a/man/ceph-dencoder.8 b/man/ceph-dencoder.8
index afe930d..3637d95 100644
--- a/man/ceph-dencoder.8
+++ b/man/ceph-dencoder.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CEPH-DENCODER" "8" "April 06, 2016" "dev" "Ceph"
+.TH "CEPH-DENCODER" "8" "April 12, 2016" "dev" "Ceph"
 .SH NAME
 ceph-dencoder \- ceph encoder/decoder utility
 .
diff --git a/man/ceph-deploy.8 b/man/ceph-deploy.8
index 0ca7600..e3d5d4a 100644
--- a/man/ceph-deploy.8
+++ b/man/ceph-deploy.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CEPH-DEPLOY" "8" "April 06, 2016" "dev" "Ceph"
+.TH "CEPH-DEPLOY" "8" "April 12, 2016" "dev" "Ceph"
 .SH NAME
 ceph-deploy \- Ceph deployment tool
 .
diff --git a/man/ceph-detect-init.8 b/man/ceph-detect-init.8
index 21a65f4..ccd3e6c 100644
--- a/man/ceph-detect-init.8
+++ b/man/ceph-detect-init.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CEPH-DETECT-INIT" "8" "April 06, 2016" "dev" "Ceph"
+.TH "CEPH-DETECT-INIT" "8" "April 12, 2016" "dev" "Ceph"
 .SH NAME
 ceph-detect-init \- display the init system Ceph should use
 .
diff --git a/man/ceph-disk.8 b/man/ceph-disk.8
index fa4728e..c59faf9 100644
--- a/man/ceph-disk.8
+++ b/man/ceph-disk.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CEPH-DISK" "8" "April 06, 2016" "dev" "Ceph"
+.TH "CEPH-DISK" "8" "April 12, 2016" "dev" "Ceph"
 .SH NAME
 ceph-disk \- Ceph disk utility for OSD
 .
diff --git a/man/ceph-fuse.8 b/man/ceph-fuse.8
index 0a03a44..3cea297 100644
--- a/man/ceph-fuse.8
+++ b/man/ceph-fuse.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CEPH-FUSE" "8" "April 06, 2016" "dev" "Ceph"
+.TH "CEPH-FUSE" "8" "April 12, 2016" "dev" "Ceph"
 .SH NAME
 ceph-fuse \- FUSE-based client for ceph
 .
diff --git a/man/ceph-mds.8 b/man/ceph-mds.8
index ac0cc19..e77e3c2 100644
--- a/man/ceph-mds.8
+++ b/man/ceph-mds.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CEPH-MDS" "8" "April 06, 2016" "dev" "Ceph"
+.TH "CEPH-MDS" "8" "April 12, 2016" "dev" "Ceph"
 .SH NAME
 ceph-mds \- ceph metadata server daemon
 .
diff --git a/man/ceph-mon.8 b/man/ceph-mon.8
index d2a8707..a8c744f 100644
--- a/man/ceph-mon.8
+++ b/man/ceph-mon.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CEPH-MON" "8" "April 06, 2016" "dev" "Ceph"
+.TH "CEPH-MON" "8" "April 12, 2016" "dev" "Ceph"
 .SH NAME
 ceph-mon \- ceph monitor daemon
 .
diff --git a/man/ceph-osd.8 b/man/ceph-osd.8
index a7c6345..94dd69c 100644
--- a/man/ceph-osd.8
+++ b/man/ceph-osd.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CEPH-OSD" "8" "April 06, 2016" "dev" "Ceph"
+.TH "CEPH-OSD" "8" "April 12, 2016" "dev" "Ceph"
 .SH NAME
 ceph-osd \- ceph object storage daemon
 .
diff --git a/man/ceph-post-file.8 b/man/ceph-post-file.8
index b49d538..a762e4e 100644
--- a/man/ceph-post-file.8
+++ b/man/ceph-post-file.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CEPH-POST-FILE" "8" "April 06, 2016" "dev" "Ceph"
+.TH "CEPH-POST-FILE" "8" "April 12, 2016" "dev" "Ceph"
 .SH NAME
 ceph-post-file \- post files for ceph developers
 .
diff --git a/man/ceph-rbdnamer.8 b/man/ceph-rbdnamer.8
index 28278a7..64749e8 100644
--- a/man/ceph-rbdnamer.8
+++ b/man/ceph-rbdnamer.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CEPH-RBDNAMER" "8" "April 06, 2016" "dev" "Ceph"
+.TH "CEPH-RBDNAMER" "8" "April 12, 2016" "dev" "Ceph"
 .SH NAME
 ceph-rbdnamer \- udev helper to name RBD devices
 .
diff --git a/man/ceph-rest-api.8 b/man/ceph-rest-api.8
index 6d42a39..62ea726 100644
--- a/man/ceph-rest-api.8
+++ b/man/ceph-rest-api.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CEPH-REST-API" "8" "April 06, 2016" "dev" "Ceph"
+.TH "CEPH-REST-API" "8" "April 12, 2016" "dev" "Ceph"
 .SH NAME
 ceph-rest-api \- ceph RESTlike administration server
 .
diff --git a/man/ceph-run.8 b/man/ceph-run.8
index f1f9743..1c78647 100644
--- a/man/ceph-run.8
+++ b/man/ceph-run.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CEPH-RUN" "8" "April 06, 2016" "dev" "Ceph"
+.TH "CEPH-RUN" "8" "April 12, 2016" "dev" "Ceph"
 .SH NAME
 ceph-run \- restart daemon on core dump
 .
diff --git a/man/ceph-syn.8 b/man/ceph-syn.8
index 05f5a91..0fc4b59 100644
--- a/man/ceph-syn.8
+++ b/man/ceph-syn.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CEPH-SYN" "8" "April 06, 2016" "dev" "Ceph"
+.TH "CEPH-SYN" "8" "April 12, 2016" "dev" "Ceph"
 .SH NAME
 ceph-syn \- ceph synthetic workload generator
 .
diff --git a/man/ceph.8 b/man/ceph.8
index 9ee403b..1a33b1b 100644
--- a/man/ceph.8
+++ b/man/ceph.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CEPH" "8" "April 06, 2016" "dev" "Ceph"
+.TH "CEPH" "8" "April 12, 2016" "dev" "Ceph"
 .SH NAME
 ceph \- ceph administration tool
 .
diff --git a/man/cephfs.8 b/man/cephfs.8
index 2772697..8e234ad 100644
--- a/man/cephfs.8
+++ b/man/cephfs.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CEPHFS" "8" "April 06, 2016" "dev" "Ceph"
+.TH "CEPHFS" "8" "April 12, 2016" "dev" "Ceph"
 .SH NAME
 cephfs \- ceph file system options utility
 .
diff --git a/man/crushtool.8 b/man/crushtool.8
index c7ebbd7..4859e14 100644
--- a/man/crushtool.8
+++ b/man/crushtool.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CRUSHTOOL" "8" "April 06, 2016" "dev" "Ceph"
+.TH "CRUSHTOOL" "8" "April 12, 2016" "dev" "Ceph"
 .SH NAME
 crushtool \- CRUSH map manipulation tool
 .
diff --git a/man/librados-config.8 b/man/librados-config.8
index d6567e5..fc0d539 100644
--- a/man/librados-config.8
+++ b/man/librados-config.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "LIBRADOS-CONFIG" "8" "April 06, 2016" "dev" "Ceph"
+.TH "LIBRADOS-CONFIG" "8" "April 12, 2016" "dev" "Ceph"
 .SH NAME
 librados-config \- display information about librados
 .
diff --git a/man/monmaptool.8 b/man/monmaptool.8
index ac6f7f2..a505fe6 100644
--- a/man/monmaptool.8
+++ b/man/monmaptool.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "MONMAPTOOL" "8" "April 06, 2016" "dev" "Ceph"
+.TH "MONMAPTOOL" "8" "April 12, 2016" "dev" "Ceph"
 .SH NAME
 monmaptool \- ceph monitor cluster map manipulation tool
 .
diff --git a/man/mount.ceph.8 b/man/mount.ceph.8
index eb9ba29..cf0cf68 100644
--- a/man/mount.ceph.8
+++ b/man/mount.ceph.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "MOUNT.CEPH" "8" "April 06, 2016" "dev" "Ceph"
+.TH "MOUNT.CEPH" "8" "April 12, 2016" "dev" "Ceph"
 .SH NAME
 mount.ceph \- mount a ceph file system
 .
diff --git a/man/osdmaptool.8 b/man/osdmaptool.8
index c369b14..1d5e4a7 100644
--- a/man/osdmaptool.8
+++ b/man/osdmaptool.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "OSDMAPTOOL" "8" "April 06, 2016" "dev" "Ceph"
+.TH "OSDMAPTOOL" "8" "April 12, 2016" "dev" "Ceph"
 .SH NAME
 osdmaptool \- ceph osd cluster map manipulation tool
 .
diff --git a/man/rados.8 b/man/rados.8
index 5cf4aac..670e5ff 100644
--- a/man/rados.8
+++ b/man/rados.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "RADOS" "8" "April 06, 2016" "dev" "Ceph"
+.TH "RADOS" "8" "April 12, 2016" "dev" "Ceph"
 .SH NAME
 rados \- rados object storage utility
 .
diff --git a/man/radosgw-admin.8 b/man/radosgw-admin.8
index d77aff4..8da8cc2 100644
--- a/man/radosgw-admin.8
+++ b/man/radosgw-admin.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "RADOSGW-ADMIN" "8" "April 06, 2016" "dev" "Ceph"
+.TH "RADOSGW-ADMIN" "8" "April 12, 2016" "dev" "Ceph"
 .SH NAME
 radosgw-admin \- rados REST gateway user administration utility
 .
@@ -127,7 +127,7 @@ Disable quota.
 .B \fBregion get\fP
 Show region info.
 .TP
-.B \fBregions list\fP
+.B \fBregion list\fP
 List all regions set on this cluster.
 .TP
 .B \fBregion set\fP
diff --git a/man/radosgw.8 b/man/radosgw.8
index 1402859..5daa2c8 100644
--- a/man/radosgw.8
+++ b/man/radosgw.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "RADOSGW" "8" "April 06, 2016" "dev" "Ceph"
+.TH "RADOSGW" "8" "April 12, 2016" "dev" "Ceph"
 .SH NAME
 radosgw \- rados REST gateway
 .
diff --git a/man/rbd-fuse.8 b/man/rbd-fuse.8
index 744f90b..eaff3ef 100644
--- a/man/rbd-fuse.8
+++ b/man/rbd-fuse.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "RBD-FUSE" "8" "April 06, 2016" "dev" "Ceph"
+.TH "RBD-FUSE" "8" "April 12, 2016" "dev" "Ceph"
 .SH NAME
 rbd-fuse \- expose rbd images as files
 .
diff --git a/man/rbd-mirror.8 b/man/rbd-mirror.8
index 7f99e32..df2c65d 100644
--- a/man/rbd-mirror.8
+++ b/man/rbd-mirror.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "RBD-MIRROR" "8" "April 06, 2016" "dev" "Ceph"
+.TH "RBD-MIRROR" "8" "April 12, 2016" "dev" "Ceph"
 .SH NAME
 rbd-mirror \- Ceph daemon for mirroring RBD images
 .
diff --git a/man/rbd-nbd.8 b/man/rbd-nbd.8
index 38b46eb..fafa198 100644
--- a/man/rbd-nbd.8
+++ b/man/rbd-nbd.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "RBD-NBD" "8" "April 06, 2016" "dev" "Ceph"
+.TH "RBD-NBD" "8" "April 12, 2016" "dev" "Ceph"
 .SH NAME
 rbd-nbd \- map rbd images to nbd device
 .
diff --git a/man/rbd-replay-many.8 b/man/rbd-replay-many.8
index ea3f0dd..0bb3f1c 100644
--- a/man/rbd-replay-many.8
+++ b/man/rbd-replay-many.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "RBD-REPLAY-MANY" "8" "April 06, 2016" "dev" "Ceph"
+.TH "RBD-REPLAY-MANY" "8" "April 12, 2016" "dev" "Ceph"
 .SH NAME
 rbd-replay-many \- replay a rados block device (RBD) workload on several clients
 .
diff --git a/man/rbd-replay-prep.8 b/man/rbd-replay-prep.8
index c0a080f..49af438 100644
--- a/man/rbd-replay-prep.8
+++ b/man/rbd-replay-prep.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "RBD-REPLAY-PREP" "8" "April 06, 2016" "dev" "Ceph"
+.TH "RBD-REPLAY-PREP" "8" "April 12, 2016" "dev" "Ceph"
 .SH NAME
 rbd-replay-prep \- prepare captured rados block device (RBD) workloads for replay
 .
diff --git a/man/rbd-replay.8 b/man/rbd-replay.8
index 593563b..d876089 100644
--- a/man/rbd-replay.8
+++ b/man/rbd-replay.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "RBD-REPLAY" "8" "April 06, 2016" "dev" "Ceph"
+.TH "RBD-REPLAY" "8" "April 12, 2016" "dev" "Ceph"
 .SH NAME
 rbd-replay \- replay rados block device (RBD) workloads
 .
diff --git a/man/rbd.8 b/man/rbd.8
index a0e5603..dc19338 100644
--- a/man/rbd.8
+++ b/man/rbd.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "RBD" "8" "April 06, 2016" "dev" "Ceph"
+.TH "RBD" "8" "April 12, 2016" "dev" "Ceph"
 .SH NAME
 rbd \- manage rados block device (RBD) images
 .
diff --git a/man/rbdmap.8 b/man/rbdmap.8
index b746517..ebb7d5d 100644
--- a/man/rbdmap.8
+++ b/man/rbdmap.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "RBDMAP" "8" "April 06, 2016" "dev" "Ceph"
+.TH "RBDMAP" "8" "April 12, 2016" "dev" "Ceph"
 .SH NAME
 rbdmap \- map RBD devices at boot time
 .
diff --git a/src/.git_version b/src/.git_version
index 2f3b43d..17a9430 100644
--- a/src/.git_version
+++ b/src/.git_version
@@ -1,2 +1,2 @@
-ce50389b773fe7f72fca40a3dd69cfe6613eaeb1
-v10.1.1
+4a2a6f72640d6b74a3bbd92798bb913ed380dcd4
+v10.1.2
diff --git a/src/Makefile-env.am b/src/Makefile-env.am
index 2fb22a6..df225d6 100644
--- a/src/Makefile-env.am
+++ b/src/Makefile-env.am
@@ -299,6 +299,8 @@ DENCODER_DEPS =
 
 # put virtualenvs in this directory
 # otherwise it may overflow #! 80 kernel limit
-export CEPH_BUILD_VIRTUALENV = /tmp
+# beware that some build environments might not be able to write to /tmp
+export TMPDIR ?= /tmp
+export CEPH_BUILD_VIRTUALENV = $(TMPDIR)
 
 radoslibdir = $(libdir)/rados-classes
diff --git a/src/Makefile.in b/src/Makefile.in
index 1c8f334..4cf18ab 100644
--- a/src/Makefile.in
+++ b/src/Makefile.in
@@ -5153,6 +5153,8 @@ am__rbd_SOURCES_DIST = tools/rbd/rbd.cc tools/rbd/ArgumentTypes.cc \
 rbd_OBJECTS = $(am_rbd_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@rbd_DEPENDENCIES = libjournal.la \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	libcls_journal_client.la \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	libcls_rbd_client.la \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	libcls_lock_client.la \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(LIBRBD) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(LIBRBD_TYPES) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(LIBRADOS) \
@@ -12261,6 +12263,8 @@ ceph_test_cfuse_cache_invalidate_SOURCES = test/test_cfuse_cache_invalidate.cc
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(am__append_267)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@rbd_LDADD = libjournal.la \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	libcls_journal_client.la \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	libcls_rbd_client.la \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	libcls_lock_client.la \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(LIBRBD) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(LIBRBD_TYPES) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(LIBRADOS) \
@@ -32388,7 +32392,9 @@ export PYTHONPATH=$(top_srcdir)/src/pybind
 
 # put virtualenvs in this directory
 # otherwise it may overflow #! 80 kernel limit
-export CEPH_BUILD_VIRTUALENV = /tmp
+# beware that some build environments might not be able to write to /tmp
+export TMPDIR ?= /tmp
+export CEPH_BUILD_VIRTUALENV = $(TMPDIR)
 
 @NO_GIT_VERSION_TRUE@export NO_VERSION="yes"
 
diff --git a/src/ceph-disk/ceph_disk/main.py b/src/ceph-disk/ceph_disk/main.py
index d0ec596..45e9bb2 100755
--- a/src/ceph-disk/ceph_disk/main.py
+++ b/src/ceph-disk/ceph_disk/main.py
@@ -122,7 +122,7 @@ class Ptype(object):
 
     @staticmethod
     def get_ready_by_name(name):
-        return [x[name]['ready'] for x in PTYPE.values()]
+        return [x[name]['ready'] for x in PTYPE.values() if name in x]
 
     @staticmethod
     def is_regular_space(ptype):
@@ -1876,18 +1876,18 @@ class PrepareSpace(object):
         if stat.S_ISBLK(mode):
             if getattr(args, name + '_file'):
                 raise Error('%s is not a regular file' % name.capitalize,
-                            geattr(args, name))
+                            getattr(args, name))
             self.type = self.DEVICE
             return
 
         if stat.S_ISREG(mode):
             if getattr(args, name + '_dev'):
                 raise Error('%s is not a block device' % name.capitalize,
-                            geattr(args, name))
+                            getattr(args, name))
             self.type = self.FILE
 
         raise Error('%s %s is neither a block device nor regular file' %
-                    (name.capitalize, geattr(args, name)))
+                    (name.capitalize, getattr(args, name)))
 
     def is_none(self):
         return self.type == self.NONE
@@ -2241,8 +2241,8 @@ class Lockbox(object):
                       self.args.lockbox)
             self.partition = DevicePartition.factory(
                 path=None, dev=self.args.lockbox, args=self.args)
-            ptype = partition.get_ptype()
-            ready = Ptype.get_ready_by_type('lockbox')
+            ptype = self.partition.get_ptype()
+            ready = Ptype.get_ready_by_name('lockbox')
             if ptype not in ready:
                 LOG.warning('incorrect partition UUID: %s, expected %s'
                             % (ptype, str(ready)))
@@ -2384,7 +2384,7 @@ class PrepareData(object):
         elif stat.S_ISBLK(dmode):
             self.type = self.DEVICE
         else:
-            raise Error('not a dir or block device', args.data)
+            raise Error('not a dir or block device', self.args.data)
 
     def is_file(self):
         return self.type == self.FILE
@@ -2539,8 +2539,8 @@ class PrepareData(object):
                       self.args.data)
             self.partition = DevicePartition.factory(
                 path=None, dev=self.args.data, args=self.args)
-            ptype = partition.get_ptype()
-            ready = Ptype.get_ready_by_type('osd')
+            ptype = self.partition.get_ptype()
+            ready = Ptype.get_ready_by_name('osd')
             if ptype not in ready:
                 LOG.warning('incorrect partition UUID: %s, expected %s'
                             % (ptype, str(ready)))
diff --git a/src/ceph_fuse.cc b/src/ceph_fuse.cc
index 1b72288..a682448 100644
--- a/src/ceph_fuse.cc
+++ b/src/ceph_fuse.cc
@@ -34,6 +34,7 @@ using namespace std;
 #include "common/linux_version.h"
 #endif
 #include "global/global_init.h"
+#include "global/signal_handler.h"
 #include "common/safe_io.h"
        
 #include <sys/types.h>
@@ -223,6 +224,9 @@ int main(int argc, const char **argv, const char *envp[]) {
       goto out_messenger_start_failed;
     }
 
+    init_async_signal_handler();
+    register_async_signal_handler(SIGHUP, sighup_handler);
+
     // start client
     r = client->init();
     if (r < 0) {
@@ -268,6 +272,9 @@ int main(int argc, const char **argv, const char *envp[]) {
   out_shutdown:
     client->shutdown();
   out_init_failed:
+    unregister_async_signal_handler(SIGHUP, sighup_handler);
+    shutdown_async_signal_handler();
+
     // wait for messenger to finish
     messenger->shutdown();
     messenger->wait();
diff --git a/src/ceph_osd.cc b/src/ceph_osd.cc
index 0c25fb6..7deb5a1 100644
--- a/src/ceph_osd.cc
+++ b/src/ceph_osd.cc
@@ -189,7 +189,8 @@ int main(int argc, const char **argv)
   }
   if (get_device_fsid) {
     uuid_d uuid;
-    int r = ObjectStore::probe_block_device_fsid(device_path, &uuid);
+    int r = ObjectStore::probe_block_device_fsid(g_ceph_context, device_path,
+						 &uuid);
     if (r < 0) {
       cerr << "failed to get device fsid for " << device_path
 	   << ": " << cpp_strerror(r) << std::endl;
diff --git a/src/client/Client.cc b/src/client/Client.cc
index 177a61e..7fcd907 100644
--- a/src/client/Client.cc
+++ b/src/client/Client.cc
@@ -4807,7 +4807,7 @@ void Client::handle_cap_grant(MetaSession *session, Inode *in, Cap *cap, MClient
   if (m->get_op() == CEPH_CAP_OP_IMPORT && m->get_wanted() != wanted)
     check = true;
 
-  check_cap_issue(in, cap, issued);
+  check_cap_issue(in, cap, new_caps);
 
   // update caps
   if (old_caps & ~new_caps) { 
@@ -12099,10 +12099,12 @@ int Client::check_pool_perm(Inode *in, int need)
   if (!cct->_conf->client_check_pool_perm)
     return 0;
 
-  int64_t pool = in->layout.pool_id;
+  int64_t pool_id = in->layout.pool_id;
+  std::string pool_ns = in->layout.pool_ns;
+  std::pair<int64_t, std::string> perm_key(pool_id, pool_ns);
   int have = 0;
   while (true) {
-    std::map<int64_t, int>::iterator it = pool_perms.find(pool);
+    auto it = pool_perms.find(perm_key);
     if (it == pool_perms.end())
       break;
     if (it->second == POOL_CHECKING) {
@@ -12123,7 +12125,7 @@ int Client::check_pool_perm(Inode *in, int need)
       return 0;
     }
 
-    pool_perms[pool] = POOL_CHECKING;
+    pool_perms[perm_key] = POOL_CHECKING;
 
     char oid_buf[32];
     snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (unsigned long long)in->ino);
@@ -12155,7 +12157,7 @@ int Client::check_pool_perm(Inode *in, int need)
     if (rd_ret == 0 || rd_ret == -ENOENT)
       have |= POOL_READ;
     else if (rd_ret != -EPERM) {
-      ldout(cct, 10) << "check_pool_perm on pool " << pool
+      ldout(cct, 10) << "check_pool_perm on pool " << pool_id << " ns " << pool_ns
 		     << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl;
       errored = true;
     }
@@ -12163,7 +12165,7 @@ int Client::check_pool_perm(Inode *in, int need)
     if (wr_ret == 0 || wr_ret == -EEXIST)
       have |= POOL_WRITE;
     else if (wr_ret != -EPERM) {
-      ldout(cct, 10) << "check_pool_perm on pool " << pool
+      ldout(cct, 10) << "check_pool_perm on pool " << pool_id << " ns " << pool_ns
 		     << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl;
       errored = true;
     }
@@ -12172,22 +12174,22 @@ int Client::check_pool_perm(Inode *in, int need)
       // Indeterminate: erase CHECKING state so that subsequent calls re-check.
       // Raise EIO because actual error code might be misleading for
       // userspace filesystem user.
-      pool_perms.erase(pool);
+      pool_perms.erase(perm_key);
       signal_cond_list(waiting_for_pool_perm);
       return -EIO;
     }
 
-    pool_perms[pool] = have | POOL_CHECKED;
+    pool_perms[perm_key] = have | POOL_CHECKED;
     signal_cond_list(waiting_for_pool_perm);
   }
 
   if ((need & CEPH_CAP_FILE_RD) && !(have & POOL_READ)) {
-    ldout(cct, 10) << "check_pool_perm on pool " << pool
+    ldout(cct, 10) << "check_pool_perm on pool " << pool_id << " ns " << pool_ns
 		   << " need " << ccap_string(need) << ", but no read perm" << dendl;
     return -EPERM;
   }
   if ((need & CEPH_CAP_FILE_WR) && !(have & POOL_WRITE)) {
-    ldout(cct, 10) << "check_pool_perm on pool " << pool
+    ldout(cct, 10) << "check_pool_perm on pool " << pool_id << " ns " << pool_ns
 		   << " need " << ccap_string(need) << ", but no write perm" << dendl;
     return -EPERM;
   }
diff --git a/src/client/Client.h b/src/client/Client.h
index d53ca1d..d912db0 100644
--- a/src/client/Client.h
+++ b/src/client/Client.h
@@ -541,7 +541,7 @@ protected:
   bool is_quota_bytes_exceeded(Inode *in, int64_t new_bytes);
   bool is_quota_bytes_approaching(Inode *in);
 
-  std::map<int64_t, int> pool_perms;
+  std::map<std::pair<int64_t,std::string>, int> pool_perms;
   list<Cond*> waiting_for_pool_perm;
   int check_pool_perm(Inode *in, int need);
 
diff --git a/src/cls/rgw/cls_rgw_ops.h b/src/cls/rgw/cls_rgw_ops.h
index e8a7661..7d8ad2d 100644
--- a/src/cls/rgw/cls_rgw_ops.h
+++ b/src/cls/rgw/cls_rgw_ops.h
@@ -180,7 +180,7 @@ struct rgw_cls_link_olh_op {
     ::encode(olh_epoch, bl);
     ::encode(log_op, bl);
     ::encode(bilog_flags, bl);
-    time_t t = ceph::real_clock::to_time_t(unmod_since);
+    uint64_t t = ceph::real_clock::to_time_t(unmod_since);
     ::encode(t, bl);
     ::encode(unmod_since, bl);
     ::encode(high_precision_time, bl);
@@ -198,9 +198,9 @@ struct rgw_cls_link_olh_op {
     ::decode(log_op, bl);
     ::decode(bilog_flags, bl);
     if (struct_v == 2) {
-      time_t t;
+      uint64_t t;
       ::decode(t, bl);
-      unmod_since = ceph::real_clock::from_time_t(t);
+      unmod_since = ceph::real_clock::from_time_t(static_cast<time_t>(t));
     }
     if (struct_v >= 3) {
       ::decode(unmod_since, bl);
diff --git a/src/common/Thread.cc b/src/common/Thread.cc
index 1f716f9..c1c3be5 100644
--- a/src/common/Thread.cc
+++ b/src/common/Thread.cc
@@ -200,8 +200,9 @@ int Thread::set_ioprio(int cls, int prio)
 
 int Thread::set_affinity(int id)
 {
+  int r = 0;
   cpuid = id;
   if (pid && ceph_gettid() == pid)
-    _set_affinity(id);
-  return 0;
+    r = _set_affinity(id);
+  return r;
 }
diff --git a/src/common/buffer.cc b/src/common/buffer.cc
index 0368979..63339ea 100644
--- a/src/common/buffer.cc
+++ b/src/common/buffer.cc
@@ -1982,7 +1982,15 @@ int buffer::list::read_file(const char *fn, std::string *error)
 
   struct stat st;
   memset(&st, 0, sizeof(st));
-  ::fstat(fd, &st);
+  if (::fstat(fd, &st) < 0) {
+    int err = errno;
+    std::ostringstream oss;
+    oss << "bufferlist::read_file(" << fn << "): stat error: "
+        << cpp_strerror(err);
+    *error = oss.str();
+    VOID_TEMP_FAILURE_RETRY(::close(fd));
+    return -err;
+  }
 
   ssize_t ret = read_fd(fd, st.st_size);
   if (ret < 0) {
diff --git a/src/common/ceph_time.h b/src/common/ceph_time.h
index ef187c0..2c7061a 100644
--- a/src/common/ceph_time.h
+++ b/src/common/ceph_time.h
@@ -382,10 +382,13 @@ void encode(const std::chrono::time_point<Clock, Duration>& t,
 template<typename Clock, typename Duration>
 void decode(std::chrono::time_point<Clock, Duration>& t,
 	    bufferlist::iterator& p) {
-  uint32_t s, ns;
+  uint32_t s;
+  uint32_t ns;
   ::decode(s, p);
   ::decode(ns, p);
-  struct timespec ts = {s, ns};
+  struct timespec ts = {
+    static_cast<time_t>(s),
+    static_cast<long int>(ns)};
 
   t = Clock::from_timespec(ts);
 }
diff --git a/src/common/config_opts.h b/src/common/config_opts.h
index 50356c7..c2a577f 100644
--- a/src/common/config_opts.h
+++ b/src/common/config_opts.h
@@ -271,6 +271,7 @@ OPTION(mon_crush_min_required_version, OPT_STR, "firefly")
 OPTION(mon_warn_on_crush_straw_calc_version_zero, OPT_BOOL, true) // warn if crush straw_calc_version==0
 OPTION(mon_warn_on_osd_down_out_interval_zero, OPT_BOOL, true) // warn if 'mon_osd_down_out_interval == 0'
 OPTION(mon_warn_on_cache_pools_without_hit_sets, OPT_BOOL, true)
+OPTION(mon_warn_on_no_sortbitwise, OPT_BOOL, true)  // warn when sortbitwise not set
 OPTION(mon_min_osdmap_epochs, OPT_INT, 500)
 OPTION(mon_max_pgmap_epochs, OPT_INT, 500)
 OPTION(mon_max_log_epochs, OPT_INT, 500)
@@ -869,6 +870,7 @@ OPTION(osd_mon_shutdown_timeout, OPT_DOUBLE, 5)
 
 OPTION(osd_max_object_size, OPT_U64, 100*1024L*1024L*1024L) // OSD's maximum object size
 OPTION(osd_max_object_name_len, OPT_U32, 2048) // max rados object name len
+OPTION(osd_max_object_namespace_len, OPT_U32, 256) // max rados object namespace len
 OPTION(osd_max_attr_name_len, OPT_U32, 100)    // max rados attr name len; cannot go higher than 100 chars for file system backends
 OPTION(osd_max_attr_size, OPT_U64, 0)
 
@@ -1026,6 +1028,18 @@ OPTION(filestore_max_inline_xattrs_xfs, OPT_U32, 10)
 OPTION(filestore_max_inline_xattrs_btrfs, OPT_U32, 10)
 OPTION(filestore_max_inline_xattrs_other, OPT_U32, 2)
 
+// max xattr value size
+OPTION(filestore_max_xattr_value_size, OPT_U32, 0)	//Override
+OPTION(filestore_max_xattr_value_size_xfs, OPT_U32, 64<<10)
+OPTION(filestore_max_xattr_value_size_btrfs, OPT_U32, 64<<10)
+// ext4 allows 4k xattrs total including some smallish extra fields and the
+// keys.  We're allowing 2 512 inline attrs in addition to some filestore
+// replay attrs.  After accounting for those, we still need to fit up to
+// two attrs of this value.  That means we need this value to be around 1k
+// to be safe.  This is hacky, but it's not worth complicating the code
+// to work around ext4's total xattr limit.
+OPTION(filestore_max_xattr_value_size_other, OPT_U32, 1<<10)
+
 OPTION(filestore_sloppy_crc, OPT_BOOL, false)         // track sloppy crcs
 OPTION(filestore_sloppy_crc_block_size, OPT_INT, 65536)
 
diff --git a/src/common/fs_types.cc b/src/common/fs_types.cc
index 741e4f1..929f3f2 100644
--- a/src/common/fs_types.cc
+++ b/src/common/fs_types.cc
@@ -46,8 +46,9 @@ void file_layout_t::from_legacy(const ceph_file_layout& fl)
   stripe_count = fl.fl_stripe_count;
   object_size = fl.fl_object_size;
   pool_id = (int32_t)fl.fl_pg_pool;
-  // in the legacy encoding, pool 0 was undefined.
-  if (pool_id == 0)
+  // in the legacy encoding, a zeroed structure was the default and
+  // would have pool 0 instead of -1.
+  if (pool_id == 0 && stripe_unit == 0 && stripe_count == 0 && object_size == 0)
     pool_id = -1;
   pool_ns.clear();
 }
@@ -108,7 +109,7 @@ void file_layout_t::dump(Formatter *f) const
   f->dump_unsigned("stripe_unit", stripe_unit);
   f->dump_unsigned("stripe_count", stripe_count);
   f->dump_unsigned("object_size", object_size);
-  f->dump_unsigned("pool_id", pool_id);
+  f->dump_int("pool_id", pool_id);
   f->dump_string("pool_ns", pool_ns);
 }
 
@@ -122,3 +123,12 @@ void file_layout_t::generate_test_instances(list<file_layout_t*>& o)
   o.back()->pool_id = 3;
   o.back()->pool_ns = "myns";
 }
+
+ostream& operator<<(ostream& out, const file_layout_t &layout)
+{
+  JSONFormatter f;
+  layout.dump(&f);
+  f.flush(out);
+  return out;
+}
+
diff --git a/src/crush/CrushWrapper.cc b/src/crush/CrushWrapper.cc
index 5748078..3450b48 100644
--- a/src/crush/CrushWrapper.cc
+++ b/src/crush/CrushWrapper.cc
@@ -353,7 +353,7 @@ int CrushWrapper::remove_item_under(CephContext *cct, int item, int ancestor, bo
   if (item < 0 && !unlink_only) {
     crush_bucket *t = get_bucket(item);
     if (t && t->size) {
-      ldout(cct, 1) << "remove_item_undef bucket " << item << " has " << t->size
+      ldout(cct, 1) << "remove_item_under bucket " << item << " has " << t->size
 		    << " items, not empty" << dendl;
       return -ENOTEMPTY;
     }
diff --git a/src/include/ceph_fs.h b/src/include/ceph_fs.h
index 593fd5f..fe0a8d5 100644
--- a/src/include/ceph_fs.h
+++ b/src/include/ceph_fs.h
@@ -234,6 +234,11 @@ struct ceph_mon_subscribe_ack {
  */
 #define CEPH_MDSMAP_DOWN    (1<<0)  /* cluster deliberately down */
 #define CEPH_MDSMAP_ALLOW_SNAPS   (1<<1)  /* cluster allowed to create snapshots */
+#define CEPH_MDSMAP_ALLOW_MULTIMDS (1<<2) /* cluster allowed to have >1 active MDS */
+#define CEPH_MDSMAP_ALLOW_DIRFRAGS (1<<3) /* cluster allowed to fragment directories */
+
+#define CEPH_MDSMAP_ALLOW_CLASSICS (CEPH_MDSMAP_ALLOW_SNAPS | CEPH_MDSMAP_ALLOW_MULTIMDS | \
+				    CEPH_MDSMAP_ALLOW_DIRFRAGS)
 
 /*
  * mds states
diff --git a/src/include/fs_types.h b/src/include/fs_types.h
index 388508b..c9271cc 100644
--- a/src/include/fs_types.h
+++ b/src/include/fs_types.h
@@ -103,4 +103,6 @@ WRITE_CLASS_ENCODER_FEATURES(file_layout_t)
 
 WRITE_EQ_OPERATORS_5(file_layout_t, stripe_unit, stripe_count, object_size, pool_id, pool_ns);
 
+ostream& operator<<(ostream& out, const file_layout_t &layout);
+
 #endif
diff --git a/src/include/rados.h b/src/include/rados.h
index f14d677..c58277f 100644
--- a/src/include/rados.h
+++ b/src/include/rados.h
@@ -446,10 +446,6 @@ enum {
 };
 
 enum {
-	CEPH_OSD_COPY_GET_FLAG_NOTSUPP_OMAP = 1, /* mean dest pool don't support omap*/
-};
-
-enum {
 	CEPH_OSD_TMAP2OMAP_NULLOK = 1,
 };
 
@@ -516,7 +512,6 @@ struct ceph_osd_op {
 		} __attribute__ ((packed)) clonerange;
 		struct {
 			__le64 max;     /* max data in reply */
-			__le32 flags;
 		} __attribute__ ((packed)) copy_get;
 		struct {
 			__le64 snapid;
diff --git a/src/journal/FutureImpl.cc b/src/journal/FutureImpl.cc
index 11eda44..aebfe12 100644
--- a/src/journal/FutureImpl.cc
+++ b/src/journal/FutureImpl.cc
@@ -2,15 +2,14 @@
 // vim: ts=8 sw=2 smarttab
 
 #include "journal/FutureImpl.h"
-#include "journal/JournalMetadata.h"
 #include "journal/Utils.h"
 
 namespace journal {
 
-FutureImpl::FutureImpl(JournalMetadataPtr journal_metadata, uint64_t tag_tid,
-                       uint64_t entry_tid, uint64_t commit_tid)
-  : RefCountedObject(NULL, 0), m_journal_metadata(journal_metadata),
-    m_tag_tid(tag_tid), m_entry_tid(entry_tid), m_commit_tid(commit_tid),
+FutureImpl::FutureImpl(uint64_t tag_tid, uint64_t entry_tid,
+                       uint64_t commit_tid)
+  : RefCountedObject(NULL, 0), m_tag_tid(tag_tid), m_entry_tid(entry_tid),
+    m_commit_tid(commit_tid),
     m_lock(utils::unique_lock_name("FutureImpl::m_lock", this)), m_safe(false),
     m_consistent(false), m_return_value(0), m_flush_state(FLUSH_STATE_NONE),
     m_consistent_ack(this) {
@@ -51,7 +50,7 @@ void FutureImpl::flush(Context *on_safe) {
   }
 
   if (complete && on_safe != NULL) {
-    m_journal_metadata->queue(on_safe, m_return_value);
+    on_safe->complete(m_return_value);
   } else if (flush_handler) {
     // attached to journal object -- instruct it to flush all entries through
     // this one.  possible to become detached while lock is released, so flush
@@ -69,7 +68,8 @@ void FutureImpl::wait(Context *on_safe) {
       return;
     }
   }
-  m_journal_metadata->queue(on_safe, m_return_value);
+
+  on_safe->complete(m_return_value);
 }
 
 bool FutureImpl::is_complete() const {
diff --git a/src/journal/FutureImpl.h b/src/journal/FutureImpl.h
index 0a9eba5..5c11c4b 100644
--- a/src/journal/FutureImpl.h
+++ b/src/journal/FutureImpl.h
@@ -18,7 +18,6 @@ class Context;
 namespace journal {
 
 class FutureImpl;
-class JournalMetadata;
 typedef boost::intrusive_ptr<FutureImpl> FutureImplPtr;
 
 class FutureImpl : public RefCountedObject, boost::noncopyable {
@@ -29,11 +28,9 @@ public:
     virtual void get() = 0;
     virtual void put() = 0;
   };
-  typedef boost::intrusive_ptr<JournalMetadata> JournalMetadataPtr;
   typedef boost::intrusive_ptr<FlushHandler> FlushHandlerPtr;
 
-  FutureImpl(JournalMetadataPtr journal_metadata, uint64_t tag_tid,
-             uint64_t entry_tid, uint64_t commit_tid);
+  FutureImpl(uint64_t tag_tid, uint64_t entry_tid, uint64_t commit_tid);
 
   void init(const FutureImplPtr &prev_future);
 
@@ -96,7 +93,6 @@ private:
     virtual void finish(int r) {}
   };
 
-  JournalMetadataPtr m_journal_metadata;
   uint64_t m_tag_tid;
   uint64_t m_entry_tid;
   uint64_t m_commit_tid;
diff --git a/src/journal/JournalRecorder.cc b/src/journal/JournalRecorder.cc
index 065f692..b730b26 100644
--- a/src/journal/JournalRecorder.cc
+++ b/src/journal/JournalRecorder.cc
@@ -80,8 +80,7 @@ Future JournalRecorder::append(uint64_t tag_tid,
   ObjectRecorderPtr object_ptr = get_object(splay_offset);
   uint64_t commit_tid = m_journal_metadata->allocate_commit_tid(
     object_ptr->get_object_number(), tag_tid, entry_tid);
-  FutureImplPtr future(new FutureImpl(m_journal_metadata, tag_tid, entry_tid,
-                                      commit_tid));
+  FutureImplPtr future(new FutureImpl(tag_tid, entry_tid, commit_tid));
   future->init(m_prev_future);
   m_prev_future = future;
 
diff --git a/src/journal/JournalTrimmer.cc b/src/journal/JournalTrimmer.cc
index 68ba5f4..74df78a 100644
--- a/src/journal/JournalTrimmer.cc
+++ b/src/journal/JournalTrimmer.cc
@@ -194,7 +194,8 @@ JournalTrimmer::C_RemoveSet::C_RemoveSet(JournalTrimmer *_journal_trimmer,
 
 void JournalTrimmer::C_RemoveSet::complete(int r) {
   lock.Lock();
-  if (r < 0 && r != -ENOENT && return_value == -ENOENT) {
+  if (r < 0 && r != -ENOENT &&
+      (return_value == -ENOENT || return_value == 0)) {
     return_value = r;
   } else if (r == 0 && return_value == -ENOENT) {
     return_value = 0;
diff --git a/src/journal/ObjectPlayer.cc b/src/journal/ObjectPlayer.cc
index e890dfa..db49d46 100644
--- a/src/journal/ObjectPlayer.cc
+++ b/src/journal/ObjectPlayer.cc
@@ -70,9 +70,15 @@ void ObjectPlayer::watch(Context *on_fetch, double interval) {
 void ObjectPlayer::unwatch() {
   ldout(m_cct, 20) << __func__ << ": " << m_oid << " unwatch" << dendl;
   Mutex::Locker timer_locker(m_timer_lock);
+
   cancel_watch();
 
-  m_watch_ctx = NULL;
+  Context *watch_ctx = nullptr;
+  std::swap(watch_ctx, m_watch_ctx);
+  if (watch_ctx != nullptr) {
+    delete watch_ctx;
+  }
+
   while (m_watch_in_progress) {
     m_watch_in_progress_cond.Wait(m_timer_lock);
   }
@@ -202,18 +208,17 @@ void ObjectPlayer::handle_watch_fetched(int r) {
   ldout(m_cct, 10) << __func__ << ": " << m_oid << " poll complete, r=" << r
                    << dendl;
 
-  Context *on_finish = NULL;
+  Context *on_finish = nullptr;
   {
     Mutex::Locker timer_locker(m_timer_lock);
     assert(m_watch_in_progress);
     if (r == -ENOENT) {
       r = 0;
     }
-    on_finish = m_watch_ctx;
-    m_watch_ctx = NULL;
+    std::swap(on_finish, m_watch_ctx);
   }
 
-  if (on_finish != NULL) {
+  if (on_finish != nullptr) {
     on_finish->complete(r);
   }
 
diff --git a/src/librados/IoCtxImpl.cc b/src/librados/IoCtxImpl.cc
index a1479d1..ce1a220 100644
--- a/src/librados/IoCtxImpl.cc
+++ b/src/librados/IoCtxImpl.cc
@@ -678,7 +678,8 @@ int librados::IoCtxImpl::operate(const object_t& oid, ::ObjectOperation *o,
   Context *oncommit = new C_SafeCond(&mylock, &cond, &done, &r);
 
   int op = o->ops[0].op.op;
-  ldout(client->cct, 10) << ceph_osd_op_name(op) << " oid=" << oid << " nspace=" << oloc.nspace << dendl;
+  ldout(client->cct, 10) << ceph_osd_op_name(op) << " oid=" << oid
+			 << " nspace=" << oloc.nspace << dendl;
   Objecter::Op *objecter_op = objecter->prepare_mutate_op(oid, oloc,
 							  *o, snapc, ut, flags,
 							  NULL, oncommit, &ver);
@@ -745,7 +746,7 @@ int librados::IoCtxImpl::aio_operate_read(const object_t &oid,
   Objecter::Op *objecter_op = objecter->prepare_read_op(oid, oloc,
 		 *o, snap_seq, pbl, flags,
 		 onack, &c->objver);
-  c->tid = objecter->op_submit(objecter_op);
+  objecter->op_submit(objecter_op, &c->tid);
   return 0;
 }
 
@@ -764,8 +765,10 @@ int librados::IoCtxImpl::aio_operate(const object_t& oid,
   c->io = this;
   queue_aio_write(c);
 
-  c->tid = objecter->mutate(oid, oloc, *o, snap_context, ut, flags, onack,
-			    oncommit, &c->objver);
+  Objecter::Op *op = objecter->prepare_mutate_op(
+    oid, oloc, *o, snap_context, ut, flags, onack,
+    oncommit, &c->objver);
+  objecter->op_submit(op, &c->tid);
 
   return 0;
 }
@@ -783,9 +786,11 @@ int librados::IoCtxImpl::aio_read(const object_t oid, AioCompletionImpl *c,
   c->io = this;
   c->blp = pbl;
 
-  c->tid = objecter->read(oid, oloc,
-		 off, len, snapid, pbl, 0,
-		 onack, &c->objver);
+  Objecter::Op *o = objecter->prepare_read_op(
+    oid, oloc,
+    off, len, snapid, pbl, 0,
+    onack, &c->objver);
+  objecter->op_submit(o, &c->tid);
   return 0;
 }
 
@@ -804,10 +809,11 @@ int librados::IoCtxImpl::aio_read(const object_t oid, AioCompletionImpl *c,
   c->bl.push_back(buffer::create_static(len, buf));
   c->blp = &c->bl;
 
-  c->tid = objecter->read(oid, oloc,
-		 off, len, snapid, &c->bl, 0,
-		 onack, &c->objver);
-
+  Objecter::Op *o = objecter->prepare_read_op(
+    oid, oloc,
+    off, len, snapid, &c->bl, 0,
+    onack, &c->objver);
+  objecter->op_submit(o, &c->tid);
   return 0;
 }
 
@@ -839,9 +845,11 @@ int librados::IoCtxImpl::aio_sparse_read(const object_t oid,
 
   onack->m_ops.sparse_read(off, len, m, data_bl, NULL);
 
-  c->tid = objecter->read(oid, oloc,
-		 onack->m_ops, snap_seq, NULL, 0,
-		 onack, &c->objver);
+  Objecter::Op *o = objecter->prepare_read_op(
+    oid, oloc,
+    onack->m_ops, snap_seq, NULL, 0,
+    onack, &c->objver);
+  objecter->op_submit(o, &c->tid);
   return 0;
 }
 
@@ -864,9 +872,11 @@ int librados::IoCtxImpl::aio_write(const object_t &oid, AioCompletionImpl *c,
   c->io = this;
   queue_aio_write(c);
 
-  c->tid = objecter->write(oid, oloc,
-		  off, len, snapc, bl, ut, 0,
-		  onack, onsafe, &c->objver);
+  Objecter::Op *o = objecter->prepare_write_op(
+    oid, oloc,
+    off, len, snapc, bl, ut, 0,
+    onack, onsafe, &c->objver);
+  objecter->op_submit(o, &c->tid);
 
   return 0;
 }
@@ -888,9 +898,11 @@ int librados::IoCtxImpl::aio_append(const object_t &oid, AioCompletionImpl *c,
   c->io = this;
   queue_aio_write(c);
 
-  c->tid = objecter->append(oid, oloc,
-		   len, snapc, bl, ut, 0,
-		   onack, onsafe, &c->objver);
+  Objecter::Op *o = objecter->prepare_append_op(
+    oid, oloc,
+    len, snapc, bl, ut, 0,
+    onack, onsafe, &c->objver);
+  objecter->op_submit(o, &c->tid);
 
   return 0;
 }
@@ -913,9 +925,11 @@ int librados::IoCtxImpl::aio_write_full(const object_t &oid,
   c->io = this;
   queue_aio_write(c);
 
-  c->tid = objecter->write_full(oid, oloc,
-		       snapc, bl, ut, 0,
-		       onack, onsafe, &c->objver);
+  Objecter::Op *o = objecter->prepare_write_full_op(
+    oid, oloc,
+    snapc, bl, ut, 0,
+    onack, onsafe, &c->objver);
+  objecter->op_submit(o, &c->tid);
 
   return 0;
 }
@@ -934,9 +948,11 @@ int librados::IoCtxImpl::aio_remove(const object_t &oid, AioCompletionImpl *c)
   c->io = this;
   queue_aio_write(c);
 
-  c->tid = objecter->remove(oid, oloc,
-		   snapc, ut, 0,
-		   onack, onsafe, &c->objver);
+  Objecter::Op *o = objecter->prepare_remove_op(
+    oid, oloc,
+    snapc, ut, 0,
+    onack, onsafe, &c->objver);
+  objecter->op_submit(o, &c->tid);
 
   return 0;
 }
@@ -948,9 +964,11 @@ int librados::IoCtxImpl::aio_stat(const object_t& oid, AioCompletionImpl *c,
   C_aio_stat_Ack *onack = new C_aio_stat_Ack(c, pmtime);
 
   c->io = this;
-  c->tid = objecter->stat(oid, oloc,
-			  snap_seq, psize, &onack->mtime, 0,
-			  onack, &c->objver);
+  Objecter::Op *o = objecter->prepare_stat_op(
+    oid, oloc,
+    snap_seq, psize, &onack->mtime, 0,
+    onack, &c->objver);
+  objecter->op_submit(o, &c->tid);
 
   return 0;
 }
@@ -961,9 +979,11 @@ int librados::IoCtxImpl::aio_stat2(const object_t& oid, AioCompletionImpl *c,
   C_aio_stat2_Ack *onack = new C_aio_stat2_Ack(c, pts);
 
   c->io = this;
-  c->tid = objecter->stat(oid, oloc,
-			  snap_seq, psize, &onack->mtime, 0,
-			  onack, &c->objver);
+  Objecter::Op *o = objecter->prepare_stat_op(
+    oid, oloc,
+    snap_seq, psize, &onack->mtime, 0,
+    onack, &c->objver);
+  objecter->op_submit(o, &c->tid);
 
   return 0;
 }
@@ -984,7 +1004,9 @@ int librados::IoCtxImpl::hit_set_list(uint32_t hash, AioCompletionImpl *c,
   ::ObjectOperation rd;
   rd.hit_set_ls(pls, NULL);
   object_locator_t oloc(poolid);
-  c->tid = objecter->pg_read(hash, oloc, rd, NULL, 0, onack, NULL, NULL);
+  Objecter::Op *o = objecter->prepare_pg_read_op(
+    hash, oloc, rd, NULL, 0, onack, NULL, NULL);
+  objecter->op_submit(o, &c->tid);
   return 0;
 }
 
@@ -999,7 +1021,9 @@ int librados::IoCtxImpl::hit_set_get(uint32_t hash, AioCompletionImpl *c,
   ::ObjectOperation rd;
   rd.hit_set_get(ceph::real_clock::from_time_t(stamp), pbl, 0);
   object_locator_t oloc(poolid);
-  c->tid = objecter->pg_read(hash, oloc, rd, NULL, 0, onack, NULL, NULL);
+  Objecter::Op *o = objecter->prepare_pg_read_op(
+    hash, oloc, rd, NULL, 0, onack, NULL, NULL);
+  objecter->op_submit(o, &c->tid);
   return 0;
 }
 
@@ -1041,8 +1065,10 @@ int librados::IoCtxImpl::get_inconsistent_objects(const pg_t& pg,
   ::ObjectOperation op;
   op.scrub_ls(start_after, max_to_get, objects, interval, nullptr);
   object_locator_t oloc{poolid, pg.ps()};
-  c->tid = objecter->pg_read(oloc.hash, oloc, op, nullptr, CEPH_OSD_FLAG_PGOP, onack,
-			     nullptr, nullptr);
+  Objecter::Op *o = objecter->prepare_pg_read_op(
+    oloc.hash, oloc, op, nullptr, CEPH_OSD_FLAG_PGOP, onack,
+    nullptr, nullptr);
+  objecter->op_submit(o, &c->tid);
   return 0;
 }
 
@@ -1060,8 +1086,10 @@ int librados::IoCtxImpl::get_inconsistent_snapsets(const pg_t& pg,
   ::ObjectOperation op;
   op.scrub_ls(start_after, max_to_get, snapsets, interval, nullptr);
   object_locator_t oloc{poolid, pg.ps()};
-  c->tid = objecter->pg_read(oloc.hash, oloc, op, nullptr, CEPH_OSD_FLAG_PGOP, onack,
-			     nullptr, nullptr);
+  Objecter::Op *o = objecter->prepare_pg_read_op(
+    oloc.hash, oloc, op, nullptr, CEPH_OSD_FLAG_PGOP, onack,
+    nullptr, nullptr);
+  objecter->op_submit(o, &c->tid);
   return 0;
 }
 
@@ -1119,8 +1147,9 @@ int librados::IoCtxImpl::aio_exec(const object_t& oid, AioCompletionImpl *c,
   ::ObjectOperation rd;
   prepare_assert_ops(&rd);
   rd.call(cls, method, inbl);
-  c->tid = objecter->read(oid, oloc, rd, snap_seq, outbl, 0, onack, &c->objver);
-
+  Objecter::Op *o = objecter->prepare_read_op(
+    oid, oloc, rd, snap_seq, outbl, 0, onack, &c->objver);
+  objecter->op_submit(o, &c->tid);
   return 0;
 }
 
diff --git a/src/librbd/ImageCtx.cc b/src/librbd/ImageCtx.cc
index 770e871..fa05103 100644
--- a/src/librbd/ImageCtx.cc
+++ b/src/librbd/ImageCtx.cc
@@ -52,7 +52,7 @@ namespace {
 class ThreadPoolSingleton : public ThreadPool {
 public:
   explicit ThreadPoolSingleton(CephContext *cct)
-    : ThreadPool(cct, "librbd::thread_pool", "tp_librbd", cct->_conf->rbd_op_threads,
+    : ThreadPool(cct, "librbd::thread_pool", "tp_librbd", 1,
                  "rbd_op_threads") {
     start();
   }
diff --git a/src/librbd/Journal.cc b/src/librbd/Journal.cc
index df30cb3..265a869 100644
--- a/src/librbd/Journal.cc
+++ b/src/librbd/Journal.cc
@@ -827,7 +827,8 @@ uint64_t Journal<I>::append_io_event(AioCompletion *aio_comp,
                  << "length=" << length << ", "
                  << "flush=" << flush_entry << ", tid=" << tid << dendl;
 
-  Context *on_safe = new C_IOEventSafe(this, tid);
+  Context *on_safe = create_async_context_callback(
+    m_image_ctx, new C_IOEventSafe(this, tid));
   if (flush_entry) {
     future.flush(on_safe);
   } else {
@@ -942,8 +943,9 @@ void Journal<I>::commit_op_event(uint64_t op_tid, int r) {
     op_finish_future = m_journaler->append(m_tag_tid, bl);
   }
 
-  op_finish_future.flush(new C_OpEventSafe(this, op_tid, op_start_future,
-                                           op_finish_future));
+  op_finish_future.flush(create_async_context_callback(
+    m_image_ctx, new C_OpEventSafe(this, op_tid, op_start_future,
+                                   op_finish_future)));
 }
 
 template <typename I>
@@ -971,7 +973,7 @@ void Journal<I>::flush_event(uint64_t tid, Context *on_safe) {
   }
 
   if (future.is_valid()) {
-    future.flush(NULL);
+    future.flush(nullptr);
   }
 }
 
diff --git a/src/librbd/internal.cc b/src/librbd/internal.cc
index 6ace24f..983e622 100644
--- a/src/librbd/internal.cc
+++ b/src/librbd/internal.cc
@@ -312,13 +312,10 @@ int mirror_image_disable_internal(ImageCtx *ictx, bool force) {
     return r;
   }
 
-  if (!is_primary) {
-    if (!force) {
-      lderr(cct) << "Mirrored image is not the primary, add force option to"
-        " disable mirroring" << dendl;
-      return -EINVAL;
-    }
-    goto remove_mirroring_image;
+  if (!is_primary && !force) {
+    lderr(cct) << "Mirrored image is not the primary, add force option to"
+                  " disable mirroring" << dendl;
+    return -EINVAL;
   }
 
   mirror_image_internal.state = cls::rbd::MIRROR_IMAGE_STATE_DISABLING;
@@ -329,6 +326,10 @@ int mirror_image_disable_internal(ImageCtx *ictx, bool force) {
     return r;
   }
 
+  if (!is_primary) {
+    goto remove_mirroring_image;
+  }
+
   r = MirroringWatcher<>::notify_image_updated(
     ictx->md_ctx, cls::rbd::MIRROR_IMAGE_STATE_DISABLING,
     ictx->id, mirror_image_internal.global_image_id);
@@ -341,6 +342,7 @@ int mirror_image_disable_internal(ImageCtx *ictx, bool force) {
   header_oid = ::journal::Journaler::header_oid(ictx->id);
 
   while(true) {
+    clients.clear();
     r = cls::journal::client::client_list(ictx->md_ctx, header_oid, &clients);
     if (r < 0) {
       lderr(cct) << "cannot disable mirroring: " << cpp_strerror(r) << dendl;
diff --git a/src/logrotate.conf b/src/logrotate.conf
index 08ad4b4..061965b 100644
--- a/src/logrotate.conf
+++ b/src/logrotate.conf
@@ -4,7 +4,7 @@
     compress
     sharedscripts
     postrotate
-        killall -q -1 ceph-mon ceph-mds ceph-osd radosgw || true
+        killall -q -1 ceph-mon ceph-mds ceph-osd ceph-fuse radosgw || true
     endscript
     missingok
     notifempty
diff --git a/src/mds/Beacon.cc b/src/mds/Beacon.cc
index 16f20ba..9a07b91 100644
--- a/src/mds/Beacon.cc
+++ b/src/mds/Beacon.cc
@@ -443,6 +443,13 @@ void Beacon::notify_health(MDSRank const *mds)
       large_completed_requests_metrics.clear();
     }
   }
+
+  // Report a health warning if we are readonly
+  if (mds->mdcache->is_readonly()) {
+    MDSHealthMetric m(MDS_HEALTH_READ_ONLY, HEALTH_WARN,
+                      "MDS in read-only mode");
+    health.metrics.push_back(m);
+  }
 }
 
 MDSMap::DaemonState Beacon::get_want_state() const
diff --git a/src/mds/CInode.cc b/src/mds/CInode.cc
index 423be96..718c0bc 100644
--- a/src/mds/CInode.cc
+++ b/src/mds/CInode.cc
@@ -1143,13 +1143,7 @@ void CInode::store_backtrace(MDSInternalContextBase *fin, int op_prio)
 
   auth_pin(this);
 
-  int64_t pool;
-  if (is_dir()) {
-    pool = mdcache->mds->mdsmap->get_metadata_pool();
-  } else {
-    pool = inode.layout.pool_id;
-  }
-
+  const int64_t pool = get_backtrace_pool();
   inode_backtrace_t bt;
   build_backtrace(pool, bt);
   bufferlist parent_bl;
@@ -1212,8 +1206,10 @@ void CInode::_stored_backtrace(int r, version_t v, Context *fin)
 {
   if (r < 0) {
     dout(1) << "store backtrace error " << r << " v " << v << dendl;
-    mdcache->mds->clog->error() << "failed to store backtrace on dir ino "
-				<< ino() << " object, errno " << r << "\n";
+    mdcache->mds->clog->error() << "failed to store backtrace on ino "
+				<< ino() << " object"
+                                << ", pool " << get_backtrace_pool()
+                                << ", errno " << r << "\n";
     mdcache->mds->handle_write_error(r);
     return;
   }
@@ -1229,13 +1225,7 @@ void CInode::_stored_backtrace(int r, version_t v, Context *fin)
 
 void CInode::fetch_backtrace(Context *fin, bufferlist *backtrace)
 {
-  int64_t pool;
-  if (is_dir())
-    pool = mdcache->mds->mdsmap->get_metadata_pool();
-  else
-    pool = inode.layout.pool_id;
-
-  mdcache->fetch_backtrace(inode.ino, pool, *backtrace, fin);
+  mdcache->fetch_backtrace(inode.ino, get_backtrace_pool(), *backtrace, fin);
 }
 
 void CInode::_mark_dirty_parent(LogSegment *ls, bool dirty_pool)
@@ -3725,12 +3715,7 @@ void CInode::validate_disk_state(CInode::validated_data *results,
     void fetch_backtrace_and_tag(CInode *in, std::string tag,
                                  Context *fin, int *bt_r, bufferlist *bt)
     {
-      int64_t pool;
-      if (in->is_dir())
-        pool = in->mdcache->mds->mdsmap->get_metadata_pool();
-      else
-        pool = in->inode.layout.pool_id;
-
+      const int64_t pool = in->get_backtrace_pool();
       object_t oid = CInode::get_object_name(in->ino(), frag_t(), "");
 
       ObjectOperation fetch;
@@ -3788,11 +3773,7 @@ void CInode::validate_disk_state(CInode::validated_data *results,
       results->performed_validation = true; // at least, some of it!
       results->backtrace.checked = true;
 
-      int64_t pool;
-      if (in->is_dir())
-        pool = in->mdcache->mds->mdsmap->get_metadata_pool();
-      else
-        pool = in->inode.layout.pool_id;
+      const int64_t pool = in->get_backtrace_pool();
       inode_backtrace_t& memory_backtrace = results->backtrace.memory_value;
       in->build_backtrace(pool, memory_backtrace);
       bool equivalent, divergent;
@@ -4302,3 +4283,15 @@ void CInode::scrub_finished(MDSInternalContextBase **c) {
     clog->info() << "scrub complete with tag '" << scrub_infop->header->tag << "'";
   }
 }
+
+int64_t CInode::get_backtrace_pool() const
+{
+  // Directories: the metadata pool named by the MDS map.
+  if (is_dir()) {
+    return mdcache->mds->mdsmap->get_metadata_pool();
+  }
+  // Everything else: the pool from the inode's own layout.  Files are
+  // required to have an explicit layout that specifies a pool.
+  assert(inode.layout.pool_id != -1);
+  return inode.layout.pool_id;
+}
diff --git a/src/mds/CInode.h b/src/mds/CInode.h
index 01f6797..8f27bf0 100644
--- a/src/mds/CInode.h
+++ b/src/mds/CInode.h
@@ -769,6 +769,15 @@ public:
   void store_backtrace(MDSInternalContextBase *fin, int op_prio=-1);
   void _stored_backtrace(int r, version_t v, Context *fin);
   void fetch_backtrace(Context *fin, bufferlist *backtrace);
+protected:
+  /**
+   * Return the pool ID where we currently write backtraces for
+   * this inode (in addition to inode.old_pools)
+   *
+   * @returns a pool ID >=0
+   */
+  int64_t get_backtrace_pool() const;
+public:
   void _mark_dirty_parent(LogSegment *ls, bool dirty_pool=false);
   void clear_dirty_parent();
   void verify_diri_backtrace(bufferlist &bl, int err);
diff --git a/src/mds/FSMap.cc b/src/mds/FSMap.cc
index 29f94dc..73e425d 100644
--- a/src/mds/FSMap.cc
+++ b/src/mds/FSMap.cc
@@ -35,6 +35,11 @@ void FSMap::dump(Formatter *f) const
   compat.dump(f);
   f->close_section();
 
+  f->open_object_section("feature flags");
+  f->dump_bool("enable_multiple", enable_multiple);
+  f->dump_bool("ever_enabled_multiple", ever_enabled_multiple);
+  f->close_section();
+
   f->open_array_section("standbys");
   for (const auto &i : standby_daemons) {
     f->open_object_section("info");
@@ -76,8 +81,9 @@ void FSMap::generate_test_instances(list<FSMap*>& ls)
 void FSMap::print(ostream& out) const
 {
   out << "e" << epoch << std::endl;
-  out << "enable_multiple: " << enable_multiple << std::endl;
-  out << "compat: " << enable_multiple << std::endl;
+  out << "enable_multiple, ever_enabled_multiple: " << enable_multiple << ","
+      << ever_enabled_multiple << std::endl;
+  out << "compat: " << compat << std::endl;
   out << " " << std::endl;
 
   if (filesystems.empty()) {
@@ -231,7 +237,7 @@ void FSMap::get_health(list<pair<health_status_t,string> >& summary,
 void FSMap::encode(bufferlist& bl, uint64_t features) const
 {
   if (features & CEPH_FEATURE_SERVER_JEWEL) {
-    ENCODE_START(6, 6, bl);
+    ENCODE_START(7, 6, bl);
     ::encode(epoch, bl);
     ::encode(next_filesystem_id, bl);
     ::encode(legacy_client_fscid, bl);
@@ -245,6 +251,7 @@ void FSMap::encode(bufferlist& bl, uint64_t features) const
     ::encode(mds_roles, bl);
     ::encode(standby_daemons, bl, features);
     ::encode(standby_epochs, bl);
+    ::encode(ever_enabled_multiple, bl);
     ENCODE_FINISH(bl);
   } else {
     if (filesystems.empty()) {
@@ -280,7 +287,7 @@ void FSMap::decode(bufferlist::iterator& p)
   // MDSMonitor to store an FSMap instead of an MDSMap was
   // 5, so anything older than 6 is decoded as an MDSMap,
   // and anything newer is decoded as an FSMap.
-  DECODE_START_LEGACY_COMPAT_LEN_16(6, 4, 4, p);
+  DECODE_START_LEGACY_COMPAT_LEN_16(7, 4, 4, p);
   if (struct_v < 6) {
     // Decoding an MDSMap (upgrade)
     ::decode(epoch, p);
@@ -334,11 +341,28 @@ void FSMap::decode(bufferlist::iterator& p)
     if (ev >= 4)
       ::decode(legacy_mds_map.last_failure_osd_epoch, p);
     if (ev >= 6) {
-      ::decode(legacy_mds_map.ever_allowed_snaps, p);
-      ::decode(legacy_mds_map.explicitly_allowed_snaps, p);
+      if (ev < 10) {
+	// previously this was a bool about snaps, not a flag map
+	bool flag;
+	::decode(flag, p);
+	legacy_mds_map.ever_allowed_features = flag ?
+	  CEPH_MDSMAP_ALLOW_SNAPS : 0;
+	::decode(flag, p);
+	legacy_mds_map.explicitly_allowed_features = flag ?
+	  CEPH_MDSMAP_ALLOW_SNAPS : 0;
+	if (legacy_mds_map.max_mds > 1) {
+	  legacy_mds_map.set_multimds_allowed();
+	}
+      } else {
+	::decode(legacy_mds_map.ever_allowed_features, p);
+	::decode(legacy_mds_map.explicitly_allowed_features, p);
+      }
     } else {
-      legacy_mds_map.ever_allowed_snaps = true;
-      legacy_mds_map.explicitly_allowed_snaps = false;
+      legacy_mds_map.ever_allowed_features = CEPH_MDSMAP_ALLOW_CLASSICS;
+      legacy_mds_map.explicitly_allowed_features = 0;
+      if (legacy_mds_map.max_mds > 1) {
+	legacy_mds_map.set_multimds_allowed();
+      }
     }
     if (ev >= 7)
       ::decode(legacy_mds_map.inline_data_enabled, p);
@@ -416,6 +440,7 @@ void FSMap::decode(bufferlist::iterator& p)
     ::decode(mds_roles, p);
     ::decode(standby_daemons, p);
     ::decode(standby_epochs, p);
+    ::decode(ever_enabled_multiple, p);
   }
 
   DECODE_FINISH(p);
@@ -512,7 +537,8 @@ mds_gid_t FSMap::find_standby_for(mds_role_t role, const std::string& name) cons
   return result;
 }
 
-mds_gid_t FSMap::find_unused(bool force_standby_active) const {
+mds_gid_t FSMap::find_unused(fs_cluster_id_t fscid,
+			     bool force_standby_active) const {
   for (const auto &i : standby_daemons) {
     const auto &gid = i.first;
     const auto &info = i.second;
@@ -521,6 +547,10 @@ mds_gid_t FSMap::find_unused(bool force_standby_active) const {
     if (info.laggy() || info.rank >= 0)
       continue;
 
+    if (info.standby_for_fscid != FS_CLUSTER_ID_NONE &&
+        info.standby_for_fscid != fscid)
+      continue;
+
     if ((info.standby_for_rank == MDSMap::MDS_NO_STANDBY_PREF ||
          info.standby_for_rank == MDSMap::MDS_MATCHED_ACTIVE ||
          (info.standby_for_rank == MDSMap::MDS_STANDBY_ANY
@@ -537,7 +567,7 @@ mds_gid_t FSMap::find_replacement_for(mds_role_t role, const std::string& name,
   if (standby)
     return standby;
   else
-    return find_unused(force_standby_active);
+    return find_unused(role.fscid, force_standby_active);
 }
 
 void FSMap::sanity() const
diff --git a/src/mds/FSMap.h b/src/mds/FSMap.h
index 1f6b069..d14e365 100644
--- a/src/mds/FSMap.h
+++ b/src/mds/FSMap.h
@@ -95,6 +95,7 @@ protected:
   fs_cluster_id_t legacy_client_fscid;
   CompatSet compat;
   bool enable_multiple;
+  bool ever_enabled_multiple; // < the cluster had multiple MDSes enabled once
 
   std::map<fs_cluster_id_t, std::shared_ptr<Filesystem> > filesystems;
 
@@ -115,7 +116,7 @@ public:
       next_filesystem_id(FS_CLUSTER_ID_ANONYMOUS + 1),
       legacy_client_fscid(FS_CLUSTER_ID_NONE),
       compat(get_mdsmap_compat_set_default()),
-      enable_multiple(false)
+      enable_multiple(false), ever_enabled_multiple(false)
   { }
 
   FSMap(const FSMap &rhs)
@@ -125,6 +126,7 @@ public:
       legacy_client_fscid(rhs.legacy_client_fscid),
       compat(rhs.compat),
       enable_multiple(rhs.enable_multiple),
+      ever_enabled_multiple(rhs.ever_enabled_multiple),
       mds_roles(rhs.mds_roles),
       standby_daemons(rhs.standby_daemons),
       standby_epochs(rhs.standby_epochs)
@@ -159,6 +161,9 @@ public:
   void set_enable_multiple(const bool v)
   {
     enable_multiple = v;
+    // one-way latch: this setter raises the flag but never clears it
+    if (v)
+      ever_enabled_multiple = true;
   }
 
   bool get_enable_multiple() const
@@ -411,7 +416,7 @@ public:
 
   mds_gid_t find_standby_for(mds_role_t mds, const std::string& name) const;
 
-  mds_gid_t find_unused(bool force_standby_active) const;
+  mds_gid_t find_unused(fs_cluster_id_t fscid, bool force_standby_active) const;
 
   mds_gid_t find_replacement_for(mds_role_t mds, const std::string& name,
                                  bool force_standby_active) const;
diff --git a/src/mds/Locker.cc b/src/mds/Locker.cc
index ddd99b2..84e0e48 100644
--- a/src/mds/Locker.cc
+++ b/src/mds/Locker.cc
@@ -2187,6 +2187,7 @@ bool Locker::check_inode_max_size(CInode *in, bool force_wrlock,
 				  utime_t new_mtime)
 {
   assert(in->is_auth());
+  assert(in->is_file());
 
   inode_t *latest = in->get_projected_inode();
   map<client_t, client_writeable_range_t> new_ranges;
diff --git a/src/mds/MDBalancer.cc b/src/mds/MDBalancer.cc
index 4c42568..99845fb 100644
--- a/src/mds/MDBalancer.cc
+++ b/src/mds/MDBalancer.cc
@@ -333,6 +333,7 @@ double MDBalancer::try_match(mds_rank_t ex, double& maxex,
 
 void MDBalancer::queue_split(CDir *dir)
 {
+  assert(mds->mdsmap->allows_dirfrags());
   split_queue.insert(dir->dirfrag());
 }
 
@@ -984,6 +985,7 @@ void MDBalancer::hit_dir(utime_t now, CDir *dir, int type, int who, double amoun
 
     // split
     if (g_conf->mds_bal_split_size > 0 &&
+	mds->mdsmap->allows_dirfrags() &&
 	(dir->should_split() ||
 	 (v > g_conf->mds_bal_split_rd && type == META_POP_IRD) ||
 	 (v > g_conf->mds_bal_split_wr && type == META_POP_IWR)) &&
diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
index d9e5901..152b47b 100644
--- a/src/mds/MDCache.cc
+++ b/src/mds/MDCache.cc
@@ -6038,6 +6038,11 @@ void MDCache::identify_files_to_recover(vector<CInode*>& recover_q, vector<CInod
     CInode *in = p->second;
     if (!in->is_auth())
       continue;
+
+    // Only normal files need file size recovery
+    if (!in->is_file()) {
+      continue;
+    }
     
     bool recover = false;
     for (map<client_t,client_writeable_range_t>::iterator p = in->inode.client_ranges.begin();
diff --git a/src/mds/MDSDaemon.cc b/src/mds/MDSDaemon.cc
index daa0cb0..a950b0b 100644
--- a/src/mds/MDSDaemon.cc
+++ b/src/mds/MDSDaemon.cc
@@ -337,11 +337,19 @@ void MDSDaemon::clean_up_admin_socket()
   admin_socket->unregister_command("dump_blocked_ops");
   admin_socket->unregister_command("dump_historic_ops");
   admin_socket->unregister_command("scrub_path");
+  admin_socket->unregister_command("tag path");
   admin_socket->unregister_command("flush_path");
+  admin_socket->unregister_command("export dir");
+  admin_socket->unregister_command("dump cache");
   admin_socket->unregister_command("session evict");
+  admin_socket->unregister_command("osdmap barrier");
   admin_socket->unregister_command("session ls");
   admin_socket->unregister_command("flush journal");
   admin_socket->unregister_command("force_readonly");
+  admin_socket->unregister_command("get subtrees");
+  admin_socket->unregister_command("dirfrag split");
+  admin_socket->unregister_command("dirfrag merge");
+  admin_socket->unregister_command("dirfrag ls");
   delete asok_hook;
   asok_hook = NULL;
 }
@@ -474,8 +482,21 @@ int MDSDaemon::init(MDSMap::DaemonState wanted_state)
     mds_lock.Unlock();
     return r;
   }
+
+  int rotating_auth_attempts = 0;
+  const int max_rotating_auth_attempts = 10;
+
   while (monc->wait_auth_rotating(30.0) < 0) {
-    derr << "unable to obtain rotating service keys; retrying" << dendl;
+    if (++rotating_auth_attempts <= max_rotating_auth_attempts) {
+      derr << "unable to obtain rotating service keys; retrying" << dendl;
+      continue;
+    }
+    derr << "ERROR: failed to refresh rotating keys, "
+         << "maximum retry time reached." << dendl;
+    mds_lock.Lock();
+    suicide();
+    mds_lock.Unlock();
+    return -ETIMEDOUT;
   }
 
   objecter->start();
@@ -801,6 +822,7 @@ int MDSDaemon::_handle_command(
     if (mds_rank == NULL) {
       r = -EINVAL;
       ss << "MDS not active";
+      goto out;
     }
     // FIXME harmonize `session kill` with admin socket session evict
     int64_t session_id = 0;
@@ -988,6 +1010,7 @@ void MDSDaemon::handle_mds_map(MMDSMap *m)
             // has taken our ID, we don't want to keep restarting and
             // fighting them for the ID.
             suicide();
+            m->put();
             return;
           }
         }
@@ -1244,6 +1267,7 @@ bool MDSDaemon::handle_core_message(Message *m)
     if (mds_rank) {
       mds_rank->handle_osd_map();
     }
+    m->put();
     break;
 
   default:
diff --git a/src/mds/MDSMap.cc b/src/mds/MDSMap.cc
index 9a7c26f..f66fc7a 100644
--- a/src/mds/MDSMap.cc
+++ b/src/mds/MDSMap.cc
@@ -134,6 +134,8 @@ void MDSMap::dump(Formatter *f) const
 {
   f->dump_int("epoch", epoch);
   f->dump_unsigned("flags", flags);
+  f->dump_unsigned("ever_allowed_features", ever_allowed_features);
+  f->dump_unsigned("explicitly_allowed_features", explicitly_allowed_features);
   f->dump_stream("created") << created;
   f->dump_stream("modified") << modified;
   f->dump_int("tableserver", tableserver);
@@ -547,7 +549,7 @@ void MDSMap::encode(bufferlist& bl, uint64_t features) const
   ::encode(cas_pool, bl);
 
   // kclient ignores everything from here
-  __u16 ev = 9;
+  __u16 ev = 10;
   ::encode(ev, bl);
   ::encode(compat, bl);
   ::encode(metadata_pool, bl);
@@ -560,8 +562,8 @@ void MDSMap::encode(bufferlist& bl, uint64_t features) const
   ::encode(failed, bl);
   ::encode(stopped, bl);
   ::encode(last_failure_osd_epoch, bl);
-  ::encode(ever_allowed_snaps, bl);
-  ::encode(explicitly_allowed_snaps, bl);
+  ::encode(ever_allowed_features, bl);
+  ::encode(explicitly_allowed_features, bl);
   ::encode(inline_data_enabled, bl);
   ::encode(enabled, bl);
   ::encode(fs_name, bl);
@@ -624,11 +626,27 @@ void MDSMap::decode(bufferlist::iterator& p)
   if (ev >= 4)
     ::decode(last_failure_osd_epoch, p);
   if (ev >= 6) {
-    ::decode(ever_allowed_snaps, p);
-    ::decode(explicitly_allowed_snaps, p);
+    if (ev < 10) {
+      // previously this was a bool about snaps, not a flag map
+      bool flag;
+      ::decode(flag, p);
+      ever_allowed_features = flag ? CEPH_MDSMAP_ALLOW_SNAPS : 0;
+      ever_allowed_features |= CEPH_MDSMAP_ALLOW_MULTIMDS|CEPH_MDSMAP_ALLOW_DIRFRAGS;
+      ::decode(flag, p);
+      explicitly_allowed_features = flag ? CEPH_MDSMAP_ALLOW_SNAPS : 0;
+      if (max_mds > 1) {
+	set_multimds_allowed();
+      }
+    } else {
+      ::decode(ever_allowed_features, p);
+      ::decode(explicitly_allowed_features, p);
+    }
   } else {
-    ever_allowed_snaps = true;
-    explicitly_allowed_snaps = false;
+    ever_allowed_features = CEPH_MDSMAP_ALLOW_CLASSICS;
+    explicitly_allowed_features = 0;
+    if (max_mds > 1) {
+      set_multimds_allowed();
+    }
   }
   if (ev >= 7)
     ::decode(inline_data_enabled, p);
diff --git a/src/mds/MDSMap.h b/src/mds/MDSMap.h
index 2842f93..bb69a75 100644
--- a/src/mds/MDSMap.h
+++ b/src/mds/MDSMap.h
@@ -209,8 +209,8 @@ protected:
   std::map<mds_rank_t, mds_gid_t> up;        // who is in those roles
   std::map<mds_gid_t, mds_info_t> mds_info;
 
-  bool ever_allowed_snaps; //< the cluster has ever allowed snap creation
-  bool explicitly_allowed_snaps; //< the user has explicitly enabled snap creation
+  uint8_t ever_allowed_features; //< bitmap of features the cluster has allowed
+  uint8_t explicitly_allowed_features; //< bitmap of features explicitly enabled 
 
   bool inline_data_enabled;
 
@@ -235,8 +235,8 @@ public:
       cas_pool(-1),
       metadata_pool(0),
       max_mds(0),
-      ever_allowed_snaps(false),
-      explicitly_allowed_snaps(false),
+      ever_allowed_features(0),
+      explicitly_allowed_features(0),
       inline_data_enabled(false),
       cached_up_features(0)
   { }
@@ -259,11 +259,27 @@ public:
 
   void set_snaps_allowed() {
     set_flag(CEPH_MDSMAP_ALLOW_SNAPS);
-    ever_allowed_snaps = true;
-    explicitly_allowed_snaps = true;
+    ever_allowed_features |= CEPH_MDSMAP_ALLOW_SNAPS;
+    explicitly_allowed_features |= CEPH_MDSMAP_ALLOW_SNAPS;
   }
-  bool allows_snaps() { return test_flag(CEPH_MDSMAP_ALLOW_SNAPS); }
   void clear_snaps_allowed() { clear_flag(CEPH_MDSMAP_ALLOW_SNAPS); }
+  bool allows_snaps() const { return test_flag(CEPH_MDSMAP_ALLOW_SNAPS); }
+
+  void set_multimds_allowed() {
+    set_flag(CEPH_MDSMAP_ALLOW_MULTIMDS);
+    ever_allowed_features |= CEPH_MDSMAP_ALLOW_MULTIMDS;
+    explicitly_allowed_features |= CEPH_MDSMAP_ALLOW_MULTIMDS;
+  }
+  void clear_multimds_allowed() { clear_flag(CEPH_MDSMAP_ALLOW_MULTIMDS); }
+  bool allows_multimds() const { return test_flag(CEPH_MDSMAP_ALLOW_MULTIMDS); }
+
+  void set_dirfrags_allowed() {
+    set_flag(CEPH_MDSMAP_ALLOW_DIRFRAGS);
+    ever_allowed_features |= CEPH_MDSMAP_ALLOW_DIRFRAGS;
+    explicitly_allowed_features |= CEPH_MDSMAP_ALLOW_DIRFRAGS;
+  }
+  void clear_dirfrags_allowed() { clear_flag(CEPH_MDSMAP_ALLOW_DIRFRAGS); }
+  bool allows_dirfrags() const { return test_flag(CEPH_MDSMAP_ALLOW_DIRFRAGS); }
 
   epoch_t get_epoch() const { return epoch; }
   void inc_epoch() { epoch++; }
diff --git a/src/mds/MDSRank.cc b/src/mds/MDSRank.cc
index 71e4925..feb4897 100644
--- a/src/mds/MDSRank.cc
+++ b/src/mds/MDSRank.cc
@@ -502,7 +502,8 @@ bool MDSRank::_dispatch(Message *m, bool new_msg)
     if (!dir->get_parent_dir()) continue;    // must be linked.
     if (!dir->is_auth()) continue;           // must be auth.
     frag_t fg = dir->get_frag();
-    if (fg == frag_t() || (rand() % (1 << fg.bits()) == 0))
+    if (mdsmap->allows_dirfrags() &&
+	(fg == frag_t() || (rand() % (1 << fg.bits()) == 0)))
       mdcache->split_dir(dir, 1);
     else
       balancer->queue_merge(dir);
@@ -2166,6 +2167,11 @@ bool MDSRank::command_dirfrag_split(
     cmdmap_t cmdmap,
     std::ostream &ss)
 {
+  if (!mdsmap->allows_dirfrags()) {
+    ss << "dirfrags are disallowed by the mds map!";
+    return false;
+  }
+
   int64_t by = 0;
   if (!cmd_getval(g_ceph_context, cmdmap, "bits", by)) {
     ss << "missing bits argument";
@@ -2467,25 +2473,27 @@ bool MDSRankDispatcher::handle_command_legacy(std::vector<std::string> args)
       dout(20) << "try_eval(" << inum << ", " << mask << ")" << dendl;
     } else dout(15) << "inode " << inum << " not in mdcache!" << dendl;
   } else if (args[0] == "fragment_dir") {
-    if (args.size() == 4) {
-      filepath fp(args[1].c_str());
-      CInode *in = mdcache->cache_traverse(fp);
-      if (in) {
-	frag_t fg;
-	if (fg.parse(args[2].c_str())) {
-	  CDir *dir = in->get_dirfrag(fg);
-	  if (dir) {
-	    if (dir->is_auth()) {
-	      int by = atoi(args[3].c_str());
-	      if (by)
-		mdcache->split_dir(dir, by);
-	      else
-		dout(0) << "need to split by >0 bits" << dendl;
-	    } else dout(0) << "dir " << dir->dirfrag() << " not auth" << dendl;
-	  } else dout(0) << "dir " << in->ino() << " " << fg << " dne" << dendl;
-	} else dout(0) << " frag " << args[2] << " does not parse" << dendl;
-      } else dout(0) << "path " << fp << " not found" << dendl;
-    } else dout(0) << "bad syntax" << dendl;
+    if (mdsmap->allows_dirfrags()) {  // only split when the MDS map permits dirfrags
+      if (args.size() == 4) {
+	filepath fp(args[1].c_str());
+	CInode *in = mdcache->cache_traverse(fp);
+	if (in) {
+	  frag_t fg;
+	  if (fg.parse(args[2].c_str())) {
+	    CDir *dir = in->get_dirfrag(fg);
+	    if (dir) {
+	      if (dir->is_auth()) {
+		int by = atoi(args[3].c_str());
+		if (by)
+		  mdcache->split_dir(dir, by);
+		else
+		  dout(0) << "need to split by >0 bits" << dendl;
+	      } else dout(0) << "dir " << dir->dirfrag() << " not auth" << dendl;
+	    } else dout(0) << "dir " << in->ino() << " " << fg << " dne" << dendl;
+	  } else dout(0) << " frag " << args[2] << " does not parse" << dendl;
+	} else dout(0) << "path " << fp << " not found" << dendl;
+      } else dout(0) << "bad syntax" << dendl;
+    } else dout(0) << "dirfrags are disallowed by the mds map!" << dendl;
   } else if (args[0] == "merge_dir") {
     if (args.size() == 3) {
       filepath fp(args[1].c_str());
diff --git a/src/mds/events/ESessions.h b/src/mds/events/ESessions.h
index 35a6ce7..a9a834c 100644
--- a/src/mds/events/ESessions.h
+++ b/src/mds/events/ESessions.h
@@ -28,7 +28,7 @@ public:
   map<client_t,entity_inst_t> client_map;
   bool old_style_encode;
 
-  ESessions() : LogEvent(EVENT_SESSIONS), old_style_encode(false) { }
+  ESessions() : LogEvent(EVENT_SESSIONS), cmapv(0), old_style_encode(false) { }
   ESessions(version_t pv, map<client_t,entity_inst_t>& cm) :
     LogEvent(EVENT_SESSIONS),
     cmapv(pv),
diff --git a/src/mds/journal.cc b/src/mds/journal.cc
index 64d4d2a..0f27971 100644
--- a/src/mds/journal.cc
+++ b/src/mds/journal.cc
@@ -572,6 +572,25 @@ void EMetaBlob::fullbit::update_inode(MDSRank *mds, CInode *in)
    */
   in->oldest_snap = oldest_snap;
   in->decode_snap_blob(snapbl);
+
+  /*
+   * In case there was anything malformed in the journal that we are
+   * replaying, do sanity checks on the inodes we're replaying and
+   * go damaged instead of letting any trash into a live cache
+   */
+  if (in->is_file()) {
+    // Files must have valid layouts with a pool set
+    if (in->inode.layout.pool_id == -1 || !in->inode.layout.is_valid()) {
+      dout(0) << "EMetaBlob.replay invalid layout on ino " << *in
+              << ": " << in->inode.layout << dendl;
+      std::ostringstream oss;
+      oss << "Invalid layout for inode 0x" << std::hex << in->inode.ino
+          << std::dec << " in journal";
+      mds->clog->error() << oss.str();
+      mds->damaged();
+      assert(0);  // Should be unreachable because damaged() calls respawn()
+    }
+  }
 }
 
 // EMetaBlob::remotebit
diff --git a/src/messages/MMDSBeacon.h b/src/messages/MMDSBeacon.h
index a155075..727aaad 100644
--- a/src/messages/MMDSBeacon.h
+++ b/src/messages/MMDSBeacon.h
@@ -37,7 +37,8 @@ enum mds_metric_t {
   MDS_HEALTH_CLIENT_LATE_RELEASE_MANY,
   MDS_HEALTH_CLIENT_OLDEST_TID,
   MDS_HEALTH_CLIENT_OLDEST_TID_MANY,
-  MDS_HEALTH_DAMAGE
+  MDS_HEALTH_DAMAGE,
+  MDS_HEALTH_READ_ONLY
 };
 
 /**
diff --git a/src/mon/MDSMonitor.cc b/src/mon/MDSMonitor.cc
index c5ed1a2..00f06a0 100644
--- a/src/mon/MDSMonitor.cc
+++ b/src/mon/MDSMonitor.cc
@@ -70,6 +70,11 @@ template<> bool cmd_getval(CephContext *cct, const cmdmap_t& cmdmap,
   return cmd_getval(cct, cmdmap, k, (int64_t&)val);
 }
 
+static const string EXPERIMENTAL_WARNING("Warning! This feature is experimental. "  // literals are concatenated as-is
+"It may cause problems up to and including data loss. "
+"Consult the documentation at ceph.com, and if unsure, do not proceed. "
+"Add --yes-i-really-mean-it if you are certain.");
+
 static const string MDS_METADATA_PREFIX("mds_metadata");
 
 
@@ -1508,6 +1513,9 @@ class FlagSetHandler : public FileSystemCommandHandler
     string flag_val;
     cmd_getval(g_ceph_context, cmdmap, "val", flag_val);
 
+    string confirm;
+    cmd_getval(g_ceph_context, cmdmap, "confirm", confirm);
+
     if (flag_name == "enable_multiple") {
       bool flag_bool = false;
       int r = parse_bool(flag_val, &flag_bool, ss);
@@ -1521,7 +1529,9 @@ class FlagSetHandler : public FileSystemCommandHandler
         ss << "Multiple-filesystems are forbidden until all mons are updated";
         return -EINVAL;
       }
-
+      if (confirm != "--yes-i-really-mean-it") {
+	ss << EXPERIMENTAL_WARNING;
+      }
       fsmap.set_enable_multiple(flag_bool);
       return 0;
     } else {
@@ -1740,6 +1750,17 @@ int MDSMonitor::management_command(
     // Persist the new FSMap
     pending_fsmap.filesystems[new_fs->fscid] = new_fs;
     return 0;
+  } else if (prefix == "fs set_default") {
+    string fs_name;
+    cmd_getval(g_ceph_context, cmdmap, "fs_name", fs_name);
+    auto fs = pending_fsmap.get_filesystem(fs_name);
+    if (fs == nullptr) {
+        ss << "filesystem '" << fs_name << "' does not exist";
+        return -ENOENT;
+    }
+
+    pending_fsmap.legacy_client_fscid = fs->fscid;
+    return 0;
   } else {
     return -ENOSYS;
   }
@@ -1812,6 +1833,11 @@ public:
       if (interr.length()) {
 	return -EINVAL;
       }
+      if (!fs->mds_map.allows_multimds() && n > fs->mds_map.get_max_mds() &&
+	  n > 1) {
+	ss << "multi-MDS clusters are not enabled; set 'allow_multimds' to enable";
+	return -EINVAL;
+      }
       if (n > MAX_MDS) {
         ss << "may not have more than " << MAX_MDS << " MDS ranks";
         return -EINVAL;
@@ -1833,7 +1859,7 @@ public:
 	string confirm;
 	if (!cmd_getval(g_ceph_context, cmdmap, "confirm", confirm) ||
 	    confirm != "--yes-i-really-mean-it") {
-	  ss << "inline data is new and experimental; you must specify --yes-i-really-mean-it";
+	  ss << EXPERIMENTAL_WARNING;
 	  return -EPERM;
 	}
 	ss << "inline data enabled";
@@ -1892,7 +1918,7 @@ public:
 	string confirm;
 	if (!cmd_getval(g_ceph_context, cmdmap, "confirm", confirm) ||
 	    confirm != "--yes-i-really-mean-it") {
-	  ss << "Snapshots are unstable and will probably break your FS! Set to --yes-i-really-mean-it if you are sure you want to enable them";
+	  ss << EXPERIMENTAL_WARNING;
 	  return -EPERM;
 	}
         fsmap.modify_filesystem(
@@ -1903,6 +1929,64 @@ public:
         });
 	ss << "enabled new snapshots";
       }
+    } else if (var == "allow_multimds") {
+      bool enable_multimds = false;
+      int r = parse_bool(val, &enable_multimds, ss);
+      if (r != 0) {
+	return r;
+      }
+
+      if (!enable_multimds) {
+	fsmap.modify_filesystem(fs->fscid,
+	     [](std::shared_ptr<Filesystem> fs)
+		{
+		  fs->mds_map.clear_multimds_allowed();
+		});
+	ss << "disallowed increasing the cluster size past 1";
+      } else {
+	string confirm;
+	if (!cmd_getval(g_ceph_context, cmdmap, "confirm", confirm) ||
+	    confirm != "--yes-i-really-mean-it") {
+	  ss << EXPERIMENTAL_WARNING;
+	  return -EPERM;
+	}
+        fsmap.modify_filesystem(
+            fs->fscid,
+            [](std::shared_ptr<Filesystem> fs)
+        {
+          fs->mds_map.set_multimds_allowed();
+        });
+	ss << "enabled creation of more than 1 active MDS";
+      }
+    } else if (var == "allow_dirfrags") {
+      bool enable_dirfrags = false;
+      int r = parse_bool(val, &enable_dirfrags, ss);
+      if (r != 0) {
+	return r;
+      }
+
+      if (!enable_dirfrags) {
+	fsmap.modify_filesystem(fs->fscid,
+	     [](std::shared_ptr<Filesystem> fs)
+		{
+		  fs->mds_map.clear_dirfrags_allowed();
+		});
+	ss << "disallowed new directory fragmentation";
+      } else {
+	string confirm;
+	if (!cmd_getval(g_ceph_context, cmdmap, "confirm", confirm) ||
+	    confirm != "--yes-i-really-mean-it") {
+	  ss << EXPERIMENTAL_WARNING;
+	  return -EPERM;
+	}
+        fsmap.modify_filesystem(
+            fs->fscid,
+            [](std::shared_ptr<Filesystem> fs)
+        {
+          fs->mds_map.set_dirfrags_allowed();
+        });
+	ss << "enabled directory fragmentation";
+      }
     } else if (var == "cluster_down") {
       bool is_down = false;
       int r = parse_bool(val, &is_down, ss);
@@ -2329,6 +2413,17 @@ int MDSMonitor::legacy_filesystem_command(
     if (!cmd_getval(g_ceph_context, cmdmap, "maxmds", maxmds) || maxmds < 0) {
       return -EINVAL;
     }
+
+    const MDSMap& mdsmap =
+      pending_fsmap.filesystems.at(pending_fsmap.legacy_client_fscid)->mds_map;
+      
+    if (!mdsmap.allows_multimds() &&
+	maxmds > mdsmap.get_max_mds() &&
+	maxmds > 1) {
+      ss << "multi-MDS clusters are not enabled; set 'allow_multimds' to enable";
+      return -EINVAL;
+    }
+
     if (maxmds > MAX_MDS) {
       ss << "may not have more than " << MAX_MDS << " MDS ranks";
       return -EINVAL;
diff --git a/src/mon/MonCommands.h b/src/mon/MonCommands.h
index c7e923f..b3d8e14 100644
--- a/src/mon/MonCommands.h
+++ b/src/mon/MonCommands.h
@@ -329,7 +329,8 @@ COMMAND("mds set_max_mds " \
 	"name=maxmds,type=CephInt,range=0", \
 	"set max MDS index", "mds", "rw", "cli,rest")
 COMMAND("mds set " \
-	"name=var,type=CephChoices,strings=max_mds|max_file_size|allow_new_snaps|inline_data " \
+	"name=var,type=CephChoices,strings=max_mds|max_file_size"
+	"|allow_new_snaps|inline_data|allow_multimds|allow_dirfrags " \
 	"name=val,type=CephString "					\
 	"name=confirm,type=CephString,req=false",			\
 	"set mds parameter <var> to <val>", "mds", "rw", "cli,rest")
@@ -397,12 +398,13 @@ COMMAND("fs get name=fs_name,type=CephString", \
 COMMAND("fs set " \
 	"name=fs_name,type=CephString " \
 	"name=var,type=CephChoices,strings=max_mds|max_file_size"
-        "|allow_new_snaps|inline_data|cluster_down " \
+        "|allow_new_snaps|inline_data|cluster_down|allow_multimds|allow_dirfrags " \
 	"name=val,type=CephString "					\
 	"name=confirm,type=CephString,req=false",			\
 	"set mds parameter <var> to <val>", "mds", "rw", "cli,rest")
 COMMAND("fs flag set name=flag_name,type=CephChoices,strings=enable_multiple "
-        "name=val,type=CephString", \
+        "name=val,type=CephString " \
+	"name=confirm,type=CephChoices,strings=--yes-i-really-mean-it,req=false", \
 	"Set a global CephFS flag", \
 	"fs", "rw", "cli,rest")
 COMMAND("fs add_data_pool name=fs_name,type=CephString " \
@@ -411,6 +413,9 @@ COMMAND("fs add_data_pool name=fs_name,type=CephString " \
 COMMAND("fs rm_data_pool name=fs_name,type=CephString " \
 	"name=pool,type=CephString", \
 	"remove data pool <pool>", "mds", "rw", "cli,rest")
+COMMAND("fs set_default name=fs_name,type=CephString", \
+	"set the default to the named filesystem", \
+	"fs", "rw", "cli,rest")
 
 /*
  * Monmap commands
diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc
index 8114154..f9cf0fd 100644
--- a/src/mon/OSDMonitor.cc
+++ b/src/mon/OSDMonitor.cc
@@ -2564,7 +2564,8 @@ void OSDMonitor::send_incremental(epoch_t first,
   }
 
   while (first <= osdmap.get_epoch()) {
-    epoch_t last = MIN(first + g_conf->osd_map_message_max, osdmap.get_epoch());
+    epoch_t last = MIN(first + g_conf->osd_map_message_max - 1,
+		       osdmap.get_epoch());
     MOSDMap *m = build_incremental(first, last);
 
     if (req) {
@@ -2955,6 +2956,16 @@ void OSDMonitor::get_health(list<pair<health_status_t,string> >& summary,
       }
     }
 
+    // Not using 'sortbitwise' and should be?
+    if (g_conf->mon_warn_on_no_sortbitwise &&
+	!osdmap.test_flag(CEPH_OSDMAP_SORTBITWISE) &&
+	(osdmap.get_features(CEPH_ENTITY_TYPE_OSD, NULL) &
+	 CEPH_FEATURE_OSD_BITWISE_HOBJ_SORT)) {
+      ostringstream ss;
+      ss << "no legacy OSD present but 'sortbitwise' flag is not set";
+      summary.push_back(make_pair(HEALTH_WARN, ss.str()));
+    }
+
     // Warn if 'mon_osd_down_out_interval' is set to zero.
     // Having this option set to zero on the leader acts much like the
     // 'noout' flag.  It's hard to figure out what's going wrong with clusters
diff --git a/src/os/ObjectStore.cc b/src/os/ObjectStore.cc
index f319e76..d03ab3b 100644
--- a/src/os/ObjectStore.cc
+++ b/src/os/ObjectStore.cc
@@ -86,6 +86,7 @@ ObjectStore *ObjectStore::create(CephContext *cct,
 }
 
 int ObjectStore::probe_block_device_fsid(
+  CephContext *cct,
   const string& path,
   uuid_d *fsid)
 {
@@ -95,14 +96,20 @@ int ObjectStore::probe_block_device_fsid(
   // first try bluestore -- it has a crc on its header and will fail
   // reliably.
   r = BlueStore::get_block_device_fsid(path, fsid);
-  if (r == 0)
+  if (r == 0) {
+    lgeneric_dout(cct, 0) << __func__ << " " << path << " is bluestore, "
+			  << *fsid << dendl;
     return r;
+  }
 #endif
 
   // okay, try FileStore (journal).
   r = FileStore::get_block_device_fsid(path, fsid);
-  if (r == 0)
+  if (r == 0) {
+    lgeneric_dout(cct, 0) << __func__ << " " << path << " is filestore, "
+			  << *fsid << dendl;
     return r;
+  }
 
   return -EINVAL;
 }
diff --git a/src/os/ObjectStore.h b/src/os/ObjectStore.h
index c561d31..93ae4bb 100644
--- a/src/os/ObjectStore.h
+++ b/src/os/ObjectStore.h
@@ -119,8 +119,10 @@ public:
    * @param path path to device
    * @param fsid [out] osd uuid
    */
-  static int probe_block_device_fsid(const string& path,
-				     uuid_d *fsid);
+  static int probe_block_device_fsid(
+    CephContext *cct,
+    const string& path,
+    uuid_d *fsid);
 
   Logger *logger;
 
@@ -1926,7 +1928,15 @@ public:
   virtual int fsck() {
     return -EOPNOTSUPP;
   }
-  virtual unsigned get_max_object_name_length() = 0;
+
+  /**
+   * Returns 0 if the hobject is valid, -error otherwise
+   *
+   * Errors:
+   * -ENAMETOOLONG: locator/namespace/name too large
+   */
+  virtual int validate_hobject_key(const hobject_t &obj) const = 0;
+
   virtual unsigned get_max_attr_name_length() = 0;
   virtual int mkfs() = 0;  // wipe
   virtual int mkjournal() = 0; // journal only
diff --git a/src/os/bluestore/BlueFS.cc b/src/os/bluestore/BlueFS.cc
index 8a9118f..41ae52a 100644
--- a/src/os/bluestore/BlueFS.cc
+++ b/src/os/bluestore/BlueFS.cc
@@ -5,6 +5,7 @@
 
 #include "common/debug.h"
 #include "common/errno.h"
+#include "common/perf_counters.h"
 #include "BlockDevice.h"
 #include "Allocator.h"
 #include "StupidAllocator.h"
@@ -15,23 +16,76 @@
 #define dout_prefix *_dout << "bluefs "
 
 BlueFS::BlueFS()
-  : ino_last(0),
+  : logger(NULL),
+    ino_last(0),
     log_seq(0),
-    log_writer(NULL)
+    log_writer(NULL),
+    bdev(MAX_BDEV),
+    ioc(MAX_BDEV),
+    block_all(MAX_BDEV),
+    block_total(MAX_BDEV, 0)
 {
 }
 
 BlueFS::~BlueFS()
 {
   for (auto p : bdev) {
-    p->close();
-    delete p;
+    if (p) {
+      p->close();
+      delete p;
+    }
   }
   for (auto p : ioc) {
     delete p;
   }
 }
 
+void BlueFS::_init_logger()
+{
+  PerfCountersBuilder b(g_ceph_context, "BlueFS",
+                        l_bluefs_first, l_bluefs_last);  // index range covered by the builder
+  b.add_u64_counter(l_bluefs_gift_bytes, "gift_bytes", "Bytes gifted from BlueStore");
+  b.add_u64_counter(l_bluefs_reclaim_bytes, "reclaim_bytes", "Bytes reclaimed by BlueStore");
+  b.add_u64(l_bluefs_db_total_bytes, "db_total_bytes", "Total bytes (main db device)");
+  b.add_u64(l_bluefs_db_free_bytes, "db_free_bytes", "Free bytes (main db device)");
+  b.add_u64(l_bluefs_wal_total_bytes, "wal_total_bytes", "Total bytes (wal device)");
+  b.add_u64(l_bluefs_wal_free_bytes, "wal_free_bytes", "Free bytes (wal device)");
+  b.add_u64(l_bluefs_slow_total_bytes, "slow_total_bytes", "Total bytes (slow device)");
+  b.add_u64(l_bluefs_slow_free_bytes, "slow_free_bytes", "Free bytes (slow device)");
+  b.add_u64(l_bluefs_num_files, "num_files", "File count");
+  b.add_u64(l_bluefs_log_bytes, "log_bytes", "Size of the metadata log");
+  b.add_u64_counter(l_bluefs_log_compactions, "log_compactions", "Compactions of the metadata log");
+  b.add_u64_counter(l_bluefs_logged_bytes, "logged_bytes", "Bytes written to the metadata log");
+  logger = b.create_perf_counters();  // deleted again in _shutdown_logger()
+  g_ceph_context->get_perfcounters_collection()->add(logger);  // paired with remove() in _shutdown_logger()
+}
+
+void BlueFS::_shutdown_logger()
+{
+  g_ceph_context->get_perfcounters_collection()->remove(logger);  // undo the add() from _init_logger()
+  delete logger; logger = NULL;  // add_block_extent()/reclaim_blocks() test logger against NULL
+}
+
+void BlueFS::_update_logger_stats()
+{
+  // publish file count, log size and per-device totals; we must be holding the lock
+  logger->set(l_bluefs_num_files, file_map.size());
+  logger->set(l_bluefs_log_bytes, log_writer->file->fnode.size);
+
+  if (alloc[BDEV_WAL]) {  // slot is only populated by _init_alloc() when the device exists
+    logger->set(l_bluefs_wal_total_bytes, block_total[BDEV_WAL]);
+    logger->set(l_bluefs_wal_free_bytes, alloc[BDEV_WAL]->get_free());
+  }
+  if (alloc[BDEV_DB]) {
+    logger->set(l_bluefs_db_total_bytes, block_total[BDEV_DB]);
+    logger->set(l_bluefs_db_free_bytes, alloc[BDEV_DB]->get_free());
+  }
+  if (alloc[BDEV_SLOW]) {
+    logger->set(l_bluefs_slow_total_bytes, block_total[BDEV_SLOW]);
+    logger->set(l_bluefs_slow_free_bytes, alloc[BDEV_SLOW]->get_free());
+  }
+}
+
 /*static void aio_cb(void *priv, void *priv2)
 {
   BlueFS *fs = static_cast<BlueFS*>(priv);
@@ -42,7 +96,8 @@ BlueFS::~BlueFS()
 int BlueFS::add_block_device(unsigned id, string path)
 {
   dout(10) << __func__ << " bdev " << id << " path " << path << dendl;
-  assert(id == bdev.size());
+  assert(id < bdev.size());
+  assert(bdev[id] == NULL);
   BlockDevice *b = BlockDevice::create(path, NULL, NULL); //aio_cb, this);
   int r = b->open(path);
   if (r < 0) {
@@ -51,15 +106,16 @@ int BlueFS::add_block_device(unsigned id, string path)
   }
   dout(1) << __func__ << " bdev " << id << " path " << path
 	  << " size " << pretty_si_t(b->get_size()) << "B" << dendl;
-  bdev.push_back(b);
-  ioc.push_back(new IOContext(NULL));
-  block_all.resize(bdev.size());
+  bdev[id] = b;
+  ioc[id] = new IOContext(NULL);
   return 0;
 }
 
 uint64_t BlueFS::get_block_device_size(unsigned id)
 {
-  return bdev[id]->get_size();
+  if (bdev[id])
+    return bdev[id]->get_size();
+  return 0;
 }
 
 void BlueFS::add_block_extent(unsigned id, uint64_t offset, uint64_t length)
@@ -68,8 +124,10 @@ void BlueFS::add_block_extent(unsigned id, uint64_t offset, uint64_t length)
   dout(1) << __func__ << " bdev " << id << " " << offset << "~" << length
 	  << dendl;
   assert(id < bdev.size());
+  assert(bdev[id]);
   assert(bdev[id]->get_size() >= offset + length);
   block_all[id].insert(offset, length);
+  block_total[id] += length;
 
   if (alloc.size()) {
     log_t.op_alloc_add(id, offset, length);
@@ -77,6 +135,9 @@ void BlueFS::add_block_extent(unsigned id, uint64_t offset, uint64_t length)
     assert(r == 0);
     alloc[id]->init_add_free(offset, length);
   }
+
+  if (logger)
+    logger->inc(l_bluefs_gift_bytes, length);
   dout(10) << __func__ << " done" << dendl;
 }
 
@@ -86,6 +147,7 @@ int BlueFS::reclaim_blocks(unsigned id, uint64_t want,
   std::lock_guard<std::mutex> l(lock);
   dout(1) << __func__ << " bdev " << id << " want " << want << dendl;
   assert(id < alloc.size());
+  assert(alloc[id]);
   int r = alloc[id]->reserve(want);
   assert(r == 0); // caller shouldn't ask for more than they can get
 
@@ -96,10 +158,13 @@ int BlueFS::reclaim_blocks(unsigned id, uint64_t want,
     alloc[id]->unreserve(want - *length);
 
   block_all[id].erase(*offset, *length);
+  block_total[id] -= *length;
   log_t.op_alloc_rm(id, *offset, *length);
   r = _flush_log();
   assert(r == 0);
 
+  if (logger)
+    logger->inc(l_bluefs_reclaim_bytes, *length);
   dout(1) << __func__ << " bdev " << id << " want " << want
 	  << " got " << *offset << "~" << *length << dendl;
   return 0;
@@ -109,12 +174,7 @@ uint64_t BlueFS::get_total(unsigned id)
 {
   std::lock_guard<std::mutex> l(lock);
   assert(id < block_all.size());
-  uint64_t r = 0;
-  interval_set<uint64_t>& p = block_all[id];
-  for (interval_set<uint64_t>::iterator q = p.begin(); q != p.end(); ++q) {
-    r += q.get_len();
-  }
-  return r;
+  return block_total[id];
 }
 
 uint64_t BlueFS::get_free(unsigned id)
@@ -129,14 +189,14 @@ void BlueFS::get_usage(vector<pair<uint64_t,uint64_t>> *usage)
   std::lock_guard<std::mutex> l(lock);
   usage->resize(bdev.size());
   for (unsigned id = 0; id < bdev.size(); ++id) {
-    uint64_t total = 0;
-    interval_set<uint64_t>& p = block_all[id];
-    for (interval_set<uint64_t>::iterator q = p.begin(); q != p.end(); ++q) {
-      total += q.get_len();
+    if (!bdev[id]) {
+      (*usage)[id] = make_pair(0, 0);
+      continue;
     }
     (*usage)[id].first = alloc[id]->get_free();
-    (*usage)[id].second = total;
-    uint64_t used = (total - (*usage)[id].first) * 100 / total;
+    (*usage)[id].second = block_total[id];
+    uint64_t used = block_total[id] == 0 ? 0 :  // device may own no extents; avoid dividing by zero
+      (block_total[id] - (*usage)[id].first) * 100 / block_total[id];
     dout(10) << __func__ << " bdev " << id
 	     << " free " << (*usage)[id].first
 	     << " (" << pretty_si_t((*usage)[id].first) << "B)"
@@ -162,12 +222,12 @@ int BlueFS::mkfs(uuid_d osd_uuid)
   dout(1) << __func__
 	  << " osd_uuid " << osd_uuid
 	  << dendl;
-  assert(bdev.size() >= 1);
 
   _init_alloc();
+  _init_logger();
 
   super.version = 1;
-  super.block_size = bdev[0]->get_block_size();
+  super.block_size = bdev[BDEV_DB]->get_block_size();
   super.osd_uuid = osd_uuid;
   super.uuid.generate_random();
   dout(1) << __func__ << " uuid " << super.uuid << dendl;
@@ -175,17 +235,20 @@ int BlueFS::mkfs(uuid_d osd_uuid)
   // init log
   FileRef log_file = new File;
   log_file->fnode.ino = 1;
-  log_file->fnode.prefer_bdev = bdev.size() - 1;
-  int r = _allocate(log_file->fnode.prefer_bdev,
-	    g_conf->bluefs_max_log_runway,
-	    &log_file->fnode.extents);
+  log_file->fnode.prefer_bdev = BDEV_WAL;
+  int r = _allocate(
+    log_file->fnode.prefer_bdev,
+    g_conf->bluefs_max_log_runway,
+    &log_file->fnode.extents);
   assert(r == 0);
-  log_writer = new FileWriter(log_file, bdev.size());
+  log_writer = _create_writer(log_file);
 
   // initial txn
   log_t.op_init();
-  for (unsigned bdev = 0; bdev < block_all.size(); ++bdev) {
+  for (unsigned bdev = 0; bdev < MAX_BDEV; ++bdev) {
     interval_set<uint64_t>& p = block_all[bdev];
+    if (p.empty())
+      continue;
     for (interval_set<uint64_t>::iterator q = p.begin(); q != p.end(); ++q) {
       dout(20) << __func__ << " op_alloc_add " << bdev << " " << q.get_start()
 	       << "~" << q.get_len() << dendl;
@@ -204,7 +267,9 @@ int BlueFS::mkfs(uuid_d osd_uuid)
   _close_writer(log_writer);
   log_writer = NULL;
   block_all.clear();
+  block_total.clear();
   _stop_alloc();
+  _shutdown_logger();
 
   dout(10) << __func__ << " success" << dendl;
   return 0;
@@ -213,8 +278,10 @@ int BlueFS::mkfs(uuid_d osd_uuid)
 void BlueFS::_init_alloc()
 {
   dout(20) << __func__ << dendl;
-  alloc.resize(bdev.size());
+  alloc.resize(MAX_BDEV);
   for (unsigned id = 0; id < bdev.size(); ++id) {
+    if (!bdev[id])
+      continue;
     alloc[id] = new StupidAllocator;
     interval_set<uint64_t>& p = block_all[id];
     for (interval_set<uint64_t>::iterator q = p.begin(); q != p.end(); ++q) {
@@ -235,7 +302,6 @@ void BlueFS::_stop_alloc()
 int BlueFS::mount()
 {
   dout(1) << __func__ << dendl;
-  assert(!bdev.empty());
 
   int r = _open_super();
   if (r < 0) {
@@ -244,7 +310,9 @@ int BlueFS::mount()
   }
 
   block_all.clear();
-  block_all.resize(bdev.size());
+  block_all.resize(MAX_BDEV);
+  block_total.clear();
+  block_total.resize(MAX_BDEV, 0);
   _init_alloc();
 
   r = _replay();
@@ -263,10 +331,12 @@ int BlueFS::mount()
   }
 
   // set up the log for future writes
-  log_writer = new FileWriter(_get_file(1), bdev.size());
+  log_writer = _create_writer(_get_file(1));
   assert(log_writer->file->fnode.ino == 1);
   log_writer->pos = log_writer->file->fnode.size;
   dout(10) << __func__ << " log write pos set to " << log_writer->pos << dendl;
+
+  _init_logger();
   return 0;
 
  out:
@@ -283,12 +353,12 @@ void BlueFS::umount()
   _close_writer(log_writer);
   log_writer = NULL;
 
-  block_all.clear();
   _stop_alloc();
   file_map.clear();
   dir_map.clear();
   super = bluefs_super_t();
   log_t.clear();
+  _shutdown_logger();
 }
 
 int BlueFS::fsck()
@@ -311,8 +381,8 @@ int BlueFS::_write_super()
   bl.rebuild();
 
   IOContext ioc(NULL);
-  bdev[0]->aio_write(get_super_offset(), bl, &ioc, false);
-  bdev[0]->aio_submit(&ioc);
+  bdev[BDEV_DB]->aio_write(get_super_offset(), bl, &ioc, false);
+  bdev[BDEV_DB]->aio_submit(&ioc);
   ioc.aio_wait();
   dout(20) << __func__ << " v " << super.version << " crc " << crc
 	   << " offset " << get_super_offset() << dendl;
@@ -328,8 +398,8 @@ int BlueFS::_open_super()
   int r;
 
   // always the second block
-  r = bdev[0]->read(get_super_offset(), get_super_length(),
-		    &bl, ioc[0], false);
+  r = bdev[BDEV_DB]->read(get_super_offset(), get_super_length(),
+			  &bl, ioc[BDEV_DB], false);
   if (r < 0)
     return r;
 
@@ -458,6 +528,7 @@ int BlueFS::_replay()
 	  dout(20) << __func__ << " " << pos << ":  op_alloc_add "
 		   << " " << (int)id << ":" << offset << "~" << length << dendl;
 	  block_all[id].insert(offset, length);
+	  block_total[id] += length;
 	  alloc[id]->init_add_free(offset, length);
 	}
 	break;
@@ -472,6 +543,7 @@ int BlueFS::_replay()
 	  dout(20) << __func__ << " " << pos << ":  op_alloc_rm "
 		   << " " << (int)id << ":" << offset << "~" << length << dendl;
 	  block_all[id].erase(offset, length);
+	  block_total[id] -= length;
 	  alloc[id]->init_rm_free(offset, length);
 	}
 	break;
@@ -823,7 +895,7 @@ void BlueFS::_compact_log()
   t.uuid = super.uuid;
   dout(20) << __func__ << " op_init" << dendl;
   t.op_init();
-  for (unsigned bdev = 0; bdev < block_all.size(); ++bdev) {
+  for (unsigned bdev = 0; bdev < MAX_BDEV; ++bdev) {
     interval_set<uint64_t>& p = block_all[bdev];
     for (interval_set<uint64_t>::iterator q = p.begin(); q != p.end(); ++q) {
       dout(20) << __func__ << " op_alloc_add " << bdev << " " << q.get_start()
@@ -868,7 +940,7 @@ void BlueFS::_compact_log()
   _close_writer(log_writer);
 
   log_file->fnode.size = bl.length();
-  log_writer = new FileWriter(log_file, bdev.size());
+  log_writer = _create_writer(log_file);
   log_writer->append(bl);
   int r = _flush(log_writer, true);
   assert(r == 0);
@@ -884,6 +956,8 @@ void BlueFS::_compact_log()
   for (auto& r : old_extents) {
     alloc[r.bdev]->release(r.offset, r.length);
   }
+
+  logger->inc(l_bluefs_log_compactions);
 }
 
 void BlueFS::_pad_bl(bufferlist& bl)
@@ -922,6 +996,8 @@ int BlueFS::_flush_log()
   _pad_bl(bl);
   log_writer->append(bl);
 
+  logger->inc(l_bluefs_logged_bytes, bl.length());
+
   log_t.clear();
   log_t.seq = 0;  // just so debug output is less confusing
 
@@ -941,6 +1017,8 @@ int BlueFS::_flush_log()
     dirty_files.erase(p++);
   }
 
+  _update_logger_stats();
+
   return 0;
 }
 
@@ -1004,7 +1082,9 @@ int BlueFS::_flush_range(FileWriter *h, uint64_t offset, uint64_t length)
     length += partial;
     dout(20) << __func__ << " waiting for previous aio to complete" << dendl;
     for (auto p : h->iocv) {
-      p->aio_wait();
+      if (p) {
+	p->aio_wait();
+      }
     }
   }
   if (length == partial + h->buffer.length()) {
@@ -1044,8 +1124,9 @@ int BlueFS::_flush_range(FileWriter *h, uint64_t offset, uint64_t length)
     ++p;
     x_off = 0;
   }
-  for (unsigned i = 0; i < bdev.size(); ++i) {
-    if (h->iocv[i]->has_aios()) {
+  for (unsigned i = 0; i < MAX_BDEV; ++i) {
+    assert(!bdev[i] || h->iocv[i]);  // must hold before iocv[i] is dereferenced below
+    if (bdev[i] && h->iocv[i]->has_aios()) {
       bdev[i]->aio_submit(h->iocv[i]);
     }
   }
@@ -1058,7 +1139,9 @@ void BlueFS::_flush_wait(FileWriter *h)
   dout(10) << __func__ << " " << h << dendl;
   utime_t start = ceph_clock_now(NULL);
   for (auto p : h->iocv) {
-    p->aio_wait();
+    if (p) {
+      p->aio_wait();
+    }
   }
   utime_t end = ceph_clock_now(NULL);
   utime_t dur = end - start;
@@ -1135,7 +1218,8 @@ void BlueFS::_flush_bdev()
 {
   dout(20) << __func__ << dendl;
   for (auto p : bdev) {
-    p->flush();
+    if (p)
+      p->flush();
   }
 }
 
@@ -1145,16 +1229,24 @@ int BlueFS::_allocate(unsigned id, uint64_t len, vector<bluefs_extent_t> *ev)
   assert(id < alloc.size());
 
   uint64_t left = ROUND_UP_TO(len, g_conf->bluefs_alloc_size);
-  int r = alloc[id]->reserve(left);
+  int r = -ENOSPC;
+  if (alloc[id]) {
+    r = alloc[id]->reserve(left);
+  }
   if (r < 0) {
-    if (id) {
-      derr << __func__ << " failed to allocate " << left << " on bdev " << id
-	   << ", free " << alloc[id]->get_free()
-	   << "; fallback to bdev 0" << dendl;
-      return _allocate(0, len, ev);
+    if (id != BDEV_SLOW) {
+      if (bdev[id])
+	derr << __func__ << " failed to allocate " << left << " on bdev " << id
+	     << ", free " << alloc[id]->get_free()
+	     << "; fallback to bdev " << id + 1 << dendl;
+      return _allocate(id + 1, len, ev);
     }
-    derr << __func__ << " failed to allocate " << left << " on bdev " << id
-	 << ", free " << alloc[id]->get_free() << dendl;
+    if (bdev[id])
+      derr << __func__ << " failed to allocate " << left << " on bdev " << id
+	   << ", free " << alloc[id]->get_free() << dendl;
+    else
+      derr << __func__ << " failed to allocate " << left << " on bdev " << id
+	   << ", dne" << dendl;
     return r;
   }
 
@@ -1208,11 +1300,15 @@ void BlueFS::sync_metadata()
   dout(10) << __func__ << dendl;
   utime_t start = ceph_clock_now(NULL);
   for (auto p : alloc) {
-    p->commit_start();
+    if (p) {
+      p->commit_start();
+    }
   }
   _flush_log();
   for (auto p : alloc) {
-    p->commit_finish();
+    if (p) {
+      p->commit_finish();
+    }
   }
   _maybe_compact_log();
   utime_t end = ceph_clock_now(NULL);
@@ -1276,41 +1372,53 @@ int BlueFS::open_for_write(
     file->fnode.mtime = ceph_clock_now(NULL);
   }
 
+  file->fnode.prefer_bdev = BlueFS::BDEV_DB;
   if (dirname.length() > 5) {
     // the "db.slow" and "db.wal" directory names are hard-coded at
     // match up with bluestore.  the slow device is always the second
     // one (when a dedicated block.db device is present and used at
     // bdev 0).  the wal device is always last.
     if (strcmp(dirname.c_str() + dirname.length() - 5, ".slow") == 0) {
-      assert(bdev.size() > 1);
-      dout(20) << __func__ << " mapping " << dirname << "/" << filename
-	       << " to bdev 1" << dendl;
-      file->fnode.prefer_bdev = 1;
+      file->fnode.prefer_bdev = BlueFS::BDEV_SLOW;
     } else if (strcmp(dirname.c_str() + dirname.length() - 4, ".wal") == 0) {
-      assert(bdev.size() > 1);
-      file->fnode.prefer_bdev = bdev.size() - 1;
-      dout(20) << __func__ << " mapping " << dirname << "/" << filename
-	       << " to bdev " << (int)file->fnode.prefer_bdev << dendl;
+      file->fnode.prefer_bdev = BlueFS::BDEV_WAL;
     }
   }
+  dout(20) << __func__ << " mapping " << dirname << "/" << filename
+	   << " to bdev " << (int)file->fnode.prefer_bdev << dendl;
 
   log_t.op_file_update(file->fnode);
   if (create)
     log_t.op_dir_link(dirname, filename, file->fnode.ino);
 
-  *h = new FileWriter(file, bdev.size());
+  *h = _create_writer(file);
   dout(10) << __func__ << " h " << *h << " on " << file->fnode << dendl;
   return 0;
 }
 
+BlueFS::FileWriter *BlueFS::_create_writer(FileRef f)
+{
+  FileWriter *w = new FileWriter(f);
+  for (unsigned i = 0; i < MAX_BDEV; ++i) {
+    if (bdev[i]) {
+      w->iocv[i] = new IOContext(NULL);
+    } else {
+      w->iocv[i] = NULL;
+    }
+  }
+  return w;
+}
+
 void BlueFS::_close_writer(FileWriter *h)
 {
   dout(10) << __func__ << " " << h << dendl;
-  for (unsigned i=0; i<bdev.size(); ++i) {
-    h->iocv[i]->aio_wait();
-    bdev[i]->queue_reap_ioc(h->iocv[i]);
+  for (unsigned i=0; i<MAX_BDEV; ++i) {
+    if (bdev[i]) {
+      assert(h->iocv[i]);
+      h->iocv[i]->aio_wait();
+      bdev[i]->queue_reap_ioc(h->iocv[i]);
+    }
   }
-  h->iocv.clear();
   delete h;
 }
 
diff --git a/src/os/bluestore/BlueFS.h b/src/os/bluestore/BlueFS.h
index b665bb7..0f9ce7e 100644
--- a/src/os/bluestore/BlueFS.h
+++ b/src/os/bluestore/BlueFS.h
@@ -13,10 +13,34 @@
 #include "boost/intrusive/list.hpp"
 #include <boost/intrusive_ptr.hpp>
 
+class PerfCounters;
+
 class Allocator;
 
+enum {
+  l_bluefs_first = 732600,
+  l_bluefs_gift_bytes,
+  l_bluefs_reclaim_bytes,
+  l_bluefs_db_total_bytes,
+  l_bluefs_db_free_bytes,
+  l_bluefs_wal_total_bytes,
+  l_bluefs_wal_free_bytes,
+  l_bluefs_slow_total_bytes,
+  l_bluefs_slow_free_bytes,
+  l_bluefs_num_files,
+  l_bluefs_log_bytes,
+  l_bluefs_log_compactions,
+  l_bluefs_logged_bytes,
+  l_bluefs_last,
+};
+
 class BlueFS {
 public:
+  static constexpr unsigned MAX_BDEV = 3;
+  static constexpr unsigned BDEV_WAL = 0;
+  static constexpr unsigned BDEV_DB = 1;
+  static constexpr unsigned BDEV_SLOW = 2;
+
   struct File : public RefCountedObject {
     bluefs_fnode_t fnode;
     int refs;
@@ -80,22 +104,17 @@ public:
     bufferlist tail_block;  ///< existing partial block at end of file, if any
 
     std::mutex lock;
-    vector<IOContext*> iocv;  ///< one for each bdev
+    std::array<IOContext*,MAX_BDEV> iocv; ///< for each bdev
 
-    FileWriter(FileRef f, unsigned num_bdev)
+    FileWriter(FileRef f)
       : file(f),
 	pos(0) {
       ++file->num_writers;
-      iocv.resize(num_bdev);
-      for (unsigned i = 0; i < num_bdev; ++i) {
-	iocv[i] = new IOContext(NULL);
-      }
     }
+    // NOTE: caller must call BlueFS::close_writer()
     ~FileWriter() {
       --file->num_writers;
-      assert(iocv.empty());  // caller must call BlueFS::close_writer()
     }
-
     void append(const char *buf, size_t len) {
       buffer.append(buf, len);
     }
@@ -161,6 +180,8 @@ public:
 private:
   std::mutex lock;
 
+  PerfCounters *logger;
+
   // cache
   map<string, DirRef> dir_map;                    ///< dirname -> Dir
   ceph::unordered_map<uint64_t,FileRef> file_map; ///< ino -> File
@@ -173,25 +194,22 @@ private:
   bluefs_transaction_t log_t; ///< pending, unwritten log transaction
 
   /*
-   * - there can be from 1 to 3 block devices.
-   *
-   * - the first device always has the superblock.
-   *
-   * - if there is a dedicated db device, it is the first device, and the
-   *   second device is shared with bluestore.  the first device will be
-   *   db/, and the second device will be db.slow/.
+   * There are up to 3 block devices:
    *
-   * - if there is no dedicated db device, then the first device is shared, and
-   *   maps to the db/ directory.
-   *
-   * - a wal device, if present, it always the last device.  it should be
-   *   used for any files in the db.wal/ directory.
+   *  BDEV_DB   db/      - the primary db device
+   *  BDEV_WAL  db.wal/  - a small, fast device, specifically for the WAL
+   *  BDEV_SLOW db.slow/ - a big, slow device, to spill over to as BDEV_DB fills
    */
   vector<BlockDevice*> bdev;                  ///< block devices we can use
   vector<IOContext*> ioc;                     ///< IOContexts for bdevs
   vector<interval_set<uint64_t> > block_all;  ///< extents in bdev we own
+  vector<uint64_t> block_total;               ///< sum of block_all
   vector<Allocator*> alloc;                   ///< allocators for bdevs
 
+  void _init_logger();
+  void _shutdown_logger();
+  void _update_logger_stats();
+
   void _init_alloc();
   void _stop_alloc();
 
@@ -237,6 +255,7 @@ private:
   int _write_super();
   int _replay(); ///< replay journal
 
+  FileWriter *_create_writer(FileRef f);
   void _close_writer(FileWriter *h);
 
   // always put the super in the second 4k block.  FIXME should this be
diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc
index 6f5710c..451262d 100644
--- a/src/os/bluestore/BlueStore.cc
+++ b/src/os/bluestore/BlueStore.cc
@@ -1145,7 +1145,8 @@ int BlueStore::_open_db(bool create)
     } else if (s == "0") {
       do_bluefs = false;
     } else {
-      derr << __func__ << " bluefs = " << s << " : not 0 or 1, aborting" << dendl;
+      derr << __func__ << " bluefs = " << s << " : not 0 or 1, aborting"
+	   << dendl;
       return -EIO;
     }
   }
@@ -1162,36 +1163,42 @@ int BlueStore::_open_db(bool create)
 
     char bfn[PATH_MAX];
     struct stat st;
-    int id = 0;
 
     snprintf(bfn, sizeof(bfn), "%s/block.db", path.c_str());
     if (::stat(bfn, &st) == 0) {
-      r = bluefs->add_block_device(id, bfn);
+      r = bluefs->add_block_device(BlueFS::BDEV_DB, bfn);
       if (r < 0) {
         derr << __func__ << " add block device(" << bfn << ") returned: " 
              << cpp_strerror(r) << dendl;
         goto free_bluefs;
       }
-      r = _check_or_set_bdev_label(bfn, bluefs->get_block_device_size(id),
+      r = _check_or_set_bdev_label(
+	bfn,
+	bluefs->get_block_device_size(BlueFS::BDEV_DB),
         "bluefs db", create);
       if (r < 0) {
-        derr << __func__ << " check block device(" << bfn << ") label returned: " 
+        derr << __func__
+	     << " check block device(" << bfn << ") label returned: " 
              << cpp_strerror(r) << dendl;
         goto free_bluefs;
       }
       if (create) {
 	bluefs->add_block_extent(
-	  id, BLUEFS_START,
-	  bluefs->get_block_device_size(id) - BLUEFS_START);
+	  BlueFS::BDEV_DB,
+	  BLUEFS_START,
+	  bluefs->get_block_device_size(BlueFS::BDEV_DB) - BLUEFS_START);
       }
-      ++id;
+      bluefs_shared_bdev = BlueFS::BDEV_SLOW;
+    } else {
+      bluefs_shared_bdev = BlueFS::BDEV_DB;
     }
 
+    // shared device
     snprintf(bfn, sizeof(bfn), "%s/block", path.c_str());
-    r = bluefs->add_block_device(id, bfn);
+    r = bluefs->add_block_device(bluefs_shared_bdev, bfn);
     if (r < 0) {
       derr << __func__ << " add block device(" << bfn << ") returned: " 
-           << cpp_strerror(r) << dendl;
+	   << cpp_strerror(r) << dendl;
       goto free_bluefs;
     }
     if (create) {
@@ -1204,21 +1211,23 @@ int BlueStore::_open_db(bool create)
       // align to bluefs's alloc_size
       initial = ROUND_UP_TO(initial, g_conf->bluefs_alloc_size);
       initial += g_conf->bluefs_alloc_size - BLUEFS_START;
-      bluefs->add_block_extent(id, BLUEFS_START, initial);
+      bluefs->add_block_extent(bluefs_shared_bdev, BLUEFS_START, initial);
       bluefs_extents.insert(BLUEFS_START, initial);
     }
-    bluefs_shared_bdev = id;
-    ++id;
-    if (id == 2) {
+
+    // use a short, relative path, if it's bluefs.
+    strcpy(fn, "db");
+
+    if (bluefs_shared_bdev == BlueFS::BDEV_SLOW) {
       // we have both block.db and block; tell rocksdb!
       // note: the second (last) size value doesn't really matter
       char db_paths[PATH_MAX*3];
       snprintf(
-	db_paths, sizeof(db_paths), "%s/db,%lld %s/db.slow,%lld",
-	path.c_str(),
-	(unsigned long long)bluefs->get_block_device_size(0) * 95 / 100,
-	path.c_str(),
-	(unsigned long long)bluefs->get_block_device_size(1) * 95 / 100);
+	db_paths, sizeof(db_paths), "db,%lld db.slow,%lld",
+	(unsigned long long)bluefs->get_block_device_size(BlueFS::BDEV_DB) *
+	 95 / 100,
+	(unsigned long long)bluefs->get_block_device_size(BlueFS::BDEV_SLOW) *
+	 95 / 100);
       g_conf->set_val("rocksdb_db_paths", db_paths, false, false);
       dout(10) << __func__ << " set rocksdb_db_paths to "
 	       << g_conf->rocksdb_db_paths << dendl;
@@ -1226,23 +1235,26 @@ int BlueStore::_open_db(bool create)
 
     snprintf(bfn, sizeof(bfn), "%s/block.wal", path.c_str());
     if (::stat(bfn, &st) == 0) {
-      r = bluefs->add_block_device(id, bfn);
+      r = bluefs->add_block_device(BlueFS::BDEV_WAL, bfn);
       if (r < 0) {
         derr << __func__ << " add block device(" << bfn << ") returned: " 
 	     << cpp_strerror(r) << dendl;
         goto free_bluefs;			
       }
-      r = _check_or_set_bdev_label(bfn, bluefs->get_block_device_size(id),
+      r = _check_or_set_bdev_label(
+	bfn,
+	bluefs->get_block_device_size(BlueFS::BDEV_WAL),
         "bluefs wal", create);
       if (r < 0) {
-        derr << __func__ << " check block device(" << bfn << ") label returned: " 
+        derr << __func__ << " check block device(" << bfn << ") label returned: "
 	     << cpp_strerror(r) << dendl;
         goto free_bluefs;
       }
       if (create) {
 	bluefs->add_block_extent(
-	  id, BDEV_LABEL_BLOCK_SIZE,
-	  bluefs->get_block_device_size(id) - BDEV_LABEL_BLOCK_SIZE);
+	  BlueFS::BDEV_WAL, BDEV_LABEL_BLOCK_SIZE,
+	  bluefs->get_block_device_size(BlueFS::BDEV_WAL) -
+	   BDEV_LABEL_BLOCK_SIZE);
       }
       g_conf->set_val("rocksdb_separate_wal_dir", "true");
     } else {
@@ -1320,7 +1332,8 @@ int BlueStore::_open_db(bool create)
       delete bluefs;
       bluefs = NULL;
     }
-    // delete env manually here since we can't depend on db to do this under this case
+    // delete env manually here since we can't depend on db to do this
+    // under this case
     delete env;
     env = NULL;
     return -EIO;
diff --git a/src/os/bluestore/BlueStore.h b/src/os/bluestore/BlueStore.h
index 828dd91..91aec62 100644
--- a/src/os/bluestore/BlueStore.h
+++ b/src/os/bluestore/BlueStore.h
@@ -667,8 +667,8 @@ public:
 
   int fsck() override;
 
-  unsigned get_max_object_name_length() override {
-    return 4096;
+  int validate_hobject_key(const hobject_t &obj) const override {
+    return 0;
   }
   unsigned get_max_attr_name_length() override {
     return 256;  // arbitrary; there is no real limit internally
diff --git a/src/os/filestore/FileStore.cc b/src/os/filestore/FileStore.cc
index 97b952e..95a4f5f 100644
--- a/src/os/filestore/FileStore.cc
+++ b/src/os/filestore/FileStore.cc
@@ -123,6 +123,12 @@ static CompatSet get_fs_supported_compat_set() {
   return compat;
 }
 
+int FileStore::validate_hobject_key(const hobject_t &obj) const
+{
+  unsigned len = LFNIndex::get_max_escaped_name_len(obj);
+  return len > m_filestore_max_xattr_value_size ? -ENAMETOOLONG : 0;
+}
+
 int FileStore::get_block_device_fsid(const string& path, uuid_d *fsid)
 {
   // make sure we don't try to use aio or direct_io (and get annoying
@@ -300,8 +306,9 @@ int FileStore::lfn_open(const coll_t& cid,
           << ") in index: " << cpp_strerror(-r) << dendl;
       goto fail;
     }
-    r = chain_fsetxattr(fd, XATTR_SPILL_OUT_NAME,
-                        XATTR_NO_SPILL_OUT, sizeof(XATTR_NO_SPILL_OUT), true);
+    r = chain_fsetxattr<true, true>(
+      fd, XATTR_SPILL_OUT_NAME,
+      XATTR_NO_SPILL_OUT, sizeof(XATTR_NO_SPILL_OUT));
     if (r < 0) {
       VOID_TEMP_FAILURE_RETRY(::close(fd));
       derr << "error setting spillout xattr for oid " << oid << " (" << (*path)->path()
@@ -559,7 +566,8 @@ FileStore::FileStore(const std::string &base, const std::string &jdev, osflagbit
   m_filestore_max_alloc_hint_size(g_conf->filestore_max_alloc_hint_size),
   m_fs_type(0),
   m_filestore_max_inline_xattr_size(0),
-  m_filestore_max_inline_xattrs(0)
+  m_filestore_max_inline_xattrs(0),
+  m_filestore_max_xattr_value_size(0)
 {
   m_filestore_kill_at.set(g_conf->filestore_kill_at);
   for (int i = 0; i < m_ondisk_finisher_num; ++i) {
@@ -2146,7 +2154,8 @@ void FileStore::_set_global_replay_guard(const coll_t& cid,
   // then record that we did it
   bufferlist v;
   ::encode(spos, v);
-  int r = chain_fsetxattr(fd, GLOBAL_REPLAY_GUARD_XATTR, v.c_str(), v.length(), true);
+  int r = chain_fsetxattr<true, true>(
+    fd, GLOBAL_REPLAY_GUARD_XATTR, v.c_str(), v.length());
   if (r < 0) {
     derr << __func__ << ": fsetxattr " << GLOBAL_REPLAY_GUARD_XATTR
 	 << " got " << cpp_strerror(r) << dendl;
@@ -2236,7 +2245,8 @@ void FileStore::_set_replay_guard(int fd,
   bufferlist v(40);
   ::encode(spos, v);
   ::encode(in_progress, v);
-  int r = chain_fsetxattr(fd, REPLAY_GUARD_XATTR, v.c_str(), v.length(), true);
+  int r = chain_fsetxattr<true, true>(
+    fd, REPLAY_GUARD_XATTR, v.c_str(), v.length());
   if (r < 0) {
     derr << "fsetxattr " << REPLAY_GUARD_XATTR << " got " << cpp_strerror(r) << dendl;
     assert(0 == "fsetxattr failed");
@@ -2279,7 +2289,8 @@ void FileStore::_close_replay_guard(int fd, const SequencerPosition& spos)
   ::encode(spos, v);
   bool in_progress = false;
   ::encode(in_progress, v);
-  int r = chain_fsetxattr(fd, REPLAY_GUARD_XATTR, v.c_str(), v.length(), true);
+  int r = chain_fsetxattr<true, true>(
+    fd, REPLAY_GUARD_XATTR, v.c_str(), v.length());
   if (r < 0) {
     derr << "fsetxattr " << REPLAY_GUARD_XATTR << " got " << cpp_strerror(r) << dendl;
     assert(0 == "fsetxattr failed");
@@ -3391,11 +3402,11 @@ int FileStore::_clone(const coll_t& cid, const ghobject_t& oldoid, const ghobjec
 
     r = chain_fgetxattr(**o, XATTR_SPILL_OUT_NAME, buf, sizeof(buf));
     if (r >= 0 && !strncmp(buf, XATTR_NO_SPILL_OUT, sizeof(XATTR_NO_SPILL_OUT))) {
-      r = chain_fsetxattr(**n, XATTR_SPILL_OUT_NAME, XATTR_NO_SPILL_OUT,
-                          sizeof(XATTR_NO_SPILL_OUT), true);
+      r = chain_fsetxattr<true, true>(**n, XATTR_SPILL_OUT_NAME, XATTR_NO_SPILL_OUT,
+                          sizeof(XATTR_NO_SPILL_OUT));
     } else {
-      r = chain_fsetxattr(**n, XATTR_SPILL_OUT_NAME, XATTR_SPILL_OUT,
-                          sizeof(XATTR_SPILL_OUT), true);
+      r = chain_fsetxattr<true, true>(**n, XATTR_SPILL_OUT_NAME, XATTR_SPILL_OUT,
+                          sizeof(XATTR_SPILL_OUT));
     }
     if (r < 0)
       goto out3;
@@ -5663,21 +5674,25 @@ void FileStore::set_xattr_limits_via_conf()
 {
   uint32_t fs_xattr_size;
   uint32_t fs_xattrs;
+  uint32_t fs_xattr_max_value_size;
 
   switch (m_fs_type) {
 #if defined(__linux__)
   case XFS_SUPER_MAGIC:
     fs_xattr_size = g_conf->filestore_max_inline_xattr_size_xfs;
     fs_xattrs = g_conf->filestore_max_inline_xattrs_xfs;
+    fs_xattr_max_value_size = g_conf->filestore_max_xattr_value_size_xfs;
     break;
   case BTRFS_SUPER_MAGIC:
     fs_xattr_size = g_conf->filestore_max_inline_xattr_size_btrfs;
     fs_xattrs = g_conf->filestore_max_inline_xattrs_btrfs;
+    fs_xattr_max_value_size = g_conf->filestore_max_xattr_value_size_btrfs;
     break;
 #endif
   default:
     fs_xattr_size = g_conf->filestore_max_inline_xattr_size_other;
     fs_xattrs = g_conf->filestore_max_inline_xattrs_other;
+    fs_xattr_max_value_size = g_conf->filestore_max_xattr_value_size_other;
     break;
   }
 
@@ -5692,6 +5707,24 @@ void FileStore::set_xattr_limits_via_conf()
     m_filestore_max_inline_xattrs = g_conf->filestore_max_inline_xattrs;
   else
     m_filestore_max_inline_xattrs = fs_xattrs;
+
+  // Use override value if set
+  if (g_conf->filestore_max_xattr_value_size)
+    m_filestore_max_xattr_value_size = g_conf->filestore_max_xattr_value_size;
+  else
+    m_filestore_max_xattr_value_size = fs_xattr_max_value_size;
+
+  if (m_filestore_max_xattr_value_size < g_conf->osd_max_object_name_len) {
+    derr << "WARNING: max attr value size ("
+	 << m_filestore_max_xattr_value_size
+	 << ") is smaller than osd_max_object_name_len ("
+	 << g_conf->osd_max_object_name_len
+	 << ").  Your backend filesystem appears to not support attrs large "
+	 << "enough to handle the configured max rados name size.  You may get "
+	 << "unexpected ENAMETOOLONG errors on rados operations or buggy "
+	 << "behavior"
+	 << dendl;
+  }
 }
 
 // -- FSSuperblock --
diff --git a/src/os/filestore/FileStore.h b/src/os/filestore/FileStore.h
index d81f8b0..a5cd75d 100644
--- a/src/os/filestore/FileStore.h
+++ b/src/os/filestore/FileStore.h
@@ -432,10 +432,9 @@ public:
   int write_op_seq(int, uint64_t seq);
   int mount();
   int umount();
-  unsigned get_max_object_name_length() {
-    // not safe for all file systems, btw!  use the tunable to limit this.
-    return 4096;
-  }
+
+  int validate_hobject_key(const hobject_t &obj) const override;
+
   unsigned get_max_attr_name_length() {
     // xattr limit is 128; leave room for our prefixes (user.ceph._),
     // some margin, and cap at 100
@@ -739,6 +738,7 @@ private:
   void set_xattr_limits_via_conf();
   uint32_t m_filestore_max_inline_xattr_size;
   uint32_t m_filestore_max_inline_xattrs;
+  uint32_t m_filestore_max_xattr_value_size;
 
   FSSuperblock superblock;
 
diff --git a/src/os/filestore/IndexManager.cc b/src/os/filestore/IndexManager.cc
index 3a3e5c9..078550d 100644
--- a/src/os/filestore/IndexManager.cc
+++ b/src/os/filestore/IndexManager.cc
@@ -36,8 +36,9 @@
 static int set_version(const char *path, uint32_t version) {
   bufferlist bl;
   ::encode(version, bl);
-  return chain_setxattr(path, "user.cephos.collection_version", bl.c_str(),
-		     bl.length(), true);
+  return chain_setxattr<true, true>(
+    path, "user.cephos.collection_version", bl.c_str(),
+    bl.length());
 }
 
 static int get_version(const char *path, uint32_t *version) {
diff --git a/src/os/filestore/LFNIndex.cc b/src/os/filestore/LFNIndex.cc
index 47436ea..1994d5a 100644
--- a/src/os/filestore/LFNIndex.cc
+++ b/src/os/filestore/LFNIndex.cc
@@ -74,6 +74,14 @@ struct FDCloser {
 
 /* Public methods */
 
+uint64_t LFNIndex::get_max_escaped_name_len(const hobject_t &obj)
+{
+  ghobject_t ghobj(obj);
+  ghobj.shard_id = shard_id_t(0);
+  ghobj.generation = 0;
+  ghobj.hobj.snap = 0;
+  return lfn_generate_object_name_current(ghobj).size();
+}
 
 int LFNIndex::init()
 {
@@ -375,14 +383,18 @@ static int get_hobject_from_oinfo(const char *dir, const char *file,
 				  ghobject_t *o)
 {
   char path[PATH_MAX];
-  bufferptr bp(PATH_MAX);
   snprintf(path, sizeof(path), "%s/%s", dir, file);
   // Hack, user.ceph._ is the attribute used to store the object info
-  int r = chain_getxattr(path, "user.ceph._", bp.c_str(), bp.length());
+  bufferptr bp;
+  int r = chain_getxattr_buf(
+    path,
+    "user.ceph._",
+    &bp);
   if (r < 0)
     return r;
   bufferlist bl;
-  bl.push_back(bp);
+  if (r > 0)
+    bl.push_back(bp);
   object_info_t oi(bl);
   *o = ghobject_t(oi.soid);
   return 0;
@@ -421,10 +433,11 @@ int LFNIndex::list_objects(const vector<string> &to_list, int max_objs,
     ghobject_t obj;
     if (lfn_is_object(short_name)) {
       r = lfn_translate(to_list, short_name, &obj);
-      if (r < 0) {
-	r = -errno;
+      if (r == -EINVAL) {
+	continue;
+      } else if (r < 0) {
 	goto cleanup;
-      } else if (r > 0) {
+      } else {
 	string long_name = lfn_generate_object_name(obj);
 	if (!lfn_must_hash(long_name)) {
 	  assert(long_name == short_name);
@@ -434,8 +447,6 @@ int LFNIndex::list_objects(const vector<string> &to_list, int max_objs,
 
 	out->insert(pair<string, ghobject_t>(short_name, obj));
 	++listed;
-      } else {
-	continue;
       }
     }
   }
@@ -521,9 +532,10 @@ int LFNIndex::add_attr_path(const vector<string> &path,
 {
   string full_path = get_full_path_subdir(path);
   maybe_inject_failure();
-  return chain_setxattr(full_path.c_str(), mangle_attr_name(attr_name).c_str(),
-		     reinterpret_cast<void *>(attr_value.c_str()),
-		     attr_value.length());
+  return chain_setxattr<false, true>(
+    full_path.c_str(), mangle_attr_name(attr_name).c_str(),
+    reinterpret_cast<void *>(attr_value.c_str()),
+    attr_value.length());
 }
 
 int LFNIndex::get_attr_path(const vector<string> &path,
@@ -531,26 +543,14 @@ int LFNIndex::get_attr_path(const vector<string> &path,
 			    bufferlist &attr_value)
 {
   string full_path = get_full_path_subdir(path);
-  size_t size = 1024; // Initial
-  while (1) {
-    bufferptr buf(size);
-    int r = chain_getxattr(full_path.c_str(), mangle_attr_name(attr_name).c_str(),
-			 reinterpret_cast<void *>(buf.c_str()),
-			 size);
-    if (r > 0) {
-      buf.set_length(r);
-      attr_value.push_back(buf);
-      break;
-    } else {
-      r = -errno;
-      if (r == -ERANGE) {
-	size *= 2;
-      } else {
-	return r;
-      }
-    }
-  }
-  return 0;
+  bufferptr bp;
+  int r = chain_getxattr_buf(
+    full_path.c_str(),
+    mangle_attr_name(attr_name).c_str(),
+    &bp);
+  if (r > 0)
+    attr_value.push_back(bp);
+  return r;
 }
 
 int LFNIndex::remove_attr_path(const vector<string> &path,
@@ -621,13 +621,8 @@ static void append_escaped(string::const_iterator begin,
   }
 }
 
-string LFNIndex::lfn_generate_object_name(const ghobject_t &oid)
+string LFNIndex::lfn_generate_object_name_current(const ghobject_t &oid)
 {
-  if (index_version == HASH_INDEX_TAG)
-    return lfn_generate_object_name_keyless(oid);
-  if (index_version == HASH_INDEX_TAG_2)
-    return lfn_generate_object_name_poolless(oid);
-
   string full_name;
   string::const_iterator i = oid.hobj.oid.name.begin();
   if (oid.hobj.oid.name.substr(0, 4) == "DIR_") {
@@ -754,12 +749,14 @@ int LFNIndex::lfn_get_name(const vector<string> &path,
   int i = 0;
   string candidate;
   string candidate_path;
-  char buf[FILENAME_MAX_LEN + 1];
   for ( ; ; ++i) {
     candidate = lfn_get_short_name(oid, i);
     candidate_path = get_full_path(path, candidate);
-    r = chain_getxattr(candidate_path.c_str(), get_lfn_attr().c_str(),
-		       buf, sizeof(buf));
+    bufferptr bp;
+    r = chain_getxattr_buf(
+      candidate_path.c_str(),
+      get_lfn_attr().c_str(),
+      &bp);
     if (r < 0) {
       if (errno != ENODATA && errno != ENOENT)
 	return -errno;
@@ -780,8 +777,8 @@ int LFNIndex::lfn_get_name(const vector<string> &path,
       return 0;
     }
     assert(r > 0);
-    buf[MIN((int)sizeof(buf) - 1, r)] = '\0';
-    if (!strcmp(buf, full_name.c_str())) {
+    string lfn(bp.c_str(), bp.length());
+    if (lfn == full_name) {
       if (mangled_name)
 	*mangled_name = candidate;
       if (out_path)
@@ -793,8 +790,11 @@ int LFNIndex::lfn_get_name(const vector<string> &path,
       }
       return 0;
     }
-    r = chain_getxattr(candidate_path.c_str(), get_alt_lfn_attr().c_str(),
-		       buf, sizeof(buf));
+    bp = bufferptr();
+    r = chain_getxattr_buf(
+      candidate_path.c_str(),
+      get_alt_lfn_attr().c_str(),
+      &bp);
     if (r > 0) {
       // only consider alt name if nlink > 1
       struct stat st;
@@ -805,7 +805,7 @@ int LFNIndex::lfn_get_name(const vector<string> &path,
 	// left over from incomplete unlink, remove
 	maybe_inject_failure();
 	dout(20) << __func__ << " found extra alt attr for " << candidate_path
-		 << ", long name " << string(buf, r) << dendl;
+		 << ", long name " << string(bp.c_str(), bp.length()) << dendl;
 	rc = chain_removexattr(candidate_path.c_str(),
 			       get_alt_lfn_attr().c_str());
 	maybe_inject_failure();
@@ -813,8 +813,8 @@ int LFNIndex::lfn_get_name(const vector<string> &path,
 	  return rc;
 	continue;
       }
-      buf[MIN((int)sizeof(buf) - 1, r)] = '\0';
-      if (!strcmp(buf, full_name.c_str())) {
+      string lfn(bp.c_str(), bp.length());
+      if (lfn == full_name) {
 	dout(20) << __func__ << " used alt attr for " << full_name << dendl;
 	if (mangled_name)
 	  *mangled_name = candidate;
@@ -841,23 +841,29 @@ int LFNIndex::lfn_created(const vector<string> &path,
   maybe_inject_failure();
 
   // if the main attr exists and is different, move it to the alt attr.
-  char buf[FILENAME_MAX_LEN + 1];
-  int r = chain_getxattr(full_path.c_str(), get_lfn_attr().c_str(),
-			 buf, sizeof(buf));
-  if (r >= 0 && (r != (int)full_name.length() ||
-		 memcmp(buf, full_name.c_str(), full_name.length()))) {
-    dout(20) << __func__ << " " << mangled_name
-	     << " moving old name to alt attr "
-	     << string(buf, r)
-	     << ", new name is " << full_name << dendl;
-    r = chain_setxattr(full_path.c_str(), get_alt_lfn_attr().c_str(),
-		       buf, r);
-    if (r < 0)
-      return r;
+  bufferptr bp;
+  int r = chain_getxattr_buf(
+    full_path.c_str(),
+    get_lfn_attr().c_str(),
+    &bp);
+  if (r > 0) {
+    string lfn(bp.c_str(), bp.length());
+    if (lfn != full_name) {
+      dout(20) << __func__ << " " << mangled_name
+	       << " moving old name to alt attr "
+	       << lfn
+	       << ", new name is " << full_name << dendl;
+      r = chain_setxattr<false, true>(
+	full_path.c_str(), get_alt_lfn_attr().c_str(),
+	bp.c_str(), bp.length());
+      if (r < 0)
+	return r;
+    }
   }
 
-  return chain_setxattr(full_path.c_str(), get_lfn_attr().c_str(),
-		     full_name.c_str(), full_name.size());
+  return chain_setxattr<false, true>(
+    full_path.c_str(), get_lfn_attr().c_str(),
+    full_name.c_str(), full_name.size());
 }
 
 int LFNIndex::lfn_unlink(const vector<string> &path,
@@ -936,31 +942,32 @@ int LFNIndex::lfn_translate(const vector<string> &path,
     return lfn_parse_object_name(short_name, out);
   }
   string full_path = get_full_path(path, short_name);
-  char attr[PATH_MAX];
   // First, check alt attr
-  int r = chain_getxattr(
+  bufferptr bp;
+  int r = chain_getxattr_buf(
     full_path.c_str(),
     get_alt_lfn_attr().c_str(),
-    attr,
-    sizeof(attr) - 1);
-  if (r >= 0) {
+    &bp);
+  if (r > 0) {
     // There is an alt attr, does it match?
-    if (r < (int)sizeof(attr))
-      attr[r] = '\0';
-    if (short_name_matches(short_name.c_str(), attr)) {
-      string long_name(attr);
-      return lfn_parse_object_name(long_name, out);
+    string lfn(bp.c_str(), bp.length());
+    if (short_name_matches(short_name.c_str(), lfn.c_str())) {
+      return lfn_parse_object_name(lfn, out);
     }
   }
 
   // Get lfn_attr
-  r = chain_getxattr(full_path.c_str(), get_lfn_attr().c_str(), attr, sizeof(attr) - 1);
+  bp = bufferptr();
+  r = chain_getxattr_buf(
+    full_path.c_str(),
+    get_lfn_attr().c_str(),
+    &bp);
   if (r < 0)
-    return -errno;
-  if (r < (int)sizeof(attr))
-    attr[r] = '\0';
+    return r;
+  if (r == 0)
+    return -EINVAL;
 
-  string long_name(attr);
+  string long_name(bp.c_str(), bp.length());
   return lfn_parse_object_name(long_name, out);
 }
 
@@ -1032,7 +1039,7 @@ static int parse_object(const char *s, ghobject_t& o)
   return 0;
 }
 
-bool LFNIndex::lfn_parse_object_name_keyless(const string &long_name, ghobject_t *out)
+int LFNIndex::lfn_parse_object_name_keyless(const string &long_name, ghobject_t *out)
 {
   bool r = parse_object(long_name.c_str(), *out);
   int64_t pool = -1;
@@ -1042,7 +1049,7 @@ bool LFNIndex::lfn_parse_object_name_keyless(const string &long_name, ghobject_t
   out->hobj.pool = pool;
   if (!r) return r;
   string temp = lfn_generate_object_name(*out);
-  return r;
+  return r ? 0 : -EINVAL;
 }
 
 static bool append_unescaped(string::const_iterator begin,
@@ -1069,8 +1076,8 @@ static bool append_unescaped(string::const_iterator begin,
   return true;
 }
 
-bool LFNIndex::lfn_parse_object_name_poolless(const string &long_name,
-					      ghobject_t *out)
+int LFNIndex::lfn_parse_object_name_poolless(const string &long_name,
+					     ghobject_t *out)
 {
   string name;
   string key;
@@ -1081,7 +1088,7 @@ bool LFNIndex::lfn_parse_object_name_poolless(const string &long_name,
   if (*current == '\\') {
     ++current;
     if (current == long_name.end()) {
-      return false;
+      return -EINVAL;
     } else if (*current == 'd') {
       name.append("DIR_");
       ++current;
@@ -1096,27 +1103,27 @@ bool LFNIndex::lfn_parse_object_name_poolless(const string &long_name,
   string::const_iterator end = current;
   for ( ; end != long_name.end() && *end != '_'; ++end) ;
   if (end == long_name.end())
-    return false;
+    return -EINVAL;
   if (!append_unescaped(current, end, &name))
-    return false;
+    return -EINVAL;
 
   current = ++end;
   for ( ; end != long_name.end() && *end != '_'; ++end) ;
   if (end == long_name.end())
-    return false;
+    return -EINVAL;
   if (!append_unescaped(current, end, &key))
-    return false;
+    return -EINVAL;
 
   current = ++end;
   for ( ; end != long_name.end() && *end != '_'; ++end) ;
   if (end == long_name.end())
-    return false;
+    return -EINVAL;
   string snap_str(current, end);
 
   current = ++end;
   for ( ; end != long_name.end() && *end != '_'; ++end) ;
   if (end != long_name.end())
-    return false;
+    return -EINVAL;
   string hash_str(current, end);
 
   if (snap_str == "head")
@@ -1133,11 +1140,11 @@ bool LFNIndex::lfn_parse_object_name_poolless(const string &long_name,
   if (coll().is_pg_prefix(&pg))
     pool = (int64_t)pg.pgid.pool();
   (*out) = ghobject_t(hobject_t(name, key, snap, hash, pool, ""));
-  return true;
+  return 0;
 }
 
 
-bool LFNIndex::lfn_parse_object_name(const string &long_name, ghobject_t *out)
+int LFNIndex::lfn_parse_object_name(const string &long_name, ghobject_t *out)
 {
   string name;
   string key;
@@ -1157,7 +1164,7 @@ bool LFNIndex::lfn_parse_object_name(const string &long_name, ghobject_t *out)
   if (*current == '\\') {
     ++current;
     if (current == long_name.end()) {
-      return false;
+      return -EINVAL;
     } else if (*current == 'd') {
       name.append("DIR_");
       ++current;
@@ -1172,35 +1179,35 @@ bool LFNIndex::lfn_parse_object_name(const string &long_name, ghobject_t *out)
   string::const_iterator end = current;
   for ( ; end != long_name.end() && *end != '_'; ++end) ;
   if (end == long_name.end())
-    return false;
+    return -EINVAL;
   if (!append_unescaped(current, end, &name))
-    return false;
+    return -EINVAL;
 
   current = ++end;
   for ( ; end != long_name.end() && *end != '_'; ++end) ;
   if (end == long_name.end())
-    return false;
+    return -EINVAL;
   if (!append_unescaped(current, end, &key))
-    return false;
+    return -EINVAL;
 
   current = ++end;
   for ( ; end != long_name.end() && *end != '_'; ++end) ;
   if (end == long_name.end())
-    return false;
+    return -EINVAL;
   string snap_str(current, end);
 
   current = ++end;
   for ( ; end != long_name.end() && *end != '_'; ++end) ;
   if (end == long_name.end())
-    return false;
+    return -EINVAL;
   string hash_str(current, end);
 
   current = ++end;
   for ( ; end != long_name.end() && *end != '_'; ++end) ;
   if (end == long_name.end())
-    return false;
+    return -EINVAL;
   if (!append_unescaped(current, end, &ns))
-    return false;
+    return -EINVAL;
 
   current = ++end;
   for ( ; end != long_name.end() && *end != '_'; ++end) ;
@@ -1212,7 +1219,7 @@ bool LFNIndex::lfn_parse_object_name(const string &long_name, ghobject_t *out)
     current = ++end;
     for ( ; end != long_name.end() && *end != '_'; ++end) ;
     if (end == long_name.end())
-      return false;
+      return -EINVAL;
     genstring = string(current, end);
 
     generation = (gen_t)strtoull(genstring.c_str(), NULL, 16);
@@ -1220,7 +1227,7 @@ bool LFNIndex::lfn_parse_object_name(const string &long_name, ghobject_t *out)
     current = ++end;
     for ( ; end != long_name.end() && *end != '_'; ++end) ;
     if (end != long_name.end())
-      return false;
+      return -EINVAL;
     shardstring = string(current, end);
 
     shard_id = (shard_id_t)strtoul(shardstring.c_str(), NULL, 16);
@@ -1240,7 +1247,7 @@ bool LFNIndex::lfn_parse_object_name(const string &long_name, ghobject_t *out)
     pool = strtoull(pstring.c_str(), NULL, 16);
 
   (*out) = ghobject_t(hobject_t(name, key, snap, hash, (int64_t)pool, ns), generation, shard_id);
-  return true;
+  return 0;
 }
 
 bool LFNIndex::lfn_is_hashed_filename(const string &name)
diff --git a/src/os/filestore/LFNIndex.h b/src/os/filestore/LFNIndex.h
index 1cf4f0b..4efc313 100644
--- a/src/os/filestore/LFNIndex.h
+++ b/src/os/filestore/LFNIndex.h
@@ -212,6 +212,11 @@ public:
       );
   }
 
+  /**
+   * Returns the length of the longest escaped name which could result
+   * from any clone, shard, or rollback object of this object
+   */
+  static uint64_t get_max_escaped_name_len(const hobject_t &obj);
 
 protected:
   virtual int _init() = 0;
@@ -480,24 +485,36 @@ private:
     ); ///< @return Generated object name.
 
   /// Generate object name
-  string lfn_generate_object_name(
+  static string lfn_generate_object_name_current(
     const ghobject_t &oid ///< [in] Object for which to generate.
     ); ///< @return Generated object name.
 
+  /// Generate object name
+  string lfn_generate_object_name(
+    const ghobject_t &oid ///< [in] Object for which to generate.
+    ) {
+    if (index_version == HASH_INDEX_TAG)
+      return lfn_generate_object_name_keyless(oid);
+    if (index_version == HASH_INDEX_TAG_2)
+      return lfn_generate_object_name_poolless(oid);
+    else
+      return lfn_generate_object_name_current(oid);
+  } ///< @return Generated object name.
+
   /// Parse object name
-  bool lfn_parse_object_name_keyless(
+  int lfn_parse_object_name_keyless(
     const string &long_name, ///< [in] Name to parse
     ghobject_t *out	     ///< [out] Resulting Object
     ); ///< @return True if successfull, False otherwise.
 
   /// Parse object name
-  bool lfn_parse_object_name_poolless(
+  int lfn_parse_object_name_poolless(
     const string &long_name, ///< [in] Name to parse
     ghobject_t *out	     ///< [out] Resulting Object
     ); ///< @return True if successfull, False otherwise.
 
   /// Parse object name
-  bool lfn_parse_object_name(
+  int lfn_parse_object_name(
     const string &long_name, ///< [in] Name to parse
     ghobject_t *out	     ///< [out] Resulting Object
     ); ///< @return True if successfull, False otherwise.
diff --git a/src/os/filestore/chain_xattr.cc b/src/os/filestore/chain_xattr.cc
index 28bb87b..0461c19 100644
--- a/src/os/filestore/chain_xattr.cc
+++ b/src/os/filestore/chain_xattr.cc
@@ -37,7 +37,7 @@
  * where <id> marks the num of xattr in the chain.
  */
 
-static void get_raw_xattr_name(const char *name, int i, char *raw_name, int raw_len)
+void get_raw_xattr_name(const char *name, int i, char *raw_name, int raw_len)
 {
   int pos = 0;
 
@@ -135,7 +135,7 @@ int chain_getxattr(const char *fn, const char *name, void *val, size_t size)
     return getxattr_len(fn, name);
 
   do {
-    chunk_size = (size < CHAIN_XATTR_MAX_BLOCK_LEN ? size : CHAIN_XATTR_MAX_BLOCK_LEN);
+    chunk_size = size;
     get_raw_xattr_name(name, i, raw_name, sizeof(raw_name));
 
     r = sys_getxattr(fn, raw_name, (char *)val + pos, chunk_size);
@@ -173,6 +173,35 @@ int chain_getxattr(const char *fn, const char *name, void *val, size_t size)
   return ret;
 }
 
+int chain_getxattr_buf(const char *fn, const char *name, bufferptr *bp)
+{
+  size_t size = 1024; // Initial
+  while (1) {
+    bufferptr buf(size);
+    int r = chain_getxattr(
+      fn,
+      name,
+      buf.c_str(),
+      size);
+    if (r > 0) {
+      buf.set_length(r);
+      if (bp)
+	bp->swap(buf);
+      return r;
+    } else if (r == 0) {
+      return 0;
+    } else {
+      if (r == -ERANGE) {
+	size *= 2;
+      } else {
+	return r;
+      }
+    }
+  }
+  assert(0 == "unreachable");
+  return 0;
+}
+
 static int chain_fgetxattr_len(int fd, const char *name)
 {
   int i = 0, total = 0;
@@ -206,7 +235,7 @@ int chain_fgetxattr(int fd, const char *name, void *val, size_t size)
     return chain_fgetxattr_len(fd, name);
 
   do {
-    chunk_size = (size < CHAIN_XATTR_MAX_BLOCK_LEN ? size : CHAIN_XATTR_MAX_BLOCK_LEN);
+    chunk_size = size;
     get_raw_xattr_name(name, i, raw_name, sizeof(raw_name));
 
     r = sys_fgetxattr(fd, raw_name, (char *)val + pos, chunk_size);
@@ -247,7 +276,7 @@ int chain_fgetxattr(int fd, const char *name, void *val, size_t size)
 
 // setxattr
 
-static int get_xattr_block_size(size_t size)
+int get_xattr_block_size(size_t size)
 {
   if (size <= CHAIN_XATTR_SHORT_LEN_THRESHOLD)
     // this may fit in the inode; stripe over short attrs so that XFS
@@ -256,79 +285,6 @@ static int get_xattr_block_size(size_t size)
   return CHAIN_XATTR_MAX_BLOCK_LEN;
 }
 
-int chain_setxattr(const char *fn, const char *name, const void *val, size_t size, bool onechunk)
-{
-  int i = 0, pos = 0;
-  char raw_name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16];
-  int ret = 0;
-  size_t max_chunk_size = get_xattr_block_size(size);
-
-  do {
-    size_t chunk_size = (size < max_chunk_size ? size : max_chunk_size);
-    get_raw_xattr_name(name, i, raw_name, sizeof(raw_name));
-    size -= chunk_size;
-
-    int r = sys_setxattr(fn, raw_name, (char *)val + pos, chunk_size);
-    if (r < 0) {
-      ret = r;
-      break;
-    }
-    pos  += chunk_size;
-    ret = pos;
-    i++;
-  } while (size);
-
-  if (ret >= 0 && !onechunk) {
-    int r;
-    do {
-      get_raw_xattr_name(name, i, raw_name, sizeof(raw_name));
-      r = sys_removexattr(fn, raw_name);
-      if (r < 0 && r != -ENODATA)
-	ret = r;
-      i++;
-    } while (r != -ENODATA);
-  }
-
-  return ret;
-}
-
-int chain_fsetxattr(int fd, const char *name, const void *val, size_t size, bool onechunk)
-{
-  int i = 0, pos = 0;
-  char raw_name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16];
-  int ret = 0;
-  size_t max_chunk_size = get_xattr_block_size(size);
-
-  do {
-    size_t chunk_size = (size < max_chunk_size ? size : max_chunk_size);
-    get_raw_xattr_name(name, i, raw_name, sizeof(raw_name));
-    size -= chunk_size;
-
-    int r = sys_fsetxattr(fd, raw_name, (char *)val + pos, chunk_size);
-    if (r < 0) {
-      ret = r;
-      break;
-    }
-    pos  += chunk_size;
-    ret = pos;
-    i++;
-  } while (size);
-
-  if (ret >= 0 && !onechunk) {
-    int r;
-    do {
-      get_raw_xattr_name(name, i, raw_name, sizeof(raw_name));
-      r = sys_fremovexattr(fd, raw_name);
-      if (r < 0 && r != -ENODATA)
-	ret = r;
-      i++;
-    } while (r != -ENODATA);
-  }
-
-  return ret;
-}
-
-
 // removexattr
 
 int chain_removexattr(const char *fn, const char *name)
diff --git a/src/os/filestore/chain_xattr.h b/src/os/filestore/chain_xattr.h
index 6ee8050..54a8568 100644
--- a/src/os/filestore/chain_xattr.h
+++ b/src/os/filestore/chain_xattr.h
@@ -5,6 +5,10 @@
 #define __CEPH_OSD_CHAIN_XATTR_H
 
 #include "common/xattr.h"
+#include "include/assert.h"
+#include "include/buffer.h"
+#include <string.h>
+#include <stdio.h>
 
 #include <errno.h>
 
@@ -77,9 +81,100 @@ static inline int sys_fremovexattr(int fd, const char *name)
 // wrappers to chain large values across multiple xattrs
 
 int chain_getxattr(const char *fn, const char *name, void *val, size_t size);
+int chain_getxattr_buf(const char *fn, const char *name, bufferptr *bp);
 int chain_fgetxattr(int fd, const char *name, void *val, size_t size);
-int chain_setxattr(const char *fn, const char *name, const void *val, size_t size, bool onechunk=false);
-int chain_fsetxattr(int fd, const char *name, const void *val, size_t size, bool onechunk=false);
+
+int get_xattr_block_size(size_t size);
+void get_raw_xattr_name(const char *name, int i, char *raw_name, int raw_len);
+
+template <bool skip_chain_cleanup=false, bool ensure_single_attr=false>
+int chain_setxattr(
+  const char *fn, const char *name, const void *val, size_t size)
+{
+  int i = 0, pos = 0;
+  char raw_name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16];
+  int ret = 0;
+  size_t max_chunk_size =
+    ensure_single_attr ? size : get_xattr_block_size(size);
+
+  static_assert(
+    !skip_chain_cleanup || ensure_single_attr,
+    "skip_chain_cleanup must imply ensure_single_attr");
+
+  do {
+    size_t chunk_size = (size < max_chunk_size ? size : max_chunk_size);
+    get_raw_xattr_name(name, i, raw_name, sizeof(raw_name));
+    size -= chunk_size;
+
+    int r = sys_setxattr(fn, raw_name, (char *)val + pos, chunk_size);
+    if (r < 0) {
+      ret = r;
+      break;
+    }
+    pos  += chunk_size;
+    ret = pos;
+    i++;
+    assert(size == 0 || !ensure_single_attr);
+  } while (size);
+
+  if (ret >= 0 && !skip_chain_cleanup) {
+    int r;
+    do {
+      get_raw_xattr_name(name, i, raw_name, sizeof(raw_name));
+      r = sys_removexattr(fn, raw_name);
+      if (r < 0 && r != -ENODATA)
+	ret = r;
+      i++;
+    } while (r != -ENODATA);
+  }
+
+  return ret;
+}
+
+template <bool skip_chain_cleanup=false, bool ensure_single_attr=false>
+int chain_fsetxattr(
+  int fd, const char *name, const void *val, size_t size)
+{
+  int i = 0, pos = 0;
+  char raw_name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16];
+  int ret = 0;
+  size_t max_chunk_size =
+    ensure_single_attr ? size : get_xattr_block_size(size);
+
+  static_assert(
+    !skip_chain_cleanup || ensure_single_attr,
+    "skip_chain_cleanup must imply ensure_single_attr");
+
+  do {
+    size_t chunk_size = (size < max_chunk_size ? size : max_chunk_size);
+    get_raw_xattr_name(name, i, raw_name, sizeof(raw_name));
+    size -= chunk_size;
+
+    int r = sys_fsetxattr(fd, raw_name, (char *)val + pos, chunk_size);
+    if (r < 0) {
+      ret = r;
+      break;
+    }
+    pos  += chunk_size;
+    ret = pos;
+    i++;
+    assert(size == 0 || !ensure_single_attr);
+  } while (size);
+
+  if (ret >= 0 && !skip_chain_cleanup) {
+    int r;
+    do {
+      get_raw_xattr_name(name, i, raw_name, sizeof(raw_name));
+      r = sys_fremovexattr(fd, raw_name);
+      if (r < 0 && r != -ENODATA)
+	ret = r;
+      i++;
+    } while (r != -ENODATA);
+  }
+
+  return ret;
+}
+
 int chain_listxattr(const char *fn, char *names, size_t len);
 int chain_flistxattr(int fd, char *names, size_t len);
 int chain_removexattr(const char *fn, const char *name);
diff --git a/src/os/kstore/KStore.h b/src/os/kstore/KStore.h
index 4b00663..09483de 100644
--- a/src/os/kstore/KStore.h
+++ b/src/os/kstore/KStore.h
@@ -411,8 +411,9 @@ public:
 
   int fsck();
 
-  unsigned get_max_object_name_length() {
-    return 4096;
+
+  int validate_hobject_key(const hobject_t &obj) const override {
+    return 0;
   }
   unsigned get_max_attr_name_length() {
     return 256;  // arbitrary; there is no real limit internally
diff --git a/src/os/memstore/MemStore.h b/src/os/memstore/MemStore.h
index 2d809f3..64f9afc 100644
--- a/src/os/memstore/MemStore.h
+++ b/src/os/memstore/MemStore.h
@@ -365,8 +365,8 @@ public:
   int mount();
   int umount();
 
-  unsigned get_max_object_name_length() {
-    return 4096;
+  int validate_hobject_key(const hobject_t &obj) const override {
+    return 0;
   }
   unsigned get_max_attr_name_length() {
     return 256;  // arbitrary; there is no real limit internally
diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc
index 1ed664f..7c260ff 100644
--- a/src/osd/OSD.cc
+++ b/src/osd/OSD.cc
@@ -4615,11 +4615,7 @@ void OSD::ms_handle_connect(Connection *con)
 
     // full map requests may happen while active or pre-boot
     if (requested_full_first) {
-      epoch_t first = requested_full_first;
-      epoch_t last = requested_full_last;
-      requested_full_first = 0;
-      requested_full_last = 0;
-      request_full_map(first, last);
+      rerequest_full_maps();
     }
   }
 }
@@ -4918,17 +4914,6 @@ void OSD::request_full_map(epoch_t first, epoch_t last)
   monc->send_mon_message(req);
 }
 
-void OSD::finish_full_map_request()
-{
-  if (requested_full_first == 0 && requested_full_last == 0)
-    return;
-  //Had requested some map but didn't receive in this message,
-  //This might because monitor capping the message to osd_map_message_max
-  dout(10) << __func__ << "still missing " << requested_full_first
-	   << ".." << requested_full_last << ", but now give up." << dendl;
-  requested_full_first = requested_full_last = 0;
-}
-
 void OSD::got_full_map(epoch_t e)
 {
   assert(requested_full_first <= requested_full_last);
@@ -6693,8 +6678,11 @@ void OSD::handle_osd_map(MOSDMap *m)
   // even if this map isn't from a mon, we may have satisfied our subscription
   monc->sub_got("osdmap", last);
 
-  if (!m->maps.empty())
-    finish_full_map_request();
+  if (!m->maps.empty() && requested_full_first) {
+    dout(10) << __func__ << " still missing full maps " << requested_full_first
+	     << ".." << requested_full_last << dendl;
+    rerequest_full_maps();
+  }
 
   if (last <= superblock.newest_map) {
     dout(10) << " no new maps here, dropping" << dendl;
diff --git a/src/osd/OSD.h b/src/osd/OSD.h
index 9c4d68c..8a62ebb 100644
--- a/src/osd/OSD.h
+++ b/src/osd/OSD.h
@@ -2050,7 +2050,13 @@ protected:
   epoch_t requested_full_first, requested_full_last;
 
   void request_full_map(epoch_t first, epoch_t last);
-  void finish_full_map_request();
+  void rerequest_full_maps() {
+    epoch_t first = requested_full_first;
+    epoch_t last = requested_full_last;
+    requested_full_first = 0;
+    requested_full_last = 0;
+    request_full_map(first, last);
+  }
   void got_full_map(epoch_t e);
 
   // -- failures --
diff --git a/src/osd/PG.cc b/src/osd/PG.cc
index 2804c73..58d2cc5 100644
--- a/src/osd/PG.cc
+++ b/src/osd/PG.cc
@@ -2298,6 +2298,7 @@ void PG::split_into(pg_t child_pgid, PG *child, unsigned split_bits)
 
   // Info
   child->info.history = info.history;
+  child->info.history.epoch_created = get_osdmap()->get_epoch();
   child->info.purged_snaps = info.purged_snaps;
 
   if (info.last_backfill.is_max()) {
@@ -2313,6 +2314,7 @@ void PG::split_into(pg_t child_pgid, PG *child, unsigned split_bits)
   }
 
   child->info.stats = info.stats;
+  child->info.stats.parent_split_bits = split_bits;
   info.stats.stats_invalid = true;
   child->info.stats.stats_invalid = true;
   child->info.last_epoch_started = info.last_epoch_started;
@@ -3357,8 +3359,16 @@ bool PG::sched_scrub()
 
     //NOSCRUB so skip regular scrubs
     if ((osd->osd->get_osdmap()->test_flag(CEPH_OSDMAP_NOSCRUB) ||
-	 pool.info.has_flag(pg_pool_t::FLAG_NOSCRUB)) && !time_for_deep)
+	 pool.info.has_flag(pg_pool_t::FLAG_NOSCRUB)) && !time_for_deep) {
+      if (scrubber.reserved) {
+        // cancel scrub if it is still in scheduling,
+        // so pgs from other pools where scrub are still legal
+        // have a chance to go ahead with scrubbing.
+        clear_scrub_reserved();
+        scrub_unreserve_replicas();
+      }
       return false;
+    }
   }
 
   if (cct->_conf->osd_scrub_auto_repair
@@ -4108,6 +4118,7 @@ void PG::chunky_scrub(ThreadPool::TPHandle &handle)
 
           bool boundary_found = false;
           hobject_t start = scrubber.start;
+          unsigned loop = 0;
           while (!boundary_found) {
             vector<hobject_t> objects;
             ret = get_pgbackend()->objects_list_partial(
@@ -4137,6 +4148,12 @@ void PG::chunky_scrub(ThreadPool::TPHandle &handle)
                 boundary_found = true;
               }
             }
+
+            // reset handle once in a while, the search maybe takes long.
+            if (++loop >= g_conf->osd_loop_before_reset_tphandle) {
+              handle.reset_tp_timeout();
+              loop = 0;
+            }
           }
 
 	  if (!_range_available_for_scrub(scrubber.start, candidate_end)) {
diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc
index 3958f89..052d6c7 100644
--- a/src/osd/ReplicatedPG.cc
+++ b/src/osd/ReplicatedPG.cc
@@ -1627,16 +1627,41 @@ void ReplicatedPG::do_op(OpRequestRef& op)
     return;
   }
 
+  hobject_t head(m->get_oid(), m->get_object_locator().key,
+		 CEPH_NOSNAP, m->get_pg().ps(),
+		 info.pgid.pool(), m->get_object_locator().nspace);
+
   // object name too long?
-  unsigned max_name_len = MIN(g_conf->osd_max_object_name_len,
-                              osd->osd->store->get_max_object_name_length());
-  if (m->get_oid().name.size() > max_name_len) {
-    dout(4) << "do_op '" << m->get_oid().name << "' is longer than "
-            << max_name_len << " bytes" << dendl;
+  if (m->get_oid().name.size() > g_conf->osd_max_object_name_len) {
+    dout(4) << "do_op name is longer than "
+            << g_conf->osd_max_object_name_len
+	    << " bytes" << dendl;
+    osd->reply_op_error(op, -ENAMETOOLONG);
+    return;
+  }
+  if (m->get_object_locator().key.size() > g_conf->osd_max_object_name_len) {
+    dout(4) << "do_op locator is longer than "
+            << g_conf->osd_max_object_name_len
+	    << " bytes" << dendl;
+    osd->reply_op_error(op, -ENAMETOOLONG);
+    return;
+  }
+  if (m->get_object_locator().nspace.size() >
+      g_conf->osd_max_object_namespace_len) {
+    dout(4) << "do_op namespace is longer than "
+            << g_conf->osd_max_object_namespace_len
+	    << " bytes" << dendl;
     osd->reply_op_error(op, -ENAMETOOLONG);
     return;
   }
 
+  if (int r = osd->store->validate_hobject_key(head)) {
+    dout(4) << "do_op object " << head << " invalid for backing store: "
+	    << r << dendl;
+    osd->reply_op_error(op, r);
+    return;
+  }
+
   // blacklisted?
   if (get_osdmap()->is_blacklisted(m->get_source_addr())) {
     dout(10) << "do_op " << m->get_source_addr() << " is blacklisted" << dendl;
@@ -1702,11 +1727,6 @@ void ReplicatedPG::do_op(OpRequestRef& op)
 	   << " flags " << ceph_osd_flag_string(m->get_flags())
 	   << dendl;
 
-  hobject_t head(m->get_oid(), m->get_object_locator().key,
-		 CEPH_NOSNAP, m->get_pg().ps(),
-		 info.pgid.pool(), m->get_object_locator().nspace);
-
-
   if (write_ordered &&
       scrubber.write_blocked_by_scrub(head, get_sort_bitwise())) {
     dout(20) << __func__ << ": waiting for scrub" << dendl;
@@ -6199,6 +6219,15 @@ int ReplicatedPG::_rollback_to(OpContext *ctx, ceph_osd_op& op)
 	obs.oi.set_omap_digest(rollback_to->obs.oi.omap_digest);
       else
 	obs.oi.clear_omap_digest();
+
+      if (rollback_to->obs.oi.is_omap()) {
+	dout(10) << __func__ << " setting omap flag on " << obs.oi.soid << dendl;
+	obs.oi.set_flag(object_info_t::FLAG_OMAP);
+      } else {
+	dout(10) << __func__ << " clearing omap flag on " << obs.oi.soid << dendl;
+	obs.oi.clear_flag(object_info_t::FLAG_OMAP);
+      }
+
       snapset.head_exists = true;
     }
   }
@@ -6919,10 +6948,6 @@ int ReplicatedPG::fill_in_copy_get(
     return result;
   }
 
-  if ((osd_op.op.copy_get.flags & CEPH_OSD_COPY_GET_FLAG_NOTSUPP_OMAP) &&
-      oi.is_omap())
-      return -EOPNOTSUPP;
-
   MOSDOp *op = reinterpret_cast<MOSDOp*>(ctx->op->get_req());
   uint64_t features = op->get_features();
 
@@ -7162,12 +7187,7 @@ void ReplicatedPG::_copy_some(ObjectContextRef obc, CopyOpRef cop)
     // it already!
     assert(cop->cursor.is_initial());
   }
-
-  uint32_t copyget_flags = 0;
-  if (!pool.info.supports_omap())
-   copyget_flags |= CEPH_OSD_COPY_GET_FLAG_NOTSUPP_OMAP;
-
-  op.copy_get(&cop->cursor, get_copy_chunk_size(), copyget_flags,
+  op.copy_get(&cop->cursor, get_copy_chunk_size(),
 	      &cop->results.object_size, &cop->results.mtime,
 	      &cop->attrs, &cop->data, &cop->omap_header, &cop->omap_data,
 	      &cop->results.snaps, &cop->results.snap_seq,
@@ -7330,6 +7350,16 @@ void ReplicatedPG::process_copy_chunk(hobject_t oid, ceph_tid_t tid, int r)
   copy_ops.erase(cobc->obs.oi.soid);
   cobc->stop_block();
 
+  if (r < 0 && cop->results.started_temp_obj) {
+    dout(10) << __func__ << " deleting partial temp object "
+	     << cop->results.temp_oid << dendl;
+    ObjectContextRef tempobc = get_object_context(cop->results.temp_oid, true);
+    OpContextUPtr ctx = simple_opc_create(tempobc);
+    ctx->op_t->remove(cop->results.temp_oid);
+    ctx->discard_temp_oid = cop->results.temp_oid;
+    simple_opc_submit(std::move(ctx));
+  }
+
   // cancel and requeue proxy ops on this object
   if (!r) {
     for (map<ceph_tid_t, ProxyReadOpRef>::iterator it = proxyread_ops.begin();
diff --git a/src/osdc/Objecter.cc b/src/osdc/Objecter.cc
index b208254..92a8698 100644
--- a/src/osdc/Objecter.cc
+++ b/src/osdc/Objecter.cc
@@ -542,10 +542,10 @@ void Objecter::_send_linger(LingerOp *info,
     }
     sl.unlock();
 
-    info->register_tid = _op_submit(o, sul);
+    _op_submit(o, sul, &info->register_tid);
   } else {
     // first send
-    info->register_tid = _op_submit_with_budget(o, sul);
+    _op_submit_with_budget(o, sul, &info->register_tid);
   }
 
   logger->inc(l_osdc_linger_send);
@@ -2125,14 +2125,18 @@ void Objecter::resend_mon_ops()
 
 // read | write ---------------------------
 
-ceph_tid_t Objecter::op_submit(Op *op, int *ctx_budget)
+void Objecter::op_submit(Op *op, ceph_tid_t *ptid, int *ctx_budget)
 {
   shunique_lock rl(rwlock, ceph::acquire_shared);
-  return _op_submit_with_budget(op, rl, ctx_budget);
+  ceph_tid_t tid = 0;
+  if (!ptid)
+    ptid = &tid;
+  _op_submit_with_budget(op, rl, ptid, ctx_budget);
 }
 
-ceph_tid_t Objecter::_op_submit_with_budget(Op *op, shunique_lock& sul,
-					    int *ctx_budget)
+void Objecter::_op_submit_with_budget(Op *op, shunique_lock& sul,
+				      ceph_tid_t *ptid,
+				      int *ctx_budget)
 {
   assert(initialized.read());
 
@@ -2160,7 +2164,7 @@ ceph_tid_t Objecter::_op_submit_with_budget(Op *op, shunique_lock& sul,
 				      op_cancel(tid, -ETIMEDOUT); });
   }
 
-  return _op_submit(op, sul);
+  _op_submit(op, sul, ptid);
 }
 
 void Objecter::_send_op_account(Op *op)
@@ -2242,7 +2246,7 @@ void Objecter::_send_op_account(Op *op)
   }
 }
 
-ceph_tid_t Objecter::_op_submit(Op *op, shunique_lock& sul)
+void Objecter::_op_submit(Op *op, shunique_lock& sul, ceph_tid_t *ptid)
 {
   // rwlock is locked
 
@@ -2335,6 +2339,8 @@ ceph_tid_t Objecter::_op_submit(Op *op, shunique_lock& sul)
   if (check_for_latest_map) {
     _send_op_map_check(op);
   }
+  if (ptid)
+    *ptid = tid;
   op = NULL;
 
   sl.unlock();
@@ -2342,8 +2348,6 @@ ceph_tid_t Objecter::_op_submit(Op *op, shunique_lock& sul)
 
   ldout(cct, 5) << num_unacked.read() << " unacked, " << num_uncommitted.read()
 		<< " uncommitted" << dendl;
-
-  return tid;
 }
 
 int Objecter::op_cancel(OSDSession *s, ceph_tid_t tid, int r)
@@ -3219,7 +3223,7 @@ void Objecter::handle_osd_op_reply(MOSDOpReply *m)
     m->get_redirect().combine_with_locator(op->target.target_oloc,
 					   op->target.target_oid.name);
     op->target.flags |= CEPH_OSD_FLAG_REDIRECTED;
-    _op_submit(op, sul);
+    _op_submit(op, sul, NULL);
     m->put();
     return;
   }
diff --git a/src/osdc/Objecter.h b/src/osdc/Objecter.h
index 05b29bb..6b519ff 100644
--- a/src/osdc/Objecter.h
+++ b/src/osdc/Objecter.h
@@ -747,7 +747,6 @@ struct ObjectOperation {
 
   void copy_get(object_copy_cursor_t *cursor,
 		uint64_t max,
-		uint32_t copyget_flags,
 		uint64_t *out_size,
 		ceph::real_time *out_mtime,
 		std::map<std::string,bufferlist> *out_attrs,
@@ -765,7 +764,6 @@ struct ObjectOperation {
 		int *prval) {
     OSDOp& osd_op = add_op(CEPH_OSD_OP_COPY_GET);
     osd_op.op.copy_get.max = max;
-    osd_op.op.copy_get.flags = copyget_flags;
     ::encode(*cursor, osd_op.indata);
     ::encode(max, osd_op.indata);
     unsigned p = ops.size() - 1;
@@ -2074,14 +2072,15 @@ private:
 private:
 
   // low-level
-  ceph_tid_t _op_submit(Op *op, shunique_lock& lc);
-  ceph_tid_t _op_submit_with_budget(Op *op, shunique_lock& lc,
-				    int *ctx_budget = NULL);
+  void _op_submit(Op *op, shunique_lock& lc, ceph_tid_t *ptid);
+  void _op_submit_with_budget(Op *op, shunique_lock& lc,
+			      ceph_tid_t *ptid,
+			      int *ctx_budget = NULL);
   inline void unregister_op(Op *op);
 
   // public interface
 public:
-  ceph_tid_t op_submit(Op *op, int *ctx_budget = NULL);
+  void op_submit(Op *op, ceph_tid_t *ptid = NULL, int *ctx_budget = NULL);
   bool is_active() {
     shared_lock l(rwlock);
     return !((!inflight_ops.read()) && linger_ops.empty() &&
@@ -2173,11 +2172,12 @@ public:
   }
 
   // mid-level helpers
-  Op *prepare_mutate_op(const object_t& oid, const object_locator_t& oloc,
-			ObjectOperation& op, const SnapContext& snapc,
-			ceph::real_time mtime, int flags, Context *onack,
-			Context *oncommit, version_t *objver = NULL,
-			osd_reqid_t reqid = osd_reqid_t()) {
+  Op *prepare_mutate_op(
+    const object_t& oid, const object_locator_t& oloc,
+    ObjectOperation& op, const SnapContext& snapc,
+    ceph::real_time mtime, int flags, Context *onack,
+    Context *oncommit, version_t *objver = NULL,
+    osd_reqid_t reqid = osd_reqid_t()) {
     Op *o = new Op(oid, oloc, op.ops, flags | global_op_flags.read() |
 		   CEPH_OSD_FLAG_WRITE, onack, oncommit, objver);
     o->priority = op.priority;
@@ -2187,20 +2187,25 @@ public:
     o->reqid = reqid;
     return o;
   }
-  ceph_tid_t mutate(const object_t& oid, const object_locator_t& oloc,
-		    ObjectOperation& op, const SnapContext& snapc,
-		    ceph::real_time mtime, int flags, Context *onack,
-		    Context *oncommit, version_t *objver = NULL,
-		    osd_reqid_t reqid = osd_reqid_t()) {
+  ceph_tid_t mutate(
+    const object_t& oid, const object_locator_t& oloc,
+    ObjectOperation& op, const SnapContext& snapc,
+    ceph::real_time mtime, int flags, Context *onack,
+    Context *oncommit, version_t *objver = NULL,
+    osd_reqid_t reqid = osd_reqid_t()) {
     Op *o = prepare_mutate_op(oid, oloc, op, snapc, mtime, flags, onack,
 			      oncommit, objver, reqid);
-    return op_submit(o);
-  }
-  Op *prepare_read_op(const object_t& oid, const object_locator_t& oloc,
-	     ObjectOperation& op,
-	     snapid_t snapid, bufferlist *pbl, int flags,
-	     Context *onack, version_t *objver = NULL,
-		      int *data_offset = NULL) {
+    ceph_tid_t tid;
+    op_submit(o, &tid);
+    return tid;
+  }
+  Op *prepare_read_op(
+    const object_t& oid, const object_locator_t& oloc,
+    ObjectOperation& op,
+    snapid_t snapid, bufferlist *pbl, int flags,
+    Context *onack, version_t *objver = NULL,
+    int *data_offset = NULL,
+    uint64_t features = 0) {
     Op *o = new Op(oid, oloc, op.ops, flags | global_op_flags.read() |
 		   CEPH_OSD_FLAG_READ, onack, NULL, objver, data_offset);
     o->priority = op.priority;
@@ -2213,22 +2218,26 @@ public:
     o->out_rval.swap(op.out_rval);
     return o;
   }
-  ceph_tid_t read(const object_t& oid, const object_locator_t& oloc,
-		  ObjectOperation& op,
-		  snapid_t snapid, bufferlist *pbl, int flags,
-		  Context *onack, version_t *objver = NULL,
-		  int *data_offset = NULL,
-		  uint64_t features = 0) {
+  ceph_tid_t read(
+    const object_t& oid, const object_locator_t& oloc,
+    ObjectOperation& op,
+    snapid_t snapid, bufferlist *pbl, int flags,
+    Context *onack, version_t *objver = NULL,
+    int *data_offset = NULL,
+    uint64_t features = 0) {
     Op *o = prepare_read_op(oid, oloc, op, snapid, pbl, flags, onack, objver,
 			    data_offset);
     if (features)
       o->features = features;
-    return op_submit(o);
-  }
-  ceph_tid_t pg_read(uint32_t hash, object_locator_t oloc,
-		     ObjectOperation& op, bufferlist *pbl, int flags,
-		     Context *onack, epoch_t *reply_epoch,
-		     int *ctx_budget) {
+    ceph_tid_t tid;
+    op_submit(o, &tid);
+    return tid;
+  }
+  Op *prepare_pg_read_op(
+    uint32_t hash, object_locator_t oloc,
+    ObjectOperation& op, bufferlist *pbl, int flags,
+    Context *onack, epoch_t *reply_epoch,
+    int *ctx_budget) {
     Op *o = new Op(object_t(), oloc,
 		   op.ops, flags | global_op_flags.read() | CEPH_OSD_FLAG_READ,
 		   onack, NULL, NULL);
@@ -2245,7 +2254,18 @@ public:
       // budget is tracked by listing context
       o->ctx_budgeted = true;
     }
-    return op_submit(o, ctx_budget);
+    return o;
+  }
+  ceph_tid_t pg_read(
+    uint32_t hash, object_locator_t oloc,
+    ObjectOperation& op, bufferlist *pbl, int flags,
+    Context *onack, epoch_t *reply_epoch,
+    int *ctx_budget) {
+    Op *o = prepare_pg_read_op(hash, oloc, op, pbl, flags,
+			       onack, reply_epoch, ctx_budget);
+    ceph_tid_t tid;
+    op_submit(o, &tid, ctx_budget);
+    return tid;
   }
 
   // caller owns a ref
@@ -2297,10 +2317,11 @@ public:
 
 
   // high-level helpers
-  ceph_tid_t stat(const object_t& oid, const object_locator_t& oloc,
-		  snapid_t snap, uint64_t *psize, ceph::real_time *pmtime,
-		  int flags, Context *onfinish, version_t *objver = NULL,
-		  ObjectOperation *extra_ops = NULL) {
+  Op *prepare_stat_op(
+    const object_t& oid, const object_locator_t& oloc,
+    snapid_t snap, uint64_t *psize, ceph::real_time *pmtime,
+    int flags, Context *onfinish, version_t *objver = NULL,
+    ObjectOperation *extra_ops = NULL) {
     vector<OSDOp> ops;
     int i = init_ops(ops, 1, extra_ops);
     ops[i].op.op = CEPH_OSD_OP_STAT;
@@ -2309,13 +2330,25 @@ public:
 		   CEPH_OSD_FLAG_READ, fin, 0, objver);
     o->snapid = snap;
     o->outbl = &fin->bl;
-    return op_submit(o);
+    return o;
+  }
+  ceph_tid_t stat(
+    const object_t& oid, const object_locator_t& oloc,
+    snapid_t snap, uint64_t *psize, ceph::real_time *pmtime,
+    int flags, Context *onfinish, version_t *objver = NULL,
+    ObjectOperation *extra_ops = NULL) {
+    Op *o = prepare_stat_op(oid, oloc, snap, psize, pmtime, flags,
+			    onfinish, objver, extra_ops);
+    ceph_tid_t tid;
+    op_submit(o, &tid);
+    return tid;
   }
 
-  ceph_tid_t read(const object_t& oid, const object_locator_t& oloc,
-		  uint64_t off, uint64_t len, snapid_t snap, bufferlist *pbl,
-		  int flags, Context *onfinish, version_t *objver = NULL,
-		  ObjectOperation *extra_ops = NULL, int op_flags = 0) {
+  Op *prepare_read_op(
+    const object_t& oid, const object_locator_t& oloc,
+    uint64_t off, uint64_t len, snapid_t snap, bufferlist *pbl,
+    int flags, Context *onfinish, version_t *objver = NULL,
+    ObjectOperation *extra_ops = NULL, int op_flags = 0) {
     vector<OSDOp> ops;
     int i = init_ops(ops, 1, extra_ops);
     ops[i].op.op = CEPH_OSD_OP_READ;
@@ -2328,7 +2361,18 @@ public:
 		   CEPH_OSD_FLAG_READ, onfinish, 0, objver);
     o->snapid = snap;
     o->outbl = pbl;
-    return op_submit(o);
+    return o;
+  }
+  ceph_tid_t read(  // builds the op with prepare_read_op(), submits it, returns tid
+    const object_t& oid, const object_locator_t& oloc,
+    uint64_t off, uint64_t len, snapid_t snap, bufferlist *pbl,
+    int flags, Context *onfinish, version_t *objver = NULL,
+    ObjectOperation *extra_ops = NULL, int op_flags = 0) {
+    Op *o = prepare_read_op(oid, oloc, off, len, snap, pbl, flags,
+			    onfinish, objver, extra_ops, op_flags);
+    ceph_tid_t tid;  // presumably filled in by op_submit() via &tid -- TODO confirm
+    op_submit(o, &tid);
+    return tid;
   }
 
   ceph_tid_t read_trunc(const object_t& oid, const object_locator_t& oloc,
@@ -2349,7 +2393,9 @@ public:
 		   CEPH_OSD_FLAG_READ, onfinish, 0, objver);
     o->snapid = snap;
     o->outbl = pbl;
-    return op_submit(o);
+    ceph_tid_t tid;
+    op_submit(o, &tid);
+    return tid;
   }
   ceph_tid_t mapext(const object_t& oid, const object_locator_t& oloc,
 		    uint64_t off, uint64_t len, snapid_t snap, bufferlist *pbl,
@@ -2366,7 +2412,9 @@ public:
 		   CEPH_OSD_FLAG_READ, onfinish, 0, objver);
     o->snapid = snap;
     o->outbl = pbl;
-    return op_submit(o);
+    ceph_tid_t tid;
+    op_submit(o, &tid);
+    return tid;
   }
   ceph_tid_t getxattr(const object_t& oid, const object_locator_t& oloc,
 	     const char *name, snapid_t snap, bufferlist *pbl, int flags,
@@ -2383,7 +2431,9 @@ public:
 		   CEPH_OSD_FLAG_READ, onfinish, 0, objver);
     o->snapid = snap;
     o->outbl = pbl;
-    return op_submit(o);
+    ceph_tid_t tid;
+    op_submit(o, &tid);
+    return tid;
   }
 
   ceph_tid_t getxattrs(const object_t& oid, const object_locator_t& oloc,
@@ -2398,7 +2448,9 @@ public:
 		   CEPH_OSD_FLAG_READ, fin, 0, objver);
     o->snapid = snap;
     o->outbl = &fin->bl;
-    return op_submit(o);
+    ceph_tid_t tid;
+    op_submit(o, &tid);
+    return tid;
   }
 
   ceph_tid_t read_full(const object_t& oid, const object_locator_t& oloc,
@@ -2420,13 +2472,16 @@ public:
 		   CEPH_OSD_FLAG_WRITE, onack, oncommit, objver);
     o->mtime = mtime;
     o->snapc = snapc;
-    return op_submit(o);
-  }
-  ceph_tid_t write(const object_t& oid, const object_locator_t& oloc,
-		   uint64_t off, uint64_t len, const SnapContext& snapc,
-		   const bufferlist &bl, ceph::real_time mtime, int flags,
-		   Context *onack, Context *oncommit, version_t *objver = NULL,
-		   ObjectOperation *extra_ops = NULL, int op_flags = 0) {
+    ceph_tid_t tid;
+    op_submit(o, &tid);
+    return tid;
+  }
+  Op *prepare_write_op(
+    const object_t& oid, const object_locator_t& oloc,
+    uint64_t off, uint64_t len, const SnapContext& snapc,
+    const bufferlist &bl, ceph::real_time mtime, int flags,
+    Context *onack, Context *oncommit, version_t *objver = NULL,
+    ObjectOperation *extra_ops = NULL, int op_flags = 0) {
     vector<OSDOp> ops;
     int i = init_ops(ops, 1, extra_ops);
     ops[i].op.op = CEPH_OSD_OP_WRITE;
@@ -2440,14 +2495,27 @@ public:
 		   CEPH_OSD_FLAG_WRITE, onack, oncommit, objver);
     o->mtime = mtime;
     o->snapc = snapc;
-    return op_submit(o);
+    return o;
   }
-  ceph_tid_t append(const object_t& oid, const object_locator_t& oloc,
-		    uint64_t len, const SnapContext& snapc,
-		    const bufferlist &bl, ceph::real_time mtime, int flags,
-		    Context *onack, Context *oncommit,
-		    version_t *objver = NULL,
-		    ObjectOperation *extra_ops = NULL) {
+  ceph_tid_t write(  // builds the op with prepare_write_op(), submits it, returns tid
+    const object_t& oid, const object_locator_t& oloc,
+    uint64_t off, uint64_t len, const SnapContext& snapc,
+    const bufferlist &bl, ceph::real_time mtime, int flags,
+    Context *onack, Context *oncommit, version_t *objver = NULL,
+    ObjectOperation *extra_ops = NULL, int op_flags = 0) {
+    Op *o = prepare_write_op(oid, oloc, off, len, snapc, bl, mtime, flags,
+			     onack, oncommit, objver, extra_ops, op_flags);
+    ceph_tid_t tid;  // presumably filled in by op_submit() via &tid -- TODO confirm
+    op_submit(o, &tid);
+    return tid;
+  }
+  Op *prepare_append_op(
+    const object_t& oid, const object_locator_t& oloc,
+    uint64_t len, const SnapContext& snapc,
+    const bufferlist &bl, ceph::real_time mtime, int flags,
+    Context *onack, Context *oncommit,
+    version_t *objver = NULL,
+    ObjectOperation *extra_ops = NULL) {
     vector<OSDOp> ops;
     int i = init_ops(ops, 1, extra_ops);
     ops[i].op.op = CEPH_OSD_OP_APPEND;
@@ -2460,7 +2528,20 @@ public:
 		   CEPH_OSD_FLAG_WRITE, onack, oncommit, objver);
     o->mtime = mtime;
     o->snapc = snapc;
-    return op_submit(o);
+    return o;
+  }
+  ceph_tid_t append(  // builds the op with prepare_append_op(), submits it, returns tid
+    const object_t& oid, const object_locator_t& oloc,
+    uint64_t len, const SnapContext& snapc,
+    const bufferlist &bl, ceph::real_time mtime, int flags,
+    Context *onack, Context *oncommit,
+    version_t *objver = NULL,
+    ObjectOperation *extra_ops = NULL) {
+    Op *o = prepare_append_op(oid, oloc, len, snapc, bl, mtime, flags,
+			      onack, oncommit, objver, extra_ops);
+    ceph_tid_t tid;  // presumably filled in by op_submit() via &tid -- TODO confirm
+    op_submit(o, &tid);
+    return tid;
   }
   ceph_tid_t write_trunc(const object_t& oid, const object_locator_t& oloc,
 			 uint64_t off, uint64_t len, const SnapContext& snapc,
@@ -2482,13 +2563,16 @@ public:
 		   CEPH_OSD_FLAG_WRITE, onack, oncommit, objver);
     o->mtime = mtime;
     o->snapc = snapc;
-    return op_submit(o);
-  }
-  ceph_tid_t write_full(const object_t& oid, const object_locator_t& oloc,
-			const SnapContext& snapc, const bufferlist &bl,
-			ceph::real_time mtime, int flags, Context *onack,
-			Context *oncommit, version_t *objver = NULL,
-			ObjectOperation *extra_ops = NULL, int op_flags = 0) {
+    ceph_tid_t tid;
+    op_submit(o, &tid);
+    return tid;
+  }
+  Op *prepare_write_full_op(
+    const object_t& oid, const object_locator_t& oloc,
+    const SnapContext& snapc, const bufferlist &bl,
+    ceph::real_time mtime, int flags, Context *onack,
+    Context *oncommit, version_t *objver = NULL,
+    ObjectOperation *extra_ops = NULL, int op_flags = 0) {
     vector<OSDOp> ops;
     int i = init_ops(ops, 1, extra_ops);
     ops[i].op.op = CEPH_OSD_OP_WRITEFULL;
@@ -2500,7 +2584,19 @@ public:
 		   CEPH_OSD_FLAG_WRITE, onack, oncommit, objver);
     o->mtime = mtime;
     o->snapc = snapc;
-    return op_submit(o);
+    return o;
+  }
+  ceph_tid_t write_full(  // builds the op with prepare_write_full_op(), submits it, returns tid
+    const object_t& oid, const object_locator_t& oloc,
+    const SnapContext& snapc, const bufferlist &bl,
+    ceph::real_time mtime, int flags, Context *onack,
+    Context *oncommit, version_t *objver = NULL,
+    ObjectOperation *extra_ops = NULL, int op_flags = 0) {
+    Op *o = prepare_write_full_op(oid, oloc, snapc, bl, mtime, flags,
+				  onack, oncommit, objver, extra_ops, op_flags);
+    ceph_tid_t tid;  // presumably filled in by op_submit() via &tid -- TODO confirm
+    op_submit(o, &tid);
+    return tid;
   }
   ceph_tid_t trunc(const object_t& oid, const object_locator_t& oloc,
 		   const SnapContext& snapc, ceph::real_time mtime, int flags,
@@ -2517,7 +2613,9 @@ public:
 		   CEPH_OSD_FLAG_WRITE, onack, oncommit, objver);
     o->mtime = mtime;
     o->snapc = snapc;
-    return op_submit(o);
+    ceph_tid_t tid;
+    op_submit(o, &tid);
+    return tid;
   }
   ceph_tid_t zero(const object_t& oid, const object_locator_t& oloc,
 		  uint64_t off, uint64_t len, const SnapContext& snapc,
@@ -2532,7 +2630,9 @@ public:
 		   CEPH_OSD_FLAG_WRITE, onack, oncommit, objver);
     o->mtime = mtime;
     o->snapc = snapc;
-    return op_submit(o);
+    ceph_tid_t tid;
+    op_submit(o, &tid);
+    return tid;
   }
   ceph_tid_t rollback_object(const object_t& oid, const object_locator_t& oloc,
 			     const SnapContext& snapc, snapid_t snapid,
@@ -2547,7 +2647,9 @@ public:
 		   objver);
     o->mtime = mtime;
     o->snapc = snapc;
-    return op_submit(o);
+    ceph_tid_t tid;
+    op_submit(o, &tid);
+    return tid;
   }
   ceph_tid_t create(const object_t& oid, const object_locator_t& oloc,
 		    const SnapContext& snapc, ceph::real_time mtime, int global_flags,
@@ -2562,12 +2664,15 @@ public:
 		   CEPH_OSD_FLAG_WRITE, onack, oncommit, objver);
     o->mtime = mtime;
     o->snapc = snapc;
-    return op_submit(o);
-  }
-  ceph_tid_t remove(const object_t& oid, const object_locator_t& oloc,
-	       const SnapContext& snapc, ceph::real_time mtime, int flags,
-	       Context *onack, Context *oncommit,
-	       version_t *objver = NULL, ObjectOperation *extra_ops = NULL) {
+    ceph_tid_t tid;
+    op_submit(o, &tid);
+    return tid;
+  }
+  Op *prepare_remove_op(
+    const object_t& oid, const object_locator_t& oloc,
+    const SnapContext& snapc, ceph::real_time mtime, int flags,
+    Context *onack, Context *oncommit,
+    version_t *objver = NULL, ObjectOperation *extra_ops = NULL) {
     vector<OSDOp> ops;
     int i = init_ops(ops, 1, extra_ops);
     ops[i].op.op = CEPH_OSD_OP_DELETE;
@@ -2575,7 +2680,18 @@ public:
 		   CEPH_OSD_FLAG_WRITE, onack, oncommit, objver);
     o->mtime = mtime;
     o->snapc = snapc;
-    return op_submit(o);
+    return o;
+  }
+  ceph_tid_t remove(  // builds the op with prepare_remove_op(), submits it, returns tid
+    const object_t& oid, const object_locator_t& oloc,
+    const SnapContext& snapc, ceph::real_time mtime, int flags,
+    Context *onack, Context *oncommit,
+    version_t *objver = NULL, ObjectOperation *extra_ops = NULL) {
+    Op *o = prepare_remove_op(oid, oloc, snapc, mtime, flags,
+			      onack, oncommit, objver, extra_ops);
+    ceph_tid_t tid;  // presumably filled in by op_submit() via &tid -- TODO confirm
+    op_submit(o, &tid);
+    return tid;
   }
 
   ceph_tid_t setxattr(const object_t& oid, const object_locator_t& oloc,
@@ -2595,7 +2711,9 @@ public:
 		   CEPH_OSD_FLAG_WRITE, onack, oncommit, objver);
     o->mtime = mtime;
     o->snapc = snapc;
-    return op_submit(o);
+    ceph_tid_t tid;
+    op_submit(o, &tid);
+    return tid;
   }
   ceph_tid_t removexattr(const object_t& oid, const object_locator_t& oloc,
 	      const char *name, const SnapContext& snapc,
@@ -2613,7 +2731,9 @@ public:
 		   CEPH_OSD_FLAG_WRITE, onack, oncommit, objver);
     o->mtime = mtime;
     o->snapc = snapc;
-    return op_submit(o);
+    ceph_tid_t tid;
+    op_submit(o, &tid);
+    return tid;
   }
 
   void list_nobjects(NListContext *p, Context *onfinish);
diff --git a/src/pybind/ceph_rest_api.py b/src/pybind/ceph_rest_api.py
index 7792013..2dfe6b6 100755
--- a/src/pybind/ceph_rest_api.py
+++ b/src/pybind/ceph_rest_api.py
@@ -224,15 +224,6 @@ def generate_url_and_params(app, sig, flavor):
         # prefixes go in the URL path
         if desc.t == CephPrefix:
             url += '/' + desc.instance.prefix
-        # CephChoices with 1 required string (not --) do too, unless
-        # we've already started collecting params, in which case they
-        # too are params
-        elif (desc.t == CephChoices and
-              len(desc.instance.strings) == 1 and
-              desc.req and
-              not str(desc.instance).startswith('--') and
-              not params):
-            url += '/' + str(desc.instance)
         else:
             # tell/<target> is a weird case; the URL includes what
             # would everywhere else be a parameter
diff --git a/src/rgw/librgw.cc b/src/rgw/librgw.cc
index 220d024..37414fc 100644
--- a/src/rgw/librgw.cc
+++ b/src/rgw/librgw.cc
@@ -545,16 +545,29 @@ namespace rgw {
   }
 
   int RGWLibRequest::read_permissions(RGWOp* op) {
+    /* build bucket policies first; object policies follow unless only_bucket() */
     int ret =
       rgw_build_bucket_policies(rgwlib.get_store(), get_state());
     if (ret < 0) {
-      ldout(get_state()->cct, 10) << "read_permissions on "
+      ldout(get_state()->cct, 10) << "read_permissions (bucket policy) on "
 				  << get_state()->bucket << ":"
 				  << get_state()->object
 				  << " only_bucket=" << only_bucket()
 				  << " ret=" << ret << dendl;
       if (ret == -ENODATA)
 	ret = -EACCES;
+    } else if (! only_bucket()) {
+      /* object ops: -ENODATA from either policy lookup is returned as -EACCES */
+      ret = rgw_build_object_policies(rgwlib.get_store(), get_state(),
+				      op->prefetch_data());
+      if (ret < 0) {
+	ldout(get_state()->cct, 10) << "read_permissions (object policy) on "
+				    << get_state()->bucket << ":"
+				    << get_state()->object
+				    << " ret=" << ret << dendl;
+	if (ret == -ENODATA)
+	  ret = -EACCES;
+      }
     }
     return ret;
   } /* RGWLibRequest::read_permissions */
diff --git a/src/rgw/rgw_admin.cc b/src/rgw/rgw_admin.cc
index 1f3cb61..f95afc6 100644
--- a/src/rgw/rgw_admin.cc
+++ b/src/rgw/rgw_admin.cc
@@ -2293,7 +2293,6 @@ int main(int argc, char **argv)
     } else if (ceph_argparse_witharg(args, i, &val, "--zone-new-name", (char*)NULL)) {
       zone_new_name = val;
     } else if (ceph_argparse_witharg(args, i, &val, "--endpoints", (char*)NULL)) {
-      list<string>::iterator iter;
       get_str_list(val, endpoints);
     } else if (ceph_argparse_witharg(args, i, &val, "--source-zone", (char*)NULL)) {
       source_zone_name = val;
@@ -2308,7 +2307,7 @@ int main(int argc, char **argv)
     tenant = user_id.tenant;
   } else {
     if (user_id.empty()) {
-      cerr << "ERROR: --tennant is set, but there's no user ID" << std::endl;
+      cerr << "ERROR: --tenant is set, but there's no user ID" << std::endl;
       return EINVAL;
     }
     user_id.tenant = tenant;
@@ -2453,7 +2452,7 @@ int main(int argc, char **argv)
     case OPT_PERIOD_DELETE:
       {
 	if (period_id.empty()) {
-	  cerr << "missing realm name or id" << std::endl;
+	  cerr << "missing period id" << std::endl;
 	  return -EINVAL;
 	}
 	RGWPeriod period(period_id);
@@ -3315,13 +3314,18 @@ int main(int argc, char **argv)
 	if(zone.realm_id.empty()) {
 	  RGWRealm realm(realm_id, realm_name);
 	  int ret = realm.init(g_ceph_context, store);
-	  if (ret < 0) {
+	  if (ret < 0 && ret != -ENOENT) {
 	    cerr << "failed to init realm: " << cpp_strerror(-ret) << std::endl;
 	    return -ret;
 	  }
 	  zone.realm_id = realm.get_id();
 	}
 
+	if( !zone_name.empty() && !zone.get_name().empty() && zone.get_name() != zone_name) {
+	  cerr << "Error: zone name" << zone_name << " is different than the zone name " << zone.get_name() << " in the provided json " << std::endl;
+	  return -EINVAL;
+	}
+
         if (zone.get_name().empty()) {
           zone.set_name(zone_name);
           if (zone.get_name().empty()) {
@@ -3330,11 +3334,6 @@ int main(int argc, char **argv)
           }
         }
 
-	if(zone.get_name() != zone_name) {
-	  cerr << "Error: zone name" << zone_name << " is different than the zone name " << zone.get_name() << " in the provided json " << std::endl;
-	  return -EINVAL;
-	}
-
         zone_name = zone.get_name();
 
         if (zone.get_id().empty()) {
diff --git a/src/rgw/rgw_common.h b/src/rgw/rgw_common.h
index 90d2ae5..037703e 100644
--- a/src/rgw/rgw_common.h
+++ b/src/rgw/rgw_common.h
@@ -1081,6 +1081,7 @@ struct req_info {
   const char *method;
   string script_uri;
   string request_uri;
+  string request_uri_aws4;
   string effective_uri;
   string request_params;
   string domain;
diff --git a/src/rgw/rgw_ldap.h b/src/rgw/rgw_ldap.h
index 46b05ff..02eb61e 100644
--- a/src/rgw/rgw_ldap.h
+++ b/src/rgw/rgw_ldap.h
@@ -4,8 +4,10 @@
 #ifndef RGW_LDAP_H
 #define RGW_LDAP_H
 
+#if defined(HAVE_OPENLDAP)
 #define LDAP_DEPRECATED 1
 #include "ldap.h"
+#endif
 
 #include <stdint.h>
 #include <tuple>
diff --git a/src/rgw/rgw_op.cc b/src/rgw/rgw_op.cc
index 66ecd55..c781eb9 100644
--- a/src/rgw/rgw_op.cc
+++ b/src/rgw/rgw_op.cc
@@ -398,7 +398,8 @@ int rgw_build_bucket_policies(RGWRados* store, struct req_state* s)
  * only_bucket: If true, reads the bucket ACL rather than the object ACL.
  * Returns: 0 on success, -ERR# otherwise.
  */
-static int rgw_build_object_policies(RGWRados *store, struct req_state *s, bool prefetch_data)
+int rgw_build_object_policies(RGWRados *store, struct req_state *s,
+			      bool prefetch_data)
 {
   int ret = 0;
 
@@ -2994,7 +2995,8 @@ void RGWDeleteObj::execute()
     return;
   }
 
-  rgw_obj obj(s->bucket, s->object);
+  rgw_obj obj(s->bucket, s->object.name);
+  obj.set_instance(s->object.instance);
   map<string, bufferlist> attrs;
 
 
diff --git a/src/rgw/rgw_op.h b/src/rgw/rgw_op.h
index e3ecd60..825dd93 100644
--- a/src/rgw/rgw_op.h
+++ b/src/rgw/rgw_op.h
@@ -1338,6 +1338,8 @@ public:
 };
 
 extern int rgw_build_bucket_policies(RGWRados* store, struct req_state* s);
+extern int rgw_build_object_policies(RGWRados *store, struct req_state *s,
+				    bool prefetch_data);
 
 static inline int put_data_and_throttle(RGWPutObjProcessor *processor,
 					bufferlist& data, off_t ofs,
diff --git a/src/rgw/rgw_rest.cc b/src/rgw/rgw_rest.cc
index b60999f..a165b65 100644
--- a/src/rgw/rgw_rest.cc
+++ b/src/rgw/rgw_rest.cc
@@ -1590,6 +1590,11 @@ int RGWREST::preprocess(struct req_state *s, RGWClientIO* cio)
 {
   req_info& info = s->info;
 
+  /* save the request uri used to hash on the client side. request_uri may suffer
+     modifications as part of the bucket encoding in the subdomain calling format.
+     request_uri_aws4 will be used under aws4 auth */
+  s->info.request_uri_aws4 = s->info.request_uri;
+
   s->cio = cio;
   if (info.host.size()) {
     ldout(s->cct, 10) << "host=" << info.host << dendl;
diff --git a/src/rgw/rgw_rest_conn.cc b/src/rgw/rgw_rest_conn.cc
index f45e94a..78fe0ae 100644
--- a/src/rgw/rgw_rest_conn.cc
+++ b/src/rgw/rgw_rest_conn.cc
@@ -199,7 +199,7 @@ int RGWRESTConn::get_resource(const string& resource,
 
   map<string, string> headers;
   if (extra_headers) {
-    headers.insert(extra_params->begin(), extra_params->end());
+    headers.insert(extra_headers->begin(), extra_headers->end());
   }
 
   ret = req.get_resource(key, headers, resource, mgr);
diff --git a/src/rgw/rgw_rest_s3.cc b/src/rgw/rgw_rest_s3.cc
index e4731ba..c912fc2 100644
--- a/src/rgw/rgw_rest_s3.cc
+++ b/src/rgw/rgw_rest_s3.cc
@@ -3409,7 +3409,7 @@ int RGW_Auth_S3::authorize_v4(RGWRados *store, struct req_state *s)
    * that SigV4 typically does. this code follows the same approach that boto library
    * see auth.py:canonical_uri(...) */
 
-  s->aws4_auth->canonical_uri = s->info.request_uri;
+  s->aws4_auth->canonical_uri = s->info.request_uri_aws4;
 
   if (s->aws4_auth->canonical_uri.empty()) {
     s->aws4_auth->canonical_uri = "/";
@@ -3437,8 +3437,20 @@ int RGW_Auth_S3::authorize_v4(RGWRados *store, struct req_state *s)
         string encoded_key;
         string encoded_val;
         if (key != "X-Amz-Credential") {
-          aws4_uri_encode(key, encoded_key);
-          aws4_uri_encode(val, encoded_val);
+          string key_decoded;
+          url_decode(key, key_decoded);
+          if (key.length() != key_decoded.length()) {
+            encoded_key = key;
+          } else {
+            aws4_uri_encode(key, encoded_key);
+          }
+          string val_decoded;
+          url_decode(val, val_decoded);
+          if (val.length() != val_decoded.length()) {
+            encoded_val = val;
+          } else {
+            aws4_uri_encode(val, encoded_val);
+          }
         } else {
           encoded_key = key;
           encoded_val = val;
diff --git a/src/rgw/rgw_sync.cc b/src/rgw/rgw_sync.cc
index 38f92ed..922a904 100644
--- a/src/rgw/rgw_sync.cc
+++ b/src/rgw/rgw_sync.cc
@@ -1868,13 +1868,28 @@ int RGWRemoteMetaLog::run_sync()
   }
 
   RGWObjectCtx obj_ctx(store, NULL);
+  int r = 0;
 
   // get shard count and oldest log period from master
   rgw_mdlog_info mdlog_info;
-  int r = read_log_info(&mdlog_info);
-  if (r < 0) {
-    lderr(store->ctx()) << "ERROR: fail to fetch master log info (r=" << r << ")" << dendl;
-    return r;
+  for (;;) {
+    if (going_down.read()) {
+      ldout(store->ctx(), 1) << __func__ << "(): going down" << dendl;
+      return 0;
+    }
+    r = read_log_info(&mdlog_info);
+    if (r == -EIO) {
+      // keep retrying if master isn't alive
+      ldout(store->ctx(), 10) << __func__ << "(): waiting for master.." << dendl;
+      backoff.backoff_sleep();
+      continue;
+    }
+    backoff.reset();
+    if (r < 0) {
+      lderr(store->ctx()) << "ERROR: fail to fetch master log info (r=" << r << ")" << dendl;
+      return r;
+    }
+    break;
   }
 
   do {
@@ -1924,7 +1939,7 @@ int RGWRemoteMetaLog::run_sync()
   auto num_shards = sync_status.sync_info.num_shards;
   if (num_shards != mdlog_info.num_shards) {
     lderr(store->ctx()) << "ERROR: can't sync, mismatch between num shards, master num_shards=" << mdlog_info.num_shards << " local num_shards=" << num_shards << dendl;
-    return r;
+    return -EINVAL;
   }
 
   RGWPeriodHistory::Cursor cursor;
diff --git a/src/test/cli/ceph-authtool/help.t b/src/test/cli/ceph-authtool/help.t
index 062c967..9a6c883 100644
--- a/src/test/cli/ceph-authtool/help.t
+++ b/src/test/cli/ceph-authtool/help.t
@@ -14,12 +14,13 @@
                                   specified entityname
     --gen-print-key               will generate a new secret key without set it
                                   to the keyringfile, prints the secret to stdout
-    --import-keyring              will import the content of a given keyring
+    --import-keyring FILE         will import the content of a given keyring
                                   into the keyringfile
-    -u, --set-uid                 sets the auid (authenticated user id) for the
+    -n NAME, --name NAME          specify entityname to operate on
+    -u AUID, --set-uid AUID       sets the auid (authenticated user id) for the
                                   specified entityname
-    -a, --add-key                 will add an encoded key to the keyring
-    --cap subsystem capability    will set the capability for given subsystem
-    --caps capsfile               will set all of capabilities associated with a
+    -a BASE64, --add-key BASE64   will add an encoded key to the keyring
+    --cap SUBSYSTEM CAPABILITY    will set the capability for given subsystem
+    --caps CAPSFILE               will set all of capabilities associated with a
                                   given key, for all subsystems
   [1]
diff --git a/src/test/cli/ceph-authtool/manpage.t b/src/test/cli/ceph-authtool/manpage.t
index a9e1408..f84b794 100644
--- a/src/test/cli/ceph-authtool/manpage.t
+++ b/src/test/cli/ceph-authtool/manpage.t
@@ -13,13 +13,14 @@
                                   specified entityname
     --gen-print-key               will generate a new secret key without set it
                                   to the keyringfile, prints the secret to stdout
-    --import-keyring              will import the content of a given keyring
+    --import-keyring FILE         will import the content of a given keyring
                                   into the keyringfile
-    -u, --set-uid                 sets the auid (authenticated user id) for the
+    -n NAME, --name NAME          specify entityname to operate on
+    -u AUID, --set-uid AUID       sets the auid (authenticated user id) for the
                                   specified entityname
-    -a, --add-key                 will add an encoded key to the keyring
-    --cap subsystem capability    will set the capability for given subsystem
-    --caps capsfile               will set all of capabilities associated with a
+    -a BASE64, --add-key BASE64   will add an encoded key to the keyring
+    --cap SUBSYSTEM CAPABILITY    will set the capability for given subsystem
+    --caps CAPSFILE               will set all of capabilities associated with a
                                   given key, for all subsystems
   [1]
 
diff --git a/src/test/cli/ceph-authtool/simple.t b/src/test/cli/ceph-authtool/simple.t
index b86476a..35905ad 100644
--- a/src/test/cli/ceph-authtool/simple.t
+++ b/src/test/cli/ceph-authtool/simple.t
@@ -13,12 +13,13 @@
                                   specified entityname
     --gen-print-key               will generate a new secret key without set it
                                   to the keyringfile, prints the secret to stdout
-    --import-keyring              will import the content of a given keyring
+    --import-keyring FILE         will import the content of a given keyring
                                   into the keyringfile
-    -u, --set-uid                 sets the auid (authenticated user id) for the
+    -n NAME, --name NAME          specify entityname to operate on
+    -u AUID, --set-uid AUID       sets the auid (authenticated user id) for the
                                   specified entityname
-    -a, --add-key                 will add an encoded key to the keyring
-    --cap subsystem capability    will set the capability for given subsystem
-    --caps capsfile               will set all of capabilities associated with a
+    -a BASE64, --add-key BASE64   will add an encoded key to the keyring
+    --cap SUBSYSTEM CAPABILITY    will set the capability for given subsystem
+    --caps CAPSFILE               will set all of capabilities associated with a
                                   given key, for all subsystems
   [1]
diff --git a/src/test/encoding/check-generated.sh b/src/test/encoding/check-generated.sh
index ee55fab..c34fce8 100755
--- a/src/test/encoding/check-generated.sh
+++ b/src/test/encoding/check-generated.sh
@@ -1,4 +1,4 @@
-#!/bin/sh -e
+#!/bin/bash -e
 
 source ../qa/workunits/ceph-helpers.sh
 
diff --git a/src/test/encoding/types.h b/src/test/encoding/types.h
index 11dfc50..22919bd 100644
--- a/src/test/encoding/types.h
+++ b/src/test/encoding/types.h
@@ -192,6 +192,10 @@ TYPE_FEATUREFUL(InodeStore)
 TYPE_FEATUREFUL(MDSMap)
 TYPE_FEATUREFUL(MDSMap::mds_info_t)
 
+#include "mds/FSMap.h"
+//TYPE_FEATUREFUL(Filesystem)
+TYPE_FEATUREFUL(FSMap)
+
 #include "mds/Capability.h"
 TYPE_NOCOPY(Capability)
 
diff --git a/src/test/journal/test_FutureImpl.cc b/src/test/journal/test_FutureImpl.cc
index 51e19cf..eb5f806 100644
--- a/src/test/journal/test_FutureImpl.cc
+++ b/src/test/journal/test_FutureImpl.cc
@@ -25,12 +25,11 @@ public:
     }
   };
 
-  journal::FutureImplPtr create_future(journal::JournalMetadataPtr metadata,
-                                       uint64_t tag_tid, uint64_t entry_tid,
+  journal::FutureImplPtr create_future(uint64_t tag_tid, uint64_t entry_tid,
                                        uint64_t commit_tid,
                                        const journal::FutureImplPtr &prev =
                                          journal::FutureImplPtr()) {
-    journal::FutureImplPtr future(new journal::FutureImpl(metadata, tag_tid,
+    journal::FutureImplPtr future(new journal::FutureImpl(tag_tid,
                                                           entry_tid,
                                                           commit_tid));
     future->init(prev);
@@ -50,7 +49,7 @@ TEST_F(TestFutureImpl, Getters) {
   journal::JournalMetadataPtr metadata = create_metadata(oid);
   ASSERT_EQ(0, init_metadata(metadata));
 
-  journal::FutureImplPtr future = create_future(metadata, 234, 123, 456);
+  journal::FutureImplPtr future = create_future(234, 123, 456);
   ASSERT_EQ(234U, future->get_tag_tid());
   ASSERT_EQ(123U, future->get_entry_tid());
   ASSERT_EQ(456U, future->get_commit_tid());
@@ -63,7 +62,7 @@ TEST_F(TestFutureImpl, Attach) {
   journal::JournalMetadataPtr metadata = create_metadata(oid);
   ASSERT_EQ(0, init_metadata(metadata));
 
-  journal::FutureImplPtr future = create_future(metadata, 234, 123, 456);
+  journal::FutureImplPtr future = create_future(234, 123, 456);
   ASSERT_FALSE(future->attach(&m_flush_handler));
   ASSERT_EQ(1U, m_flush_handler.refs);
 }
@@ -75,7 +74,7 @@ TEST_F(TestFutureImpl, AttachWithPendingFlush) {
   journal::JournalMetadataPtr metadata = create_metadata(oid);
   ASSERT_EQ(0, init_metadata(metadata));
 
-  journal::FutureImplPtr future = create_future(metadata, 234, 123, 456);
+  journal::FutureImplPtr future = create_future(234, 123, 456);
   future->flush(NULL);
 
   ASSERT_TRUE(future->attach(&m_flush_handler));
@@ -89,7 +88,7 @@ TEST_F(TestFutureImpl, Detach) {
   journal::JournalMetadataPtr metadata = create_metadata(oid);
   ASSERT_EQ(0, init_metadata(metadata));
 
-  journal::FutureImplPtr future = create_future(metadata, 234, 123, 456);
+  journal::FutureImplPtr future = create_future(234, 123, 456);
   ASSERT_FALSE(future->attach(&m_flush_handler));
   future->detach();
   ASSERT_EQ(0U, m_flush_handler.refs);
@@ -102,7 +101,7 @@ TEST_F(TestFutureImpl, DetachImplicit) {
   journal::JournalMetadataPtr metadata = create_metadata(oid);
   ASSERT_EQ(0, init_metadata(metadata));
 
-  journal::FutureImplPtr future = create_future(metadata, 234, 123, 456);
+  journal::FutureImplPtr future = create_future(234, 123, 456);
   ASSERT_FALSE(future->attach(&m_flush_handler));
   future.reset();
   ASSERT_EQ(0U, m_flush_handler.refs);
@@ -115,7 +114,7 @@ TEST_F(TestFutureImpl, Flush) {
   journal::JournalMetadataPtr metadata = create_metadata(oid);
   ASSERT_EQ(0, init_metadata(metadata));
 
-  journal::FutureImplPtr future = create_future(metadata, 234, 123, 456);
+  journal::FutureImplPtr future = create_future(234, 123, 456);
   ASSERT_FALSE(future->attach(&m_flush_handler));
 
   C_SaferCond cond;
@@ -133,7 +132,7 @@ TEST_F(TestFutureImpl, FlushWithoutContext) {
   journal::JournalMetadataPtr metadata = create_metadata(oid);
   ASSERT_EQ(0, init_metadata(metadata));
 
-  journal::FutureImplPtr future = create_future(metadata, 234, 123, 456);
+  journal::FutureImplPtr future = create_future(234, 123, 456);
   ASSERT_FALSE(future->attach(&m_flush_handler));
 
   future->flush(NULL);
@@ -150,10 +149,10 @@ TEST_F(TestFutureImpl, FlushChain) {
   journal::JournalMetadataPtr metadata = create_metadata(oid);
   ASSERT_EQ(0, init_metadata(metadata));
 
-  journal::FutureImplPtr future1 = create_future(metadata, 234, 123, 456);
-  journal::FutureImplPtr future2 = create_future(metadata, 234, 124, 457,
+  journal::FutureImplPtr future1 = create_future(234, 123, 456);
+  journal::FutureImplPtr future2 = create_future(234, 124, 457,
                                                  future1);
-  journal::FutureImplPtr future3 = create_future(metadata, 235, 1, 458,
+  journal::FutureImplPtr future3 = create_future(235, 1, 458,
                                                  future2);
   ASSERT_FALSE(future1->attach(&m_flush_handler));
   ASSERT_FALSE(future2->attach(&m_flush_handler));
@@ -184,8 +183,8 @@ TEST_F(TestFutureImpl, FlushInProgress) {
   journal::JournalMetadataPtr metadata = create_metadata(oid);
   ASSERT_EQ(0, init_metadata(metadata));
 
-  journal::FutureImplPtr future1 = create_future(metadata, 234, 123, 456);
-  journal::FutureImplPtr future2 = create_future(metadata, 234, 124, 457,
+  journal::FutureImplPtr future1 = create_future(234, 123, 456);
+  journal::FutureImplPtr future2 = create_future(234, 124, 457,
                                                  future1);
   ASSERT_FALSE(future1->attach(&m_flush_handler));
   ASSERT_FALSE(future2->attach(&m_flush_handler));
@@ -206,7 +205,7 @@ TEST_F(TestFutureImpl, FlushAlreadyComplete) {
   journal::JournalMetadataPtr metadata = create_metadata(oid);
   ASSERT_EQ(0, init_metadata(metadata));
 
-  journal::FutureImplPtr future = create_future(metadata, 234, 123, 456);
+  journal::FutureImplPtr future = create_future(234, 123, 456);
   future->safe(-EIO);
 
   C_SaferCond cond;
@@ -221,7 +220,7 @@ TEST_F(TestFutureImpl, Wait) {
   journal::JournalMetadataPtr metadata = create_metadata(oid);
   ASSERT_EQ(0, init_metadata(metadata));
 
-  journal::FutureImplPtr future = create_future(metadata, 234, 1, 456);
+  journal::FutureImplPtr future = create_future(234, 1, 456);
 
   C_SaferCond cond;
   future->wait(&cond);
@@ -236,7 +235,7 @@ TEST_F(TestFutureImpl, WaitAlreadyComplete) {
   journal::JournalMetadataPtr metadata = create_metadata(oid);
   ASSERT_EQ(0, init_metadata(metadata));
 
-  journal::FutureImplPtr future = create_future(metadata, 234, 1, 456);
+  journal::FutureImplPtr future = create_future(234, 1, 456);
   future->safe(-EEXIST);
 
   C_SaferCond cond;
@@ -251,8 +250,8 @@ TEST_F(TestFutureImpl, SafePreservesError) {
   journal::JournalMetadataPtr metadata = create_metadata(oid);
   ASSERT_EQ(0, init_metadata(metadata));
 
-  journal::FutureImplPtr future1 = create_future(metadata, 234, 123, 456);
-  journal::FutureImplPtr future2 = create_future(metadata, 234, 124, 457,
+  journal::FutureImplPtr future1 = create_future(234, 123, 456);
+  journal::FutureImplPtr future2 = create_future(234, 124, 457,
                                                  future1);
 
   future1->safe(-EIO);
@@ -268,8 +267,8 @@ TEST_F(TestFutureImpl, ConsistentPreservesError) {
   journal::JournalMetadataPtr metadata = create_metadata(oid);
   ASSERT_EQ(0, init_metadata(metadata));
 
-  journal::FutureImplPtr future1 = create_future(metadata, 234, 123, 456);
-  journal::FutureImplPtr future2 = create_future(metadata, 234, 124, 457,
+  journal::FutureImplPtr future1 = create_future(234, 123, 456);
+  journal::FutureImplPtr future2 = create_future(234, 124, 457,
                                                  future1);
 
   future2->safe(-EEXIST);
diff --git a/src/test/journal/test_ObjectRecorder.cc b/src/test/journal/test_ObjectRecorder.cc
index 65d74b6..f26e526 100644
--- a/src/test/journal/test_ObjectRecorder.cc
+++ b/src/test/journal/test_ObjectRecorder.cc
@@ -67,11 +67,10 @@ public:
     m_flush_age = i;
   }
 
-  journal::AppendBuffer create_append_buffer(journal::JournalMetadataPtr metadata,
-                                             uint64_t tag_tid, uint64_t entry_tid,
+  journal::AppendBuffer create_append_buffer(uint64_t tag_tid, uint64_t entry_tid,
                                              const std::string &payload) {
-    journal::FutureImplPtr future(new journal::FutureImpl(metadata, tag_tid,
-                                                          entry_tid, 456));
+    journal::FutureImplPtr future(new journal::FutureImpl(tag_tid, entry_tid,
+                                                          456));
     future->init(journal::FutureImplPtr());
 
     bufferlist bl;
@@ -98,16 +97,14 @@ TEST_F(TestObjectRecorder, Append) {
 
   journal::ObjectRecorderPtr object = create_object(oid, 24);
 
-  journal::AppendBuffer append_buffer1 = create_append_buffer(metadata,
-                                                              234, 123,
+  journal::AppendBuffer append_buffer1 = create_append_buffer(234, 123,
                                                               "payload");
   journal::AppendBuffers append_buffers;
   append_buffers = {append_buffer1};
   ASSERT_FALSE(object->append(append_buffers));
   ASSERT_EQ(1U, object->get_pending_appends());
 
-  journal::AppendBuffer append_buffer2 = create_append_buffer(metadata,
-                                                              234, 124,
+  journal::AppendBuffer append_buffer2 = create_append_buffer(234, 124,
                                                               "payload");
   append_buffers = {append_buffer2};
   ASSERT_FALSE(object->append(append_buffers));
@@ -129,16 +126,14 @@ TEST_F(TestObjectRecorder, AppendFlushByCount) {
   set_flush_interval(2);
   journal::ObjectRecorderPtr object = create_object(oid, 24);
 
-  journal::AppendBuffer append_buffer1 = create_append_buffer(metadata,
-                                                              234, 123,
+  journal::AppendBuffer append_buffer1 = create_append_buffer(234, 123,
                                                               "payload");
   journal::AppendBuffers append_buffers;
   append_buffers = {append_buffer1};
   ASSERT_FALSE(object->append(append_buffers));
   ASSERT_EQ(1U, object->get_pending_appends());
 
-  journal::AppendBuffer append_buffer2 = create_append_buffer(metadata,
-                                                              234, 124,
+  journal::AppendBuffer append_buffer2 = create_append_buffer(234, 124,
                                                               "payload");
   append_buffers = {append_buffer2};
   ASSERT_FALSE(object->append(append_buffers));
@@ -159,16 +154,14 @@ TEST_F(TestObjectRecorder, AppendFlushByBytes) {
   set_flush_bytes(10);
   journal::ObjectRecorderPtr object = create_object(oid, 24);
 
-  journal::AppendBuffer append_buffer1 = create_append_buffer(metadata,
-                                                              234, 123,
+  journal::AppendBuffer append_buffer1 = create_append_buffer(234, 123,
                                                               "payload");
   journal::AppendBuffers append_buffers;
   append_buffers = {append_buffer1};
   ASSERT_FALSE(object->append(append_buffers));
   ASSERT_EQ(1U, object->get_pending_appends());
 
-  journal::AppendBuffer append_buffer2 = create_append_buffer(metadata,
-                                                              234, 124,
+  journal::AppendBuffer append_buffer2 = create_append_buffer(234, 124,
                                                               "payload");
   append_buffers = {append_buffer2};
   ASSERT_FALSE(object->append(append_buffers));
@@ -189,15 +182,13 @@ TEST_F(TestObjectRecorder, AppendFlushByAge) {
   set_flush_age(0.1);
   journal::ObjectRecorderPtr object = create_object(oid, 24);
 
-  journal::AppendBuffer append_buffer1 = create_append_buffer(metadata,
-                                                              234, 123,
+  journal::AppendBuffer append_buffer1 = create_append_buffer(234, 123,
                                                               "payload");
   journal::AppendBuffers append_buffers;
   append_buffers = {append_buffer1};
   ASSERT_FALSE(object->append(append_buffers));
 
-  journal::AppendBuffer append_buffer2 = create_append_buffer(metadata,
-                                                              234, 124,
+  journal::AppendBuffer append_buffer2 = create_append_buffer(234, 124,
                                                               "payload");
   append_buffers = {append_buffer2};
   ASSERT_FALSE(object->append(append_buffers));
@@ -218,15 +209,13 @@ TEST_F(TestObjectRecorder, AppendFilledObject) {
   journal::ObjectRecorderPtr object = create_object(oid, 12);
 
   std::string payload(2048, '1');
-  journal::AppendBuffer append_buffer1 = create_append_buffer(metadata,
-                                                              234, 123,
+  journal::AppendBuffer append_buffer1 = create_append_buffer(234, 123,
                                                               payload);
   journal::AppendBuffers append_buffers;
   append_buffers = {append_buffer1};
   ASSERT_FALSE(object->append(append_buffers));
 
-  journal::AppendBuffer append_buffer2 = create_append_buffer(metadata,
-                                                              234, 124,
+  journal::AppendBuffer append_buffer2 = create_append_buffer(234, 124,
                                                               payload);
   append_buffers = {append_buffer2};
   ASSERT_TRUE(object->append(append_buffers));
@@ -246,8 +235,7 @@ TEST_F(TestObjectRecorder, Flush) {
 
   journal::ObjectRecorderPtr object = create_object(oid, 24);
 
-  journal::AppendBuffer append_buffer1 = create_append_buffer(metadata,
-                                                              234, 123,
+  journal::AppendBuffer append_buffer1 = create_append_buffer(234, 123,
                                                               "payload");
   journal::AppendBuffers append_buffers;
   append_buffers = {append_buffer1};
@@ -273,8 +261,7 @@ TEST_F(TestObjectRecorder, FlushFuture) {
 
   journal::ObjectRecorderPtr object = create_object(oid, 24);
 
-  journal::AppendBuffer append_buffer = create_append_buffer(metadata,
-                                                             234, 123,
+  journal::AppendBuffer append_buffer = create_append_buffer(234, 123,
                                                              "payload");
   journal::AppendBuffers append_buffers;
   append_buffers = {append_buffer};
@@ -298,8 +285,7 @@ TEST_F(TestObjectRecorder, FlushDetachedFuture) {
 
   journal::ObjectRecorderPtr object = create_object(oid, 24);
 
-  journal::AppendBuffer append_buffer = create_append_buffer(metadata,
-                                                             234, 123,
+  journal::AppendBuffer append_buffer = create_append_buffer(234, 123,
                                                              "payload");
 
   journal::AppendBuffers append_buffers;
@@ -326,11 +312,9 @@ TEST_F(TestObjectRecorder, Overflow) {
   journal::ObjectRecorderPtr object2 = create_object(oid, 12);
 
   std::string payload(2048, '1');
-  journal::AppendBuffer append_buffer1 = create_append_buffer(metadata,
-                                                              234, 123,
+  journal::AppendBuffer append_buffer1 = create_append_buffer(234, 123,
                                                               payload);
-  journal::AppendBuffer append_buffer2 = create_append_buffer(metadata,
-                                                              234, 124,
+  journal::AppendBuffer append_buffer2 = create_append_buffer(234, 124,
                                                               payload);
   journal::AppendBuffers append_buffers;
   append_buffers = {append_buffer1, append_buffer2};
@@ -341,8 +325,7 @@ TEST_F(TestObjectRecorder, Overflow) {
   ASSERT_EQ(0, cond.wait());
   ASSERT_EQ(0U, object1->get_pending_appends());
 
-  journal::AppendBuffer append_buffer3 = create_append_buffer(metadata,
-                                                              456, 123,
+  journal::AppendBuffer append_buffer3 = create_append_buffer(456, 123,
                                                               payload);
   append_buffers = {append_buffer3};
 
diff --git a/src/test/librados/misc.cc b/src/test/librados/misc.cc
index ed19e00..7d89b21 100644
--- a/src/test/librados/misc.cc
+++ b/src/test/librados/misc.cc
@@ -95,6 +95,88 @@ TEST_F(LibRadosMiscPP, LongNamePP) {
   ASSERT_EQ(-ENAMETOOLONG, ioctx.write(string(maxlen*2, 'a').c_str(), bl, bl.length(), 0));
 }
 
+TEST_F(LibRadosMiscPP, LongLocatorPP) {  // writes object "a" under locator keys of varying length
+  bufferlist bl;
+  bl.append("content");
+  int maxlen = g_conf->osd_max_object_name_len;  // length limit the locator key is checked against here
+  ioctx.locator_set_key(  // key of maxlen/2 chars: write expected to return 0
+    string((maxlen/2), 'a'));
+  ASSERT_EQ(
+    0,
+    ioctx.write(
+      string("a").c_str(),
+      bl, bl.length(), 0));
+  ioctx.locator_set_key(  // key of maxlen-1 chars: write expected to return 0
+    string(maxlen - 1, 'a'));
+  ASSERT_EQ(
+    0,
+    ioctx.write(
+      string("a").c_str(),
+      bl, bl.length(), 0));
+  ioctx.locator_set_key(  // key of exactly maxlen chars: still expected to return 0
+    string(maxlen, 'a'));
+  ASSERT_EQ(
+    0,
+    ioctx.write(
+      string("a").c_str(),
+      bl, bl.length(), 0));
+  ioctx.locator_set_key(  // key of maxlen+1 chars: write expected to fail with -ENAMETOOLONG
+    string(maxlen+1, 'a'));
+  ASSERT_EQ(
+    -ENAMETOOLONG,
+    ioctx.write(
+      string("a").c_str(),
+      bl, bl.length(), 0));
+  ioctx.locator_set_key(  // key of 2*maxlen chars: write expected to fail with -ENAMETOOLONG
+    string((maxlen*2), 'a'));
+  ASSERT_EQ(
+    -ENAMETOOLONG,
+    ioctx.write(
+      string("a").c_str(),
+      bl, bl.length(), 0));
+}
+
+TEST_F(LibRadosMiscPP, LongNSpacePP) {  // writes object "a" under namespaces of varying length
+  bufferlist bl;
+  bl.append("content");
+  int maxlen = g_conf->osd_max_object_namespace_len;  // length limit the namespace is checked against here
+  ioctx.set_namespace(  // namespace of maxlen/2 chars: write expected to return 0
+    string((maxlen/2), 'a'));
+  ASSERT_EQ(
+    0,
+    ioctx.write(
+      string("a").c_str(),
+      bl, bl.length(), 0));
+  ioctx.set_namespace(  // namespace of maxlen-1 chars: write expected to return 0
+    string(maxlen - 1, 'a'));
+  ASSERT_EQ(
+    0,
+    ioctx.write(
+      string("a").c_str(),
+      bl, bl.length(), 0));
+  ioctx.set_namespace(  // namespace of exactly maxlen chars: still expected to return 0
+    string(maxlen, 'a'));
+  ASSERT_EQ(
+    0,
+    ioctx.write(
+      string("a").c_str(),
+      bl, bl.length(), 0));
+  ioctx.set_namespace(  // namespace of maxlen+1 chars: write expected to fail with -ENAMETOOLONG
+    string(maxlen+1, 'a'));
+  ASSERT_EQ(
+    -ENAMETOOLONG,
+    ioctx.write(
+      string("a").c_str(),
+      bl, bl.length(), 0));
+  ioctx.set_namespace(  // namespace of 2*maxlen chars: write expected to fail with -ENAMETOOLONG
+    string((maxlen*2), 'a'));
+  ASSERT_EQ(
+    -ENAMETOOLONG,
+    ioctx.write(
+      string("a").c_str(),
+      bl, bl.length(), 0));
+}
+
 TEST_F(LibRadosMiscPP, LongAttrNamePP) {
   bufferlist bl;
   bl.append("content");
@@ -734,15 +816,29 @@ std::string LibRadosTwoPoolsECPP::src_pool_name;
 
 //copy_from between ecpool and no-ecpool.
 TEST_F(LibRadosTwoPoolsECPP, CopyFrom) {
-  //create object w/ omapheader
+  bufferlist z;
+  z.append_zero(4194304*2);
   bufferlist b;
   b.append("copyfrom");
-  ASSERT_EQ(0, src_ioctx.omap_set_header("foo", b));
 
-  version_t uv = src_ioctx.get_last_version();
-  ObjectWriteOperation op;
-  op.copy_from("foo", src_ioctx, uv);
-  ASSERT_EQ(-EOPNOTSUPP, ioctx.operate("foo.copy", &op));
+  // create big object w/ omapheader
+  {
+    ASSERT_EQ(0, src_ioctx.write_full("foo", z));
+    ASSERT_EQ(0, src_ioctx.omap_set_header("foo", b));
+    version_t uv = src_ioctx.get_last_version();
+    ObjectWriteOperation op;
+    op.copy_from("foo", src_ioctx, uv);
+    ASSERT_EQ(-EOPNOTSUPP, ioctx.operate("foo.copy", &op));
+  }
+
+  // same with small object
+  {
+    ASSERT_EQ(0, src_ioctx.omap_set_header("bar", b));
+    version_t uv = src_ioctx.get_last_version();
+    ObjectWriteOperation op;
+    op.copy_from("bar", src_ioctx, uv);
+    ASSERT_EQ(-EOPNOTSUPP, ioctx.operate("bar.copy", &op));
+  }
 }
 
 TEST_F(LibRadosMiscPP, CopyScrubPP) {
diff --git a/src/test/librbd/test_mock_Journal.cc b/src/test/librbd/test_mock_Journal.cc
index 881ac16..9ebea8f 100644
--- a/src/test/librbd/test_mock_Journal.cc
+++ b/src/test/librbd/test_mock_Journal.cc
@@ -944,6 +944,7 @@ TEST_F(TestMockJournal, EventAndIOCommitOrder) {
 
   // commit journal event followed by IO event (standard)
   on_journal_safe1->complete(0);
+  ictx->op_work_queue->drain();
   expect_future_committed(mock_journaler);
   mock_journal.commit_io_event(1U, 0);
 
@@ -954,6 +955,7 @@ TEST_F(TestMockJournal, EventAndIOCommitOrder) {
   C_SaferCond event_ctx;
   mock_journal.wait_event(2U, &event_ctx);
   on_journal_safe2->complete(0);
+  ictx->op_work_queue->drain();
   ASSERT_EQ(0, event_ctx.wait());
 }
 
@@ -1054,6 +1056,7 @@ TEST_F(TestMockJournal, IOCommitError) {
 
   // failed IO remains uncommitted in journal
   on_journal_safe->complete(0);
+  ictx->op_work_queue->drain();
   mock_journal.commit_io_event(1U, -EINVAL);
 }
 
diff --git a/src/test/librgw_file_nfsns.cc b/src/test/librgw_file_nfsns.cc
index b838cb1..7f52c80 100644
--- a/src/test/librgw_file_nfsns.cc
+++ b/src/test/librgw_file_nfsns.cc
@@ -331,6 +331,14 @@ TEST(LibRGW, SETUP_DIRS1) {
 	    ASSERT_EQ(rc, 0);
 	    sf.sync();
 	    ASSERT_TRUE(sf.rgw_fh->is_file());
+
+	    /* because we made it the hard way, fixup attributes */
+	    struct stat st;	/* only uid, gid and mode are filled in below */
+	    st.st_uid = owner_uid;
+	    st.st_gid = owner_gid;
+	    st.st_mode = 0644;	/* octal rw-r--r--; decimal 644 would be 01204 */
+	    sf.rgw_fh->create_stat(&st, create_mask);
+
 	    /* open handle */
 	    rc = rgw_open(fs, sf.fh, 0 /* flags */);
 	    ASSERT_EQ(rc, 0);
@@ -523,6 +531,9 @@ TEST(LibRGW, GETATTR_DIRS1)
 	  ASSERT_TRUE(sobj.rgw_fh->is_dir());
 	  ASSERT_TRUE(S_ISDIR(st.st_mode));
 	}
+	/* validate Unix owners */
+	ASSERT_EQ(st.st_uid, owner_uid);
+	ASSERT_EQ(st.st_gid, owner_gid);
 	if (verbose) {
 	  obj_rec_st rec_st{sobj, st};
 	  std::cout << "\t"
@@ -557,6 +568,10 @@ TEST(LibRGW, READ_DIRS1)
 	  ASSERT_EQ(sobj.rgw_fh->get_size(), 16UL);
 	  // do it
 	  memset(buf, 0, 256);
+	  if (verbose) {
+	    std::cout << "reading 0,256 " << sobj.rgw_fh->relative_object_name()
+		      << std::endl;
+	  }
 	  rc = rgw_read(fs, sobj.fh, 0, 256, &nread, buf, RGW_READ_FLAG_NONE);
 	  ASSERT_EQ(rc, 0);
 	  if (verbose) {
diff --git a/src/test/objectstore/chain_xattr.cc b/src/test/objectstore/chain_xattr.cc
index 5080321..d3b4b1e 100644
--- a/src/test/objectstore/chain_xattr.cc
+++ b/src/test/objectstore/chain_xattr.cc
@@ -259,6 +259,120 @@ TEST(chain_xattr, listxattr) {
   ::unlink(file);
 }
 
+list<string> get_xattrs(int fd)  // names of the xattrs listed for fd; empty list on error
+{
+  char raw[1024];
+  const int total = sys_flistxattr(fd, raw, sizeof(raw));
+  list<string> names;
+  if (total < 0)
+    return names;
+  // walk the returned bytes one NUL-terminated name at a time
+  for (int off = 0; off < total; ) {
+    const char *entry = raw + off;
+    const size_t entry_len = strlen(entry);
+    names.push_back(string(entry, entry_len));
+    assert(total - off >= (int)(entry_len + 1));
+    off += entry_len + 1;
+  }
+  return names;
+}
+
+list<string> get_xattrs(string fn)  // path variant: open read-only, list through the fd, close
+{
+  int handle = ::open(fn.c_str(), O_RDONLY);
+  if (handle < 0)
+    return {};  // file could not be opened: report no attributes
+  list<string> names = get_xattrs(handle);
+  ::close(handle);
+  return names;
+}
+
+TEST(chain_xattr, fskip_chain_cleanup_and_ensure_single_attr)  // fd-based chain_f{set,get}xattr variant
+{
+  const char *name = "user.foo";
+  const char *file = FILENAME;
+  ::unlink(file);
+  int fd = ::open(file, O_CREAT|O_RDWR|O_TRUNC, 0700);
+
+  char buf[800];
+  memset(buf, 0x1F, sizeof(buf));  // memset(dst, value, count): fill the whole payload
+  // set with the default template flags: the value ends up in more than one xattr
+  {
+    int r = chain_fsetxattr(fd, name, buf, sizeof(buf));
+    ASSERT_EQ(r, sizeof(buf));
+    ASSERT_GT(get_xattrs(fd).size(), 1);
+  }
+
+  // verify the chunked value reads back unchanged
+  {
+    char buf2[sizeof(buf)*2];
+    int r = chain_fgetxattr(fd, name, buf2, sizeof(buf2));
+    ASSERT_EQ(r, sizeof(buf));
+    ASSERT_EQ(memcmp(buf, buf2, sizeof(buf)), 0);
+  }
+
+  // overwrite with <false, true>: afterwards exactly one xattr must remain
+  {
+    int r = chain_fsetxattr<false, true>(fd, name, buf, sizeof(buf));
+    ASSERT_EQ(r, sizeof (buf));
+    ASSERT_EQ(get_xattrs(fd).size(), 1);
+  }
+
+  // verify the single-attr value reads back unchanged
+  {
+    char buf2[sizeof(buf)*2];
+    int r = chain_fgetxattr(fd, name, buf2, sizeof(buf2));
+    ASSERT_EQ(r, sizeof(buf));
+    ASSERT_EQ(memcmp(buf, buf2, sizeof(buf)), 0);
+  }
+
+  ::close(fd);
+  ::unlink(file);
+}
+
+TEST(chain_xattr, skip_chain_cleanup_and_ensure_single_attr)  // path-based chain_{set,get}xattr variant
+{
+  const char *name = "user.foo";
+  const char *file = FILENAME;
+  ::unlink(file);
+  int fd = ::open(file, O_CREAT|O_RDWR|O_TRUNC, 0700);
+  ::close(fd);
+
+  char buf[3000];
+  memset(buf, 0x1F, sizeof(buf));  // memset(dst, value, count): fill the whole payload
+  // set with the default template flags: the value ends up in more than one xattr
+  {
+    int r = chain_setxattr(file, name, buf, sizeof(buf));
+    ASSERT_EQ(r, sizeof(buf));
+    ASSERT_GT(get_xattrs(file).size(), 1);
+  }
+
+  // verify the chunked value reads back unchanged
+  {
+    char buf2[sizeof(buf)*2];
+    int r = chain_getxattr(file, name, buf2, sizeof(buf2));
+    ASSERT_EQ(r, sizeof(buf));
+    ASSERT_EQ(memcmp(buf, buf2, sizeof(buf)), 0);
+  }
+
+  // overwrite with <false, true>: afterwards exactly one xattr must remain
+  {
+    int r = chain_setxattr<false, true>(file, name, buf, sizeof(buf));
+    ASSERT_EQ(r, sizeof (buf));
+    ASSERT_EQ(get_xattrs(file).size(), 1);
+  }
+
+  // verify the single-attr value reads back unchanged
+  {
+    char buf2[sizeof(buf)*2];
+    int r = chain_getxattr(file, name, buf2, sizeof(buf2));
+    ASSERT_EQ(r, sizeof(buf));
+    ASSERT_EQ(memcmp(buf, buf2, sizeof(buf)), 0);
+  }
+
+  ::unlink(file);
+}
+
 int main(int argc, char **argv) {
   vector<const char*> args;
   argv_to_vec(argc, (const char **)argv, args);
diff --git a/src/test/objectstore/test_bluefs.cc b/src/test/objectstore/test_bluefs.cc
index b1f9013..41bcf99 100644
--- a/src/test/objectstore/test_bluefs.cc
+++ b/src/test/objectstore/test_bluefs.cc
@@ -38,8 +38,8 @@ TEST(BlueFS, mkfs) {
   string fn = get_temp_bdev(size);
   uuid_d fsid;
   BlueFS fs;
-  fs.add_block_device(0, fn);
-  fs.add_block_extent(0, 1048576, size - 1048576);
+  fs.add_block_device(BlueFS::BDEV_DB, fn);
+  fs.add_block_extent(BlueFS::BDEV_DB, 1048576, size - 1048576);
   fs.mkfs(fsid);
   rm_temp_bdev(fn);
 }
@@ -48,13 +48,13 @@ TEST(BlueFS, mkfs_mount) {
   uint64_t size = 1048476 * 128;
   string fn = get_temp_bdev(size);
   BlueFS fs;
-  ASSERT_EQ(0, fs.add_block_device(0, fn));
-  fs.add_block_extent(0, 1048576, size - 1048576);
+  ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, fn));
+  fs.add_block_extent(BlueFS::BDEV_DB, 1048576, size - 1048576);
   uuid_d fsid;
   ASSERT_EQ(0, fs.mkfs(fsid));
   ASSERT_EQ(0, fs.mount());
-  ASSERT_EQ(fs.get_total(0), size - 1048576);
-  ASSERT_LT(fs.get_free(0), size - 1048576);
+  ASSERT_EQ(fs.get_total(BlueFS::BDEV_DB), size - 1048576);
+  ASSERT_LT(fs.get_free(BlueFS::BDEV_DB), size - 1048576);
   fs.umount();
   rm_temp_bdev(fn);
 }
@@ -63,8 +63,8 @@ TEST(BlueFS, write_read) {
   uint64_t size = 1048476 * 128;
   string fn = get_temp_bdev(size);
   BlueFS fs;
-  ASSERT_EQ(0, fs.add_block_device(0, fn));
-  fs.add_block_extent(0, 1048576, size - 1048576);
+  ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, fn));
+  fs.add_block_extent(BlueFS::BDEV_DB, 1048576, size - 1048576);
   uuid_d fsid;
   ASSERT_EQ(0, fs.mkfs(fsid));
   ASSERT_EQ(0, fs.mount());
@@ -99,8 +99,8 @@ TEST(BlueFS, small_appends) {
   uint64_t size = 1048476 * 128;
   string fn = get_temp_bdev(size);
   BlueFS fs;
-  ASSERT_EQ(0, fs.add_block_device(0, fn));
-  fs.add_block_extent(0, 1048576, size - 1048576);
+  ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, fn));
+  fs.add_block_extent(BlueFS::BDEV_DB, 1048576, size - 1048576);
   uuid_d fsid;
   ASSERT_EQ(0, fs.mkfs(fsid));
   ASSERT_EQ(0, fs.mount());
diff --git a/src/test/os/TestLFNIndex.cc b/src/test/os/TestLFNIndex.cc
index ad4cb75..1ff2e4d 100644
--- a/src/test/os/TestLFNIndex.cc
+++ b/src/test/os/TestLFNIndex.cc
@@ -49,7 +49,7 @@ public:
     const std::string mangled_name = lfn_generate_object_name(hoid);
     EXPECT_EQ(mangled_expected, mangled_name);
     ghobject_t hoid_parsed;
-    EXPECT_TRUE(lfn_parse_object_name(mangled_name, &hoid_parsed));
+    EXPECT_EQ(0, lfn_parse_object_name(mangled_name, &hoid_parsed));
     EXPECT_EQ(hoid, hoid_parsed);
   }
 
diff --git a/src/test/pybind/test_ceph_argparse.py b/src/test/pybind/test_ceph_argparse.py
index a649731..4bf6c27 100755
--- a/src/test/pybind/test_ceph_argparse.py
+++ b/src/test/pybind/test_ceph_argparse.py
@@ -491,6 +491,11 @@ class TestFS(TestArgparse):
         self.assert_valid_command(['fs', 'ls'])
         assert_equal({}, validate_command(sigdict, ['fs', 'ls', 'toomany']))
 
+    def test_fs_set_default(self):  # 'fs set_default' with one, zero and two trailing arguments
+        self.assert_valid_command(['fs', 'set_default', 'cephfs'])  # one fs name: valid
+        assert_equal({}, validate_command(sigdict, ['fs', 'set_default']))  # no name: expects {}
+        assert_equal({}, validate_command(sigdict, ['fs', 'set_default', 'cephfs', 'toomany']))  # extra arg: expects {}
+
 class TestMon(TestArgparse):
 
     def test_dump(self):
diff --git a/src/test/rbd_mirror/test_ClusterWatcher.cc b/src/test/rbd_mirror/test_ClusterWatcher.cc
index 1629a16..2d7d3f2 100644
--- a/src/test/rbd_mirror/test_ClusterWatcher.cc
+++ b/src/test/rbd_mirror/test_ClusterWatcher.cc
@@ -45,7 +45,8 @@ public:
   void create_pool(bool enable_mirroring, const peer_t &peer,
                    string *uuid = nullptr, string *name=nullptr) {
     string pool_name = get_temp_pool_name("test-rbd-mirror-");
-    ASSERT_EQ("", create_one_pool_pp(pool_name, *m_cluster));
+    ASSERT_EQ(0, m_cluster->pool_create(pool_name.c_str()));
+
     int64_t pool_id = m_cluster->pool_lookup(pool_name.c_str());
     ASSERT_GE(pool_id, 0);
     m_pools.insert(pool_name);
@@ -84,7 +85,8 @@ public:
   void create_cache_pool(const string &base_pool, string *cache_pool_name) {
     bufferlist inbl;
     *cache_pool_name = get_temp_pool_name("test-rbd-mirror-");
-    ASSERT_EQ("", create_one_pool_pp(*cache_pool_name, *m_cluster));
+    ASSERT_EQ(0, m_cluster->pool_create(cache_pool_name->c_str()));
+
     ASSERT_EQ(0, m_cluster->mon_command(
       "{\"prefix\": \"osd tier add\", \"pool\": \"" + base_pool +
       "\", \"tierpool\": \"" + *cache_pool_name +
diff --git a/src/test/rbd_mirror/test_ImageReplayer.cc b/src/test/rbd_mirror/test_ImageReplayer.cc
index 18f1441..84e4afc 100644
--- a/src/test/rbd_mirror/test_ImageReplayer.cc
+++ b/src/test/rbd_mirror/test_ImageReplayer.cc
@@ -76,14 +76,14 @@ public:
     EXPECT_EQ("", connect_cluster_pp(m_local_cluster));
 
     m_local_pool_name = get_temp_pool_name();
-    EXPECT_EQ("", create_one_pool_pp(m_local_pool_name, m_local_cluster));
+    EXPECT_EQ(0, m_local_cluster.pool_create(m_local_pool_name.c_str()));
     EXPECT_EQ(0, m_local_cluster.ioctx_create(m_local_pool_name.c_str(),
 					      m_local_ioctx));
 
     EXPECT_EQ("", connect_cluster_pp(m_remote_cluster));
 
     m_remote_pool_name = get_temp_pool_name();
-    EXPECT_EQ("", create_one_pool_pp(m_remote_pool_name, m_remote_cluster));
+    EXPECT_EQ(0, m_remote_cluster.pool_create(m_remote_pool_name.c_str()));
     m_remote_pool_id = m_remote_cluster.pool_lookup(m_remote_pool_name.c_str());
     EXPECT_GE(m_remote_pool_id, 0);
 
@@ -104,6 +104,13 @@ public:
 
   ~TestImageReplayer()
   {
+    if (m_watch_handle != 0) {
+      m_remote_ioctx.unwatch2(m_watch_handle);
+      delete m_watch_ctx;
+      m_watch_ctx = nullptr;
+      m_watch_handle = 0;
+    }
+
     delete m_replayer;
     delete m_threads;
 
diff --git a/src/test/rbd_mirror/test_ImageSync.cc b/src/test/rbd_mirror/test_ImageSync.cc
index e9b234e..5e6b0a9 100644
--- a/src/test/rbd_mirror/test_ImageSync.cc
+++ b/src/test/rbd_mirror/test_ImageSync.cc
@@ -58,6 +58,11 @@ public:
     ASSERT_EQ(0, m_remote_journaler->register_client(client_data_bl));
   }
 
+  virtual void TearDown() {  // per-test cleanup for this fixture
+    TestFixture::TearDown();  // base-class teardown runs first
+    delete m_remote_journaler;  // presumably allocated in SetUp (allocation not visible in this hunk)
+  }
+
   void create_and_open(librados::IoCtx &io_ctx, librbd::ImageCtx **image_ctx) {
     librbd::RBD rbd;
     ASSERT_EQ(0, create_image(rbd, io_ctx, m_image_name, m_image_size));
diff --git a/src/test/rbd_mirror/test_PoolWatcher.cc b/src/test/rbd_mirror/test_PoolWatcher.cc
index b8ff311..5d131d3 100644
--- a/src/test/rbd_mirror/test_PoolWatcher.cc
+++ b/src/test/rbd_mirror/test_PoolWatcher.cc
@@ -54,7 +54,8 @@ TestPoolWatcher() : m_lock("TestPoolWatcherLock"),
 
   void create_pool(bool enable_mirroring, const peer_t &peer, string *name=nullptr) {
     string pool_name = get_temp_pool_name("test-rbd-mirror-");
-    ASSERT_EQ("", create_one_pool_pp(pool_name, *m_cluster));
+    ASSERT_EQ(0, m_cluster->pool_create(pool_name.c_str()));
+
     int64_t pool_id = m_cluster->pool_lookup(pool_name.c_str());
     ASSERT_GE(pool_id, 0);
     m_pools.insert(pool_name);
@@ -83,7 +84,8 @@ TestPoolWatcher() : m_lock("TestPoolWatcherLock"),
   void create_cache_pool(const string &base_pool, string *cache_pool_name) {
     bufferlist inbl;
     *cache_pool_name = get_temp_pool_name("test-rbd-mirror-");
-    ASSERT_EQ("", create_one_pool_pp(*cache_pool_name, *m_cluster));
+    ASSERT_EQ(0, m_cluster->pool_create(cache_pool_name->c_str()));
+
     ASSERT_EQ(0, m_cluster->mon_command(
       "{\"prefix\": \"osd tier add\", \"pool\": \"" + base_pool +
       "\", \"tierpool\": \"" + *cache_pool_name +
diff --git a/src/test/rbd_mirror/test_fixture.cc b/src/test/rbd_mirror/test_fixture.cc
index 56620db..34981ea 100644
--- a/src/test/rbd_mirror/test_fixture.cc
+++ b/src/test/rbd_mirror/test_fixture.cc
@@ -22,11 +22,13 @@ TestFixture::TestFixture() {
 }
 
 void TestFixture::SetUpTestCase() {
+  ASSERT_EQ("", connect_cluster_pp(_rados));
+
   _local_pool_name = get_temp_pool_name("test-rbd-mirror-");
-  ASSERT_EQ("", create_one_pool_pp(_local_pool_name, _rados));
+  ASSERT_EQ(0, _rados.pool_create(_local_pool_name.c_str()));
 
   _remote_pool_name = get_temp_pool_name("test-rbd-mirror-");
-  ASSERT_EQ("", create_one_pool_pp(_remote_pool_name, _rados));
+  ASSERT_EQ(0, _rados.pool_create(_remote_pool_name.c_str()));
 }
 
 void TestFixture::TearDownTestCase() {
diff --git a/src/tools/Makefile-client.am b/src/tools/Makefile-client.am
index 509cba3..0b81549 100644
--- a/src/tools/Makefile-client.am
+++ b/src/tools/Makefile-client.am
@@ -71,6 +71,7 @@ noinst_HEADERS += \
 	tools/rbd/Utils.h
 rbd_LDADD = \
 	libjournal.la libcls_journal_client.la \
+	libcls_rbd_client.la libcls_lock_client.la \
 	$(LIBRBD) $(LIBRBD_TYPES) $(LIBRADOS) $(CEPH_GLOBAL) \
 	$(BOOST_REGEX_LIBS) $(BOOST_PROGRAM_OPTIONS_LIBS)
 if LINUX
diff --git a/src/tools/ceph_authtool.cc b/src/tools/ceph_authtool.cc
index 469c9f3..a213391 100644
--- a/src/tools/ceph_authtool.cc
+++ b/src/tools/ceph_authtool.cc
@@ -37,13 +37,14 @@ void usage()
        << "                                specified entityname\n"
        << "  --gen-print-key               will generate a new secret key without set it\n"
        << "                                to the keyringfile, prints the secret to stdout\n"
-       << "  --import-keyring              will import the content of a given keyring\n"
+       << "  --import-keyring FILE         will import the content of a given keyring\n"
        << "                                into the keyringfile\n"
-       << "  -u, --set-uid                 sets the auid (authenticated user id) for the\n"
+       << "  -n NAME, --name NAME          specify entityname to operate on\n"
+       << "  -u AUID, --set-uid AUID       sets the auid (authenticated user id) for the\n"
        << "                                specified entityname\n"
-       << "  -a, --add-key                 will add an encoded key to the keyring\n"
-       << "  --cap subsystem capability    will set the capability for given subsystem\n"
-       << "  --caps capsfile               will set all of capabilities associated with a\n"
+       << "  -a BASE64, --add-key BASE64   will add an encoded key to the keyring\n"
+       << "  --cap SUBSYSTEM CAPABILITY    will set the capability for given subsystem\n"
+       << "  --caps CAPSFILE               will set all of capabilities associated with a\n"
        << "                                given key, for all subsystems"
        << std::endl;
   exit(1);
@@ -73,6 +74,9 @@ int main(int argc, const char **argv)
   bool set_auid = false;
   std::vector<const char*>::iterator i;
 
+  /* Handle options unique to ceph-authtool
+   * -n NAME, --name NAME is handled by global_init
+   * */
   for (i = args.begin(); i != args.end(); ) {
     std::string val;
     if (ceph_argparse_double_dash(args, i)) {
@@ -117,6 +121,7 @@ int main(int argc, const char **argv)
       usage();
     }
   }
+
   if (fn.empty() && !gen_print_key) {
     cerr << argv[0] << ": must specify filename" << std::endl;
     usage();
diff --git a/src/tools/rbd/action/Journal.cc b/src/tools/rbd/action/Journal.cc
index 2995684..e82265c 100644
--- a/src/tools/rbd/action/Journal.cc
+++ b/src/tools/rbd/action/Journal.cc
@@ -13,14 +13,13 @@
 #include <fstream>
 #include <sstream>
 #include <boost/program_options.hpp>
-
+#include "cls/rbd/cls_rbd_client.h"
 #include "cls/journal/cls_journal_types.h"
 #include "cls/journal/cls_journal_client.h"
 
 #include "journal/Journaler.h"
 #include "journal/ReplayEntry.h"
 #include "journal/ReplayHandler.h"
-//#include "librbd/Journal.h" // XXXMG: for librbd::Journal::reset()
 #include "librbd/journal/Types.h"
 
 namespace rbd {
@@ -134,40 +133,35 @@ static int do_show_journal_status(librados::IoCtx& io_ctx,
 static int do_reset_journal(librados::IoCtx& io_ctx,
 			    const std::string& journal_id)
 {
-  // XXXMG: does not work due to a linking issue
-  //return librbd::Journal::reset(io_ctx, journal_id);
-
-  ::journal::Journaler journaler(io_ctx, journal_id, "", 5);
-
-  C_SaferCond cond;
-  journaler.init(&cond);
-
-  int r = cond.wait();
+  // disable/re-enable journaling to delete/re-create the journal
+  // to properly handle mirroring constraints
+  std::string image_name;
+  int r = librbd::cls_client::dir_get_name(&io_ctx, RBD_DIRECTORY, journal_id,
+                                           &image_name);
   if (r < 0) {
-    std::cerr << "failed to initialize journal: " << cpp_strerror(r)
-	      << std::endl;
+    std::cerr << "failed to locate journal's image: " << cpp_strerror(r)
+              << std::endl;
     return r;
   }
 
-  uint8_t order, splay_width;
-  int64_t pool_id;
-  journaler.get_metadata(&order, &splay_width, &pool_id);
-
-  r = journaler.remove(true);
+  librbd::Image image;
+  r = utils::open_image(io_ctx, image_name, false, &image);
   if (r < 0) {
-    std::cerr << "failed to reset journal: " << cpp_strerror(r) << std::endl;
+    std::cerr << "failed to open image: " << cpp_strerror(r) << std::endl;
     return r;
   }
-  r = journaler.create(order, splay_width, pool_id);
+
+  r = image.update_features(RBD_FEATURE_JOURNALING, false);
   if (r < 0) {
-    std::cerr << "failed to create journal: " << cpp_strerror(r) << std::endl;
+    std::cerr << "failed to disable image journaling: " << cpp_strerror(r)
+              << std::endl;
     return r;
   }
 
-  // TODO register with librbd payload
-  r = journaler.register_client(bufferlist());
+  r = image.update_features(RBD_FEATURE_JOURNALING, true);
   if (r < 0) {
-    std::cerr << "failed to register client: " << cpp_strerror(r) << std::endl;
+    std::cerr << "failed to re-enable image journaling: " << cpp_strerror(r)
+              << std::endl;
     return r;
   }
   return 0;
diff --git a/src/tools/rbd_mirror/Replayer.cc b/src/tools/rbd_mirror/Replayer.cc
index ce76f99..d73bc10 100644
--- a/src/tools/rbd_mirror/Replayer.cc
+++ b/src/tools/rbd_mirror/Replayer.cc
@@ -5,6 +5,9 @@
 
 #include "common/Formatter.h"
 #include "common/admin_socket.h"
+#include "common/ceph_argparse.h"
+#include "common/code_environment.h"
+#include "common/common_init.h"
 #include "common/debug.h"
 #include "common/errno.h"
 #include "include/stringify.h"
@@ -143,37 +146,60 @@ int Replayer::init()
 {
   dout(20) << "replaying for " << m_peer << dendl;
 
-  int r = m_remote->init2(m_peer.client_name.c_str(),
-			  m_peer.cluster_name.c_str(), 0);
-  if (r < 0) {
-    derr << "error initializing remote cluster handle for " << m_peer
-	 << " : " << cpp_strerror(r) << dendl;
-    return r;
+  // NOTE: manually bootstrap a CephContext here instead of via
+  // the librados API to avoid mixing global singletons between
+  // the librados shared library and the daemon
+  // TODO: eliminate intermingling of global singletons within Ceph APIs
+  CephInitParameters iparams(CEPH_ENTITY_TYPE_CLIENT);
+  if (m_peer.client_name.empty() ||
+      !iparams.name.from_str(m_peer.client_name)) {
+    derr << "error initializing remote cluster handle for " << m_peer << dendl;
+    return -EINVAL;
   }
 
-  r = m_remote->conf_read_file(nullptr);
+  CephContext *cct = common_preinit(iparams, CODE_ENVIRONMENT_LIBRARY,
+                                    CINIT_FLAG_UNPRIVILEGED_DAEMON_DEFAULTS);
+  cct->_conf->cluster = m_peer.cluster_name;
+
+  // librados::Rados::conf_read_file
+  int r = cct->_conf->parse_config_files(nullptr, nullptr, 0);
   if (r < 0) {
-    derr << "could not read ceph conf for " << m_peer
-	 << " : " << cpp_strerror(r) << dendl;
+    derr << "could not read ceph conf for " << m_peer << ": "
+	 << cpp_strerror(r) << dendl;
+    cct->put();
     return r;
   }
+  cct->_conf->parse_env();
 
-  r = m_remote->conf_parse_env(nullptr);
+  // librados::Rados::conf_parse_env
+  std::vector<const char*> args;
+  env_to_vec(args, nullptr);
+  r = cct->_conf->parse_argv(args);
   if (r < 0) {
-    derr << "could not parse environment for " << m_peer
-	 << " : " << cpp_strerror(r) << dendl;
+    derr << "could not parse environment for " << m_peer << ":"
+         << cpp_strerror(r) << dendl;
+    cct->put();
     return r;
   }
 
   if (!m_args.empty()) {
-    r = m_remote->conf_parse_argv(m_args.size(), &m_args[0]);
+    // librados::Rados::conf_parse_argv
+    r = cct->_conf->parse_argv(m_args);
     if (r < 0) {
-      derr << "could not parse command line args for " << m_peer
-	   << " : " << cpp_strerror(r) << dendl;
+      derr << "could not parse command line args for " << m_peer << ": "
+	   << cpp_strerror(r) << dendl;
+      cct->put();
       return r;
     }
   }
 
+  cct->_conf->apply_changes(nullptr);
+  cct->_conf->complain_about_parse_errors(cct);
+
+  r = m_remote->init_with_context(cct);
+  assert(r == 0);
+  cct->put();
+
   r = m_remote->connect();
   if (r < 0) {
     derr << "error connecting to remote cluster " << m_peer
diff --git a/src/vstart.sh b/src/vstart.sh
index c81a793..9d225ef 100755
--- a/src/vstart.sh
+++ b/src/vstart.sh
@@ -625,7 +625,7 @@ fi
 
 if [ "$start_mds" -eq 1 -a "$CEPH_NUM_MDS" -gt 0 ]; then
     if [ "$CEPH_NUM_FS" -gt "1" ] ; then
-        $CEPH_ADM fs flag set enable_multiple true
+        $CEPH_ADM fs flag set enable_multiple true --yes-i-really-mean-it
     fi
 
     fs=0
diff --git a/systemd/ceph-mds@.service b/systemd/ceph-mds@.service
index e122580..f13cef4 100644
--- a/systemd/ceph-mds@.service
+++ b/systemd/ceph-mds@.service
@@ -15,6 +15,7 @@ PrivateDevices=yes
 ProtectHome=true
 ProtectSystem=full
 PrivateTmp=true
+TasksMax=infinity
 
 [Install]
 WantedBy=ceph-mds.target
diff --git a/systemd/ceph-mon@.service b/systemd/ceph-mon@.service
index a8d427b..b9501d6 100644
--- a/systemd/ceph-mon@.service
+++ b/systemd/ceph-mon@.service
@@ -21,6 +21,7 @@ PrivateDevices=yes
 ProtectHome=true
 ProtectSystem=full
 PrivateTmp=true
+TasksMax=infinity
 
 [Install]
 WantedBy=ceph-mon.target
diff --git a/systemd/ceph-osd@.service b/systemd/ceph-osd@.service
index 0d73afb..1778db7 100644
--- a/systemd/ceph-osd@.service
+++ b/systemd/ceph-osd@.service
@@ -15,6 +15,7 @@ ExecReload=/bin/kill -HUP $MAINPID
 ProtectHome=true
 ProtectSystem=full
 PrivateTmp=true
+TasksMax=infinity
 
 [Install]
 WantedBy=ceph-osd.target
diff --git a/systemd/ceph-radosgw@.service b/systemd/ceph-radosgw@.service
index 66d9eb8..cfa5788 100644
--- a/systemd/ceph-radosgw@.service
+++ b/systemd/ceph-radosgw@.service
@@ -14,6 +14,7 @@ PrivateDevices=yes
 ProtectHome=true
 ProtectSystem=full
 PrivateTmp=true
+TasksMax=infinity
 
 [Install]
 WantedBy=ceph-radosgw.target
diff --git a/systemd/ceph-rbd-mirror@.service b/systemd/ceph-rbd-mirror@.service
index 4c2e2f0..d38aec5 100644
--- a/systemd/ceph-rbd-mirror@.service
+++ b/systemd/ceph-rbd-mirror@.service
@@ -17,6 +17,7 @@ PrivateTmp=true
 Restart=on-failure
 StartLimitInterval=30min
 StartLimitBurst=3
+TasksMax=infinity
 
 [Install]
 WantedBy=ceph-rbd-mirror.target

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-ceph/ceph.git



More information about the Pkg-ceph-commits mailing list