[Pkg-ceph-commits] [ceph] 01/01: Imported Upstream version 0.80.11

Gaudenz Steinlin gaudenz at moszumanska.debian.org
Mon Dec 26 20:48:02 UTC 2016


This is an automated email from the git hooks/post-receive script.

gaudenz pushed a commit to annotated tag upstream/0.80.11
in repository ceph.

commit a829670e143162f323f12bd013f1331406831dd0
Author: Gaudenz Steinlin <gaudenz at debian.org>
Date:   Thu Jan 14 22:06:05 2016 +0100

    Imported Upstream version 0.80.11
---
 Makefile.am                                   |   6 +-
 Makefile.in                                   |   6 +-
 ceph.spec                                     |  46 +++---
 ceph.spec.in                                  |  44 +++---
 configure                                     |  22 +--
 configure.ac                                  |   2 +-
 src/.git_version                              |   4 +-
 src/Makefile.in                               |   2 +-
 src/ceph-disk                                 | 106 ++++++++-----
 src/ceph-post-file.in                         |  20 +--
 src/ceph_fuse.cc                              |   4 +-
 src/ceph_syn.cc                               |   5 +-
 src/common/Mutex.cc                           |   1 +
 src/common/RWLock.h                           |   3 +-
 src/common/Thread.cc                          |  12 +-
 src/common/Throttle.cc                        |   6 +
 src/common/Throttle.h                         |   3 +-
 src/common/WorkQueue.h                        |  35 ++++-
 src/common/admin_socket.cc                    |  50 ++++--
 src/common/admin_socket.h                     |   1 +
 src/common/buffer.cc                          |  23 ++-
 src/common/ceph_context.cc                    |   6 +
 src/common/ceph_context.h                     |  19 +++
 src/common/config.cc                          |   2 +-
 src/common/config_opts.h                      |  17 +-
 src/common/sync_filesystem.h                  |  12 +-
 src/crush/CrushWrapper.cc                     |  27 ++++
 src/crush/CrushWrapper.h                      |   1 +
 src/include/interval_set.h                    |   1 +
 src/init-radosgw                              |  16 +-
 src/init-radosgw.sysv                         |  17 +-
 src/json_spirit/json_spirit_reader_template.h |  34 +++-
 src/libcephfs.cc                              |  17 +-
 src/librados/IoCtxImpl.cc                     |   4 +-
 src/librados/RadosClient.cc                   |   8 +-
 src/librados/RadosClient.h                    |   4 +-
 src/librbd/AioCompletion.cc                   |  48 +++++-
 src/librbd/AioCompletion.h                    |  33 +---
 src/librbd/AioRequest.cc                      |  15 +-
 src/librbd/AioRequest.h                       |   6 +-
 src/librbd/ImageCtx.cc                        |  34 +++-
 src/librbd/ImageCtx.h                         |   3 +
 src/librbd/internal.cc                        | 183 +++++++++-------------
 src/librbd/internal.h                         |  17 +-
 src/librbd/librbd.cc                          | 139 +++++++++++++++--
 src/log/Log.cc                                |   2 +
 src/mds/MDSUtility.cc                         |   2 +-
 src/messages/MOSDBoot.h                       |  20 ++-
 src/mon/AuthMonitor.cc                        |   3 +-
 src/mon/Elector.cc                            |   9 +-
 src/mon/MDSMonitor.cc                         |  47 +++---
 src/mon/MDSMonitor.h                          |   6 +-
 src/mon/MonClient.cc                          |  12 +-
 src/mon/MonCommands.h                         |   4 +-
 src/mon/Monitor.cc                            |  69 ++++-----
 src/mon/MonitorDBStore.h                      |  19 ++-
 src/mon/MonitorStore.cc                       |  16 +-
 src/mon/OSDMonitor.cc                         |  70 ++++++++-
 src/mon/OSDMonitor.h                          |  10 +-
 src/mon/PGMonitor.cc                          |   8 +-
 src/mon/PaxosService.cc                       |  10 ++
 src/mon/PaxosService.h                        |   5 +-
 src/msg/Messenger.cc                          |   7 +
 src/msg/Messenger.h                           |  15 ++
 src/os/FileJournal.cc                         |  13 +-
 src/os/FileStore.cc                           |   6 +-
 src/os/WBThrottle.cc                          |   1 +
 src/os/chain_xattr.cc                         |   8 +
 src/osd/ECBackend.cc                          |   2 +-
 src/osd/OSD.cc                                |  61 ++++++--
 src/osd/OSD.h                                 |  36 ++---
 src/osd/OSDMap.cc                             |  14 +-
 src/osd/PG.cc                                 |   6 +-
 src/osd/PGLog.cc                              |  65 +++++---
 src/osd/PGLog.h                               |  19 ++-
 src/osd/ReplicatedPG.cc                       | 100 ++++++++++--
 src/osd/ReplicatedPG.h                        |   6 +-
 src/osd/osd_types.cc                          | 153 ++++++++++++++++--
 src/osd/osd_types.h                           |  44 +++++-
 src/osdc/Objecter.cc                          |  92 +++++++++--
 src/osdc/Objecter.h                           |  80 ++++++++--
 src/rgw/logrotate.conf                        |   2 +-
 src/rgw/rgw_civetweb.cc                       |  11 +-
 src/rgw/rgw_client_io.cc                      |   7 +-
 src/rgw/rgw_main.cc                           |   2 +-
 src/rgw/rgw_op.cc                             |  13 ++
 src/rgw/rgw_rados.cc                          |  35 +++++
 src/rgw/rgw_rados.h                           |  11 ++
 src/rgw/rgw_rest.cc                           |  34 +++-
 src/rgw/rgw_rest.h                            |   8 +-
 src/rgw/rgw_rest_swift.cc                     |  43 ++++--
 src/rgw/rgw_swift.cc                          |   5 +
 src/test/Makefile.am                          |   2 +-
 src/test/bufferlist.cc                        |  11 ++
 src/test/librados/TestCase.cc                 |   9 +-
 src/test/librados/tier.cc                     | 215 +++++++++++++++++++++++++-
 src/test/librbd/test_librbd.cc                |  71 ++++++++-
 src/test/mon/test_mon_workloadgen.cc          |   3 +-
 src/test/objectstore/chain_xattr.cc           |  39 +++++
 src/test/osd/TestPGLog.cc                     |  84 +++++++---
 src/tools/ceph_authtool.cc                    |   2 +-
 src/upstart/ceph-mds.conf                     |   2 +-
 src/upstart/ceph-mon.conf                     |   2 +-
 src/upstart/ceph-osd.conf                     |   2 +-
 104 files changed, 2053 insertions(+), 664 deletions(-)

diff --git a/Makefile.am b/Makefile.am
index cba3af2..7b6960c 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -20,9 +20,9 @@ EXTRA_DIST += \
 # why is it so hard to make autotools to this?
 install-data-local:
 	-mkdir -p $(DESTDIR)$(datadir)/ceph
-	-install -m 644 share/known_hosts_drop.ceph.com $(DESTDIR)$(datadir)/ceph/known_hosts_drop.ceph.com
-	-install -m 644 share/id_dsa_drop.ceph.com $(DESTDIR)$(datadir)/ceph/id_dsa_drop.ceph.com
-	-install -m 644 share/id_dsa_drop.ceph.com.pub $(DESTDIR)$(datadir)/ceph/id_dsa_drop.ceph.com.pub
+	-install -m 600 share/known_hosts_drop.ceph.com $(DESTDIR)$(datadir)/ceph/known_hosts_drop.ceph.com
+	-install -m 600 share/id_dsa_drop.ceph.com $(DESTDIR)$(datadir)/ceph/id_dsa_drop.ceph.com
+	-install -m 600 share/id_dsa_drop.ceph.com.pub $(DESTDIR)$(datadir)/ceph/id_dsa_drop.ceph.com.pub
 
 all-local:
 if WITH_DEBUG
diff --git a/Makefile.in b/Makefile.in
index a1095c0..969724b 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -864,9 +864,9 @@ uninstall-am:
 # why is it so hard to make autotools to this?
 install-data-local:
 	-mkdir -p $(DESTDIR)$(datadir)/ceph
-	-install -m 644 share/known_hosts_drop.ceph.com $(DESTDIR)$(datadir)/ceph/known_hosts_drop.ceph.com
-	-install -m 644 share/id_dsa_drop.ceph.com $(DESTDIR)$(datadir)/ceph/id_dsa_drop.ceph.com
-	-install -m 644 share/id_dsa_drop.ceph.com.pub $(DESTDIR)$(datadir)/ceph/id_dsa_drop.ceph.com.pub
+	-install -m 600 share/known_hosts_drop.ceph.com $(DESTDIR)$(datadir)/ceph/known_hosts_drop.ceph.com
+	-install -m 600 share/id_dsa_drop.ceph.com $(DESTDIR)$(datadir)/ceph/id_dsa_drop.ceph.com
+	-install -m 600 share/id_dsa_drop.ceph.com.pub $(DESTDIR)$(datadir)/ceph/id_dsa_drop.ceph.com.pub
 
 all-local:
 #	We need gtest to build the rados-api tests. We only build those in
diff --git a/ceph.spec b/ceph.spec
index 73fe7a8..772cc8a 100644
--- a/ceph.spec
+++ b/ceph.spec
@@ -5,11 +5,14 @@
 %{!?python_sitearch: %global python_sitearch %(%{__python} -c "from distutils.sysconfig import get_python_lib; print(get_python_lib(1))")}
 %endif
 
+%{!?_udevrulesdir: %global _udevrulesdir /lib/udev/rules.d}
+
 #################################################################################
 # common
 #################################################################################
+
 Name:		ceph
-Version:	0.80.10
+Version:	0.80.11
 Release:	0%{?dist}
 Summary:	User space components of the Ceph file system
 License:	GPL-2.0
@@ -24,7 +27,6 @@ Requires:	librados2 = %{version}-%{release}
 Requires:	libcephfs1 = %{version}-%{release}
 Requires:	ceph-common = %{version}-%{release}
 Requires:	python
-Requires:	python-argparse
 Requires:	python-ceph
 Requires:	python-requests
 Requires:	python-flask
@@ -44,7 +46,6 @@ BuildRequires:	gdbm
 BuildRequires:	pkgconfig
 BuildRequires:	python
 BuildRequires:	python-nose
-BuildRequires:	python-argparse
 BuildRequires:	libaio-devel
 BuildRequires:	libcurl-devel
 BuildRequires:	libxml2-devel
@@ -77,7 +78,6 @@ BuildRequires:	%insserv_prereq
 BuildRequires:	mozilla-nss-devel
 BuildRequires:	keyutils-devel
 BuildRequires:	libatomic-ops-devel
-BuildRequires:	fdupes
 %else
 Requires:	gdisk
 BuildRequires:	nss-devel
@@ -107,6 +107,11 @@ Requires:	librados2 = %{version}-%{release}
 Requires:	python-ceph = %{version}-%{release}
 Requires:	python-requests
 Requires:	redhat-lsb-core
+# python-argparse is only needed in distros with Python 2.6 or lower
+%if (0%{?rhel} && 0%{?rhel} <= 6) || (0%{?suse_version} && 0%{?suse_version} <= 1110)
+Requires:	python-argparse
+BuildRequires:	python-argparse
+%endif
 %description -n ceph-common
 common utilities to mount and interact with a ceph storage cluster
 
@@ -219,9 +224,6 @@ Group:		System Environment/Libraries
 License:	LGPL-2.0
 Requires:	librados2 = %{version}-%{release}
 Requires:	librbd1 = %{version}-%{release}
-%if 0%{defined suse_version}
-%py_requires
-%endif
 %description -n python-ceph
 This package contains Python libraries for interacting with Cephs RADOS
 object storage.
@@ -262,8 +264,13 @@ License:	LGPL-2.0
 Requires:	java
 Requires:	libcephfs_jni1 = %{version}-%{release}
 BuildRequires:	java-devel
+%if 0%{?el6}
 Requires:	junit4
 BuildRequires:	junit4
+%else
+Requires:       junit
+BuildRequires:  junit
+%endif
 %description -n cephfs-java
 This package contains the Java libraries for the Ceph File System.
 
@@ -352,13 +359,8 @@ chmod 0644 $RPM_BUILD_ROOT%{_docdir}/ceph/sample.ceph.conf
 chmod 0644 $RPM_BUILD_ROOT%{_docdir}/ceph/sample.fetch_config
 
 # udev rules
-%if 0%{?rhel} >= 7 || 0%{?fedora}
-install -m 0644 -D udev/50-rbd.rules $RPM_BUILD_ROOT/usr/lib/udev/rules.d/50-rbd.rules
-install -m 0644 -D udev/60-ceph-partuuid-workaround.rules $RPM_BUILD_ROOT/usr/lib/udev/rules.d/60-ceph-partuuid-workaround.rules
-%else
-install -m 0644 -D udev/50-rbd.rules $RPM_BUILD_ROOT/lib/udev/rules.d/50-rbd.rules
-install -m 0644 -D udev/60-ceph-partuuid-workaround.rules $RPM_BUILD_ROOT/lib/udev/rules.d/60-ceph-partuuid-workaround.rules
-%endif
+install -m 0644 -D udev/50-rbd.rules $RPM_BUILD_ROOT%{_udevrulesdir}/50-rbd.rules
+install -m 0644 -D udev/60-ceph-partuuid-workaround.rules $RPM_BUILD_ROOT%{_udevrulesdir}/60-ceph-partuuid-workaround.rules
 
 %if (0%{?rhel} && 0%{?rhel} < 7)
 install -m 0644 -D udev/95-ceph-osd-alt.rules $RPM_BUILD_ROOT/lib/udev/rules.d/95-ceph-osd.rules
@@ -385,12 +387,6 @@ mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/lib/ceph/bootstrap-osd
 mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/lib/ceph/bootstrap-mds
 mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/log/radosgw
 
-%if %{defined suse_version}
-# Fedora seems to have some problems with this macro, use it only on SUSE
-%fdupes -s $RPM_BUILD_ROOT/%{python_sitelib}
-%fdupes %buildroot
-%endif
-
 %clean
 rm -rf $RPM_BUILD_ROOT
 
@@ -477,13 +473,8 @@ fi
 %{_libdir}/ceph/erasure-code/libec_jerasure*.so*
 %{_libdir}/ceph/erasure-code/libec_test_jerasure*.so*
 %{_libdir}/ceph/erasure-code/libec_missing_entry_point.so*
-%if 0%{?rhel} >= 7 || 0%{?fedora}
-/usr/lib/udev/rules.d/60-ceph-partuuid-workaround.rules
-/usr/lib/udev/rules.d/95-ceph-osd.rules
-%else
-/lib/udev/rules.d/60-ceph-partuuid-workaround.rules
-/lib/udev/rules.d/95-ceph-osd.rules
-%endif
+%{_udevrulesdir}/60-ceph-partuuid-workaround.rules
+%{_udevrulesdir}/95-ceph-osd.rules
 %config %{_sysconfdir}/bash_completion.d/ceph
 %config(noreplace) %{_sysconfdir}/logrotate.d/ceph
 %config(noreplace) %{_sysconfdir}/logrotate.d/radosgw
@@ -543,6 +534,7 @@ fi
 %config %{_sysconfdir}/bash_completion.d/rbd
 %config(noreplace) %{_sysconfdir}/ceph/rbdmap
 %{_initrddir}/rbdmap
+%{_udevrulesdir}/50-rbd.rules
 
 %postun -n ceph-common
 # Package removal cleanup
diff --git a/ceph.spec.in b/ceph.spec.in
index b1372aa..67a3825 100644
--- a/ceph.spec.in
+++ b/ceph.spec.in
@@ -5,9 +5,12 @@
 %{!?python_sitearch: %global python_sitearch %(%{__python} -c "from distutils.sysconfig import get_python_lib; print(get_python_lib(1))")}
 %endif
 
+%{!?_udevrulesdir: %global _udevrulesdir /lib/udev/rules.d}
+
 #################################################################################
 # common
 #################################################################################
+
 Name:		ceph
 Version:	@VERSION@
 Release:	@RPM_RELEASE@%{?dist}
@@ -24,7 +27,6 @@ Requires:	librados2 = %{version}-%{release}
 Requires:	libcephfs1 = %{version}-%{release}
 Requires:	ceph-common = %{version}-%{release}
 Requires:	python
-Requires:	python-argparse
 Requires:	python-ceph
 Requires:	python-requests
 Requires:	python-flask
@@ -44,7 +46,6 @@ BuildRequires:	gdbm
 BuildRequires:	pkgconfig
 BuildRequires:	python
 BuildRequires:	python-nose
-BuildRequires:	python-argparse
 BuildRequires:	libaio-devel
 BuildRequires:	libcurl-devel
 BuildRequires:	libxml2-devel
@@ -77,7 +78,6 @@ BuildRequires:	%insserv_prereq
 BuildRequires:	mozilla-nss-devel
 BuildRequires:	keyutils-devel
 BuildRequires:	libatomic-ops-devel
-BuildRequires:	fdupes
 %else
 Requires:	gdisk
 BuildRequires:	nss-devel
@@ -107,6 +107,11 @@ Requires:	librados2 = %{version}-%{release}
 Requires:	python-ceph = %{version}-%{release}
 Requires:	python-requests
 Requires:	redhat-lsb-core
+# python-argparse is only needed in distros with Python 2.6 or lower
+%if (0%{?rhel} && 0%{?rhel} <= 6) || (0%{?suse_version} && 0%{?suse_version} <= 1110)
+Requires:	python-argparse
+BuildRequires:	python-argparse
+%endif
 %description -n ceph-common
 common utilities to mount and interact with a ceph storage cluster
 
@@ -219,9 +224,6 @@ Group:		System Environment/Libraries
 License:	LGPL-2.0
 Requires:	librados2 = %{version}-%{release}
 Requires:	librbd1 = %{version}-%{release}
-%if 0%{defined suse_version}
-%py_requires
-%endif
 %description -n python-ceph
 This package contains Python libraries for interacting with Cephs RADOS
 object storage.
@@ -262,8 +264,13 @@ License:	LGPL-2.0
 Requires:	java
 Requires:	libcephfs_jni1 = %{version}-%{release}
 BuildRequires:	java-devel
+%if 0%{?el6}
 Requires:	junit4
 BuildRequires:	junit4
+%else
+Requires:       junit
+BuildRequires:  junit
+%endif
 %description -n cephfs-java
 This package contains the Java libraries for the Ceph File System.
 
@@ -352,13 +359,8 @@ chmod 0644 $RPM_BUILD_ROOT%{_docdir}/ceph/sample.ceph.conf
 chmod 0644 $RPM_BUILD_ROOT%{_docdir}/ceph/sample.fetch_config
 
 # udev rules
-%if 0%{?rhel} >= 7 || 0%{?fedora}
-install -m 0644 -D udev/50-rbd.rules $RPM_BUILD_ROOT/usr/lib/udev/rules.d/50-rbd.rules
-install -m 0644 -D udev/60-ceph-partuuid-workaround.rules $RPM_BUILD_ROOT/usr/lib/udev/rules.d/60-ceph-partuuid-workaround.rules
-%else
-install -m 0644 -D udev/50-rbd.rules $RPM_BUILD_ROOT/lib/udev/rules.d/50-rbd.rules
-install -m 0644 -D udev/60-ceph-partuuid-workaround.rules $RPM_BUILD_ROOT/lib/udev/rules.d/60-ceph-partuuid-workaround.rules
-%endif
+install -m 0644 -D udev/50-rbd.rules $RPM_BUILD_ROOT%{_udevrulesdir}/50-rbd.rules
+install -m 0644 -D udev/60-ceph-partuuid-workaround.rules $RPM_BUILD_ROOT%{_udevrulesdir}/60-ceph-partuuid-workaround.rules
 
 %if (0%{?rhel} && 0%{?rhel} < 7)
 install -m 0644 -D udev/95-ceph-osd-alt.rules $RPM_BUILD_ROOT/lib/udev/rules.d/95-ceph-osd.rules
@@ -385,12 +387,6 @@ mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/lib/ceph/bootstrap-osd
 mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/lib/ceph/bootstrap-mds
 mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/log/radosgw
 
-%if %{defined suse_version}
-# Fedora seems to have some problems with this macro, use it only on SUSE
-%fdupes -s $RPM_BUILD_ROOT/%{python_sitelib}
-%fdupes %buildroot
-%endif
-
 %clean
 rm -rf $RPM_BUILD_ROOT
 
@@ -477,13 +473,8 @@ fi
 %{_libdir}/ceph/erasure-code/libec_jerasure*.so*
 %{_libdir}/ceph/erasure-code/libec_test_jerasure*.so*
 %{_libdir}/ceph/erasure-code/libec_missing_entry_point.so*
-%if 0%{?rhel} >= 7 || 0%{?fedora}
-/usr/lib/udev/rules.d/60-ceph-partuuid-workaround.rules
-/usr/lib/udev/rules.d/95-ceph-osd.rules
-%else
-/lib/udev/rules.d/60-ceph-partuuid-workaround.rules
-/lib/udev/rules.d/95-ceph-osd.rules
-%endif
+%{_udevrulesdir}/60-ceph-partuuid-workaround.rules
+%{_udevrulesdir}/95-ceph-osd.rules
 %config %{_sysconfdir}/bash_completion.d/ceph
 %config(noreplace) %{_sysconfdir}/logrotate.d/ceph
 %config(noreplace) %{_sysconfdir}/logrotate.d/radosgw
@@ -543,6 +534,7 @@ fi
 %config %{_sysconfdir}/bash_completion.d/rbd
 %config(noreplace) %{_sysconfdir}/ceph/rbdmap
 %{_initrddir}/rbdmap
+%{_udevrulesdir}/50-rbd.rules
 
 %postun -n ceph-common
 # Package removal cleanup
diff --git a/configure b/configure
index 18f6513..b39fb90 100755
--- a/configure
+++ b/configure
@@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.69 for ceph 0.80.10.
+# Generated by GNU Autoconf 2.69 for ceph 0.80.11.
 #
 # Report bugs to <ceph-devel at vger.kernel.org>.
 #
@@ -590,8 +590,8 @@ MAKEFLAGS=
 # Identity of this package.
 PACKAGE_NAME='ceph'
 PACKAGE_TARNAME='ceph'
-PACKAGE_VERSION='0.80.10'
-PACKAGE_STRING='ceph 0.80.10'
+PACKAGE_VERSION='0.80.11'
+PACKAGE_STRING='ceph 0.80.11'
 PACKAGE_BUGREPORT='ceph-devel at vger.kernel.org'
 PACKAGE_URL=''
 
@@ -1459,7 +1459,7 @@ if test "$ac_init_help" = "long"; then
   # Omit some internal or obsolete options to make the list less imposing.
   # This message is too long to be a string in the A/UX 3.1 sh.
   cat <<_ACEOF
-\`configure' configures ceph 0.80.10 to adapt to many kinds of systems.
+\`configure' configures ceph 0.80.11 to adapt to many kinds of systems.
 
 Usage: $0 [OPTION]... [VAR=VALUE]...
 
@@ -1530,7 +1530,7 @@ fi
 
 if test -n "$ac_init_help"; then
   case $ac_init_help in
-     short | recursive ) echo "Configuration of ceph 0.80.10:";;
+     short | recursive ) echo "Configuration of ceph 0.80.11:";;
    esac
   cat <<\_ACEOF
 
@@ -1677,7 +1677,7 @@ fi
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
   cat <<\_ACEOF
-ceph configure 0.80.10
+ceph configure 0.80.11
 generated by GNU Autoconf 2.69
 
 Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2707,7 +2707,7 @@ cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.
 
-It was created by ceph $as_me 0.80.10, which was
+It was created by ceph $as_me 0.80.11, which was
 generated by GNU Autoconf 2.69.  Invocation command line was
 
   $ $0 $@
@@ -4824,7 +4824,7 @@ fi
 
 # Define the identity of the package.
  PACKAGE='ceph'
- VERSION='0.80.10'
+ VERSION='0.80.11'
 
 
 cat >>confdefs.h <<_ACEOF
@@ -12728,7 +12728,7 @@ fi
 
 # Define the identity of the package.
  PACKAGE='ceph'
- VERSION='0.80.10'
+ VERSION='0.80.11'
 
 
 cat >>confdefs.h <<_ACEOF
@@ -22601,7 +22601,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by ceph $as_me 0.80.10, which was
+This file was extended by ceph $as_me 0.80.11, which was
 generated by GNU Autoconf 2.69.  Invocation command line was
 
   CONFIG_FILES    = $CONFIG_FILES
@@ -22667,7 +22667,7 @@ _ACEOF
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
 ac_cs_version="\\
-ceph config.status 0.80.10
+ceph config.status 0.80.11
 configured by $0, generated by GNU Autoconf 2.69,
   with options \\"\$ac_cs_config\\"
 
diff --git a/configure.ac b/configure.ac
index f667953..4b0bec2 100644
--- a/configure.ac
+++ b/configure.ac
@@ -8,7 +8,7 @@ AC_PREREQ(2.59)
 # VERSION define is not used by the code.  It gets a version string
 # from 'git describe'; see src/ceph_ver.[ch]
 
-AC_INIT([ceph], [0.80.10], [ceph-devel at vger.kernel.org])
+AC_INIT([ceph], [0.80.11], [ceph-devel at vger.kernel.org])
 
 # Create release string.  Used with VERSION for RPMs.
 RPM_RELEASE=0
diff --git a/src/.git_version b/src/.git_version
index d488cda..654f55e 100644
--- a/src/.git_version
+++ b/src/.git_version
@@ -1,2 +1,2 @@
-ea6c958c38df1216bf95c927f143d8b13c4a9e70
-v0.80.10
+8424145d49264624a3b0a204aedb127835161070
+v0.80.11
diff --git a/src/Makefile.in b/src/Makefile.in
index ca219b8..1c86a61 100644
--- a/src/Makefile.in
+++ b/src/Makefile.in
@@ -5280,7 +5280,7 @@ ceph_test_librbd_LDADD = $(LIBRBD) $(LIBRADOS) $(UNITTEST_LDADD) $(CEPH_GLOBAL)
 ceph_test_librbd_CXXFLAGS = $(UNITTEST_CXXFLAGS)
 @LINUX_TRUE at ceph_test_librbd_fsx_SOURCES = test/librbd/fsx.c
 @LINUX_TRUE at ceph_test_librbd_fsx_LDADD = $(LIBRBD) $(LIBRADOS) -lm
- at LINUX_TRUE@ceph_test_librbd_fsx_CFLAGS = ${AM_CFLAGS} -Wno-format
+ at LINUX_TRUE@ceph_test_librbd_fsx_CFLAGS = ${AM_CFLAGS}
 ceph_test_cls_rbd_SOURCES = test/cls_rbd/test_cls_rbd.cc
 ceph_test_cls_rbd_LDADD = $(LIBRADOS) libcls_rbd_client.la libcls_lock_client.la $(UNITTEST_LDADD) $(RADOS_TEST_LDADD)
 ceph_test_cls_rbd_CXXFLAGS = $(UNITTEST_CXXFLAGS)
diff --git a/src/ceph-disk b/src/ceph-disk
index 6bd0220..c5e7af6 100755
--- a/src/ceph-disk
+++ b/src/ceph-disk
@@ -919,6 +919,9 @@ def unmount(
 
 ###########################################
 
+def extract_parted_partition_numbers(partitions):
+    numbers_as_strings = re.findall('^\d+', partitions, re.MULTILINE)
+    return map(int, numbers_as_strings)
 
 def get_free_partition_index(dev):
     """
@@ -943,31 +946,19 @@ def get_free_partition_index(dev):
 
     if not lines:
         raise Error('parted failed to output anything')
-    lines = str(lines).splitlines(True)
-
-    # work around buggy libreadline(?) library in rhel/centos.
-    idiot_prefix = '\x1b\x5b\x3f\x31\x30\x33\x34\x68'
-    if lines[0].startswith(idiot_prefix):
-        lines[0] = lines[0][8:]
-
-    if lines[0] not in ['CHS;\n', 'CYL;\n', 'BYT;\n']:
-        raise Error('weird parted units', lines[0])
-    del lines[0]
-
-    if not lines[0].startswith('/dev/'):
-        raise Error('weird parted disk entry', lines[0])
-    del lines[0]
-
-    seen = set()
-    for line in lines:
-        idx, _ = line.split(':', 1)
-        idx = int(idx)
-        seen.add(idx)
-
-    num = 1
-    while num in seen:
-        num += 1
-    return num
+    if ('CHS;' not in lines and
+        'CYL;' not in lines and
+        'BYT;' not in lines):
+        raise Error('parted output expected to contain one of ' +
+                    'CHH; CYL; or BYT; : ' + lines)
+    if dev not in lines:
+        raise Error('parted output expected to contain ' + dev + ': ' + lines)
+    _, partitions = lines.split(dev)
+    partition_numbers = extract_parted_partition_numbers(partitions)
+    if partition_numbers:
+        return max(partition_numbers) + 1
+    else:
+        return 1
 
 
 def update_partition(action, dev, description):
@@ -1018,6 +1009,13 @@ def zap(dev):
             [
                 'sgdisk',
                 '--zap-all',
+                '--',
+                dev,
+            ],
+        )
+        command_check_call(
+            [
+                'sgdisk',
                 '--clear',
                 '--mbrtogpt',
                 '--',
@@ -1039,10 +1037,40 @@ def prepare_journal_dev(
     journal_dm_keypath,
     ):
 
+    reusing_partition = False
+
     if is_partition(journal):
         LOG.debug('Journal %s is a partition', journal)
         LOG.warning('OSD will not be hot-swappable if journal is not the same device as the osd data')
-        return (journal, None, None)
+        if get_partition_type(journal) == JOURNAL_UUID:
+            LOG.debug('Journal %s was previously prepared with ceph-disk. Reusing it.', journal)
+            reusing_partition = True
+            # Read and reuse the partition uuid from this journal's previous life.
+            # We reuse the uuid instead of changing it because udev does not reliably
+            # notice changes to an existing partition's GUID.
+            # See http://tracker.ceph.com/issues/10146
+            journal_uuid = get_partition_uuid(journal)
+            LOG.debug('Reusing journal with uuid %s', journal_uuid)
+        else:
+            LOG.warning('Journal %s was not prepared with ceph-disk. Symlinking directly.', journal)
+            return (journal, None, None)
+
+    journal_symlink = '/dev/disk/by-partuuid/{journal_uuid}'.format(
+        journal_uuid=journal_uuid,
+        )
+
+    journal_dmcrypt = None
+    if journal_dm_keypath:
+        journal_dmcrypt = journal_symlink
+        journal_symlink = '/dev/mapper/{uuid}'.format(uuid=journal_uuid)
+
+    if reusing_partition:
+        # confirm that the journal_symlink exists. It should since this was an active journal
+        # in the past. Continuing otherwise would be futile.
+        assert os.path.exists(journal_symlink)
+        return (journal_symlink, journal_dmcrypt, journal_uuid)
+
+    # From here on we are creating a new journal device, not reusing.
 
     ptype = JOURNAL_UUID
     if journal_dm_keypath:
@@ -1099,8 +1127,8 @@ def prepare_journal_dev(
                 '--mbrtogpt',
                 '--',
                 journal,
-            ],
-        )
+                ]
+            )
 
         update_partition('-a', journal, 'prepared')
 
@@ -1112,16 +1140,11 @@ def prepare_journal_dev(
                 ],
             )
 
-        journal_symlink = '/dev/disk/by-partuuid/{journal_uuid}'.format(
-            journal_uuid=journal_uuid,
-            )
+        LOG.debug('Journal is GPT partition %s', journal_symlink)
 
-        journal_dmcrypt = None
-        if journal_dm_keypath:
-            journal_dmcrypt = journal_symlink
-            journal_symlink = '/dev/mapper/{uuid}'.format(uuid=journal_uuid)
+        # udev should have created the symlink by now. If not, abort.
+        assert os.path.exists(journal_symlink)
 
-        LOG.debug('Journal is GPT partition %s', journal_symlink)
         return (journal_symlink, journal_dmcrypt, journal_uuid)
 
     except subprocess.CalledProcessError as e:
@@ -2158,6 +2181,13 @@ def get_dev_fs(dev):
         return None
 
 
+def split_dev_base_partnum(dev):
+    if 'loop' in dev or 'cciss' in dev or 'nvme' in dev:
+        return re.match('(.*\d+)p(\d+)', dev).group(1, 2)
+    else:
+        return re.match('(\D+)(\d+)', dev).group(1, 2)
+
+
 def get_partition_type(part):
     """
     Get the GPT partition type UUID.  If we have an old blkid and can't
@@ -2207,7 +2237,7 @@ def get_partition_type(part):
     if 'blkid' not in warned_about:
         LOG.warning('Old blkid does not support ID_PART_ENTRY_* fields, trying sgdisk; may not correctly identify ceph volumes with dmcrypt')
         warned_about['blkid'] = True
-    (base, partnum) = re.match('(\D+)(\d+)', part).group(1, 2)
+    (base, partnum) = split_dev_base_partnum(part)
     sgdisk, _ = command(
         [
             'sgdisk',
@@ -2233,7 +2263,7 @@ def get_partition_type(part):
 
 
 def get_partition_uuid(dev):
-    (base, partnum) = re.match('(\D+)(\d+)', dev).group(1, 2)
+    (base, partnum) = split_dev_base_partnum(dev)
     out, _ = command(['sgdisk', '-i', partnum, base])
     for line in out.splitlines():
         m = re.match('Partition unique GUID: (\S+)', line)
diff --git a/src/ceph-post-file.in b/src/ceph-post-file.in
index 9b922a6..b278e8a 100755
--- a/src/ceph-post-file.in
+++ b/src/ceph-post-file.in
@@ -1,16 +1,16 @@
 #!/bin/bash -e
 
-# if we start up as ./$0, assume we are running from a source
-# checkout.
-if [ `dirname $0` = "." ] && [ $PWD != "/usr/bin" ]; then
-    known_hosts=../share/known_hosts_drop.ceph.com
-    ssh_key=../share/id_dsa_drop.ceph.com
-else
-    known_hosts=@datadir@/known_hosts_drop.ceph.com
-    ssh_key=@datadir@/id_dsa_drop.ceph.com
+# If these files exist, assume we are a source install.
+if [[ -f ../share/known_hosts_drop.ceph.com && -f ../share/id_dsa_drop.ceph.com ]]
+    then # running from source install
+       known_hosts=../share/known_hosts_drop.ceph.com
+       ssh_key=../share/id_dsa_drop.ceph.com
+    else # running from a pkg install
+       known_hosts=@datadir@/known_hosts_drop.ceph.com
+       ssh_key=@datadir@/id_dsa_drop.ceph.com
 fi
 
-usage() {
+function usage() {
     echo "Usage: $0 [options] file1 [dir2 ...]
 
 Easily upload files or directories to ceph.com for analysis by Ceph
@@ -155,7 +155,7 @@ done
 cp "$ssh_key" "$t4"
 cp "${ssh_key}.pub" "$t4.pub"
 
-sftp -i $t4 \
+sftp -o "IdentityFile=$t4" \
     -C \
     -oCheckHostIP=no \
     -oGlobalKnownHostsFile=$known_hosts \
diff --git a/src/ceph_fuse.cc b/src/ceph_fuse.cc
index 54616f6..cc97938 100644
--- a/src/ceph_fuse.cc
+++ b/src/ceph_fuse.cc
@@ -120,9 +120,7 @@ int main(int argc, const char **argv, const char *envp[]) {
       goto out_mc_start_failed;
 
     // start up network
-    messenger = Messenger::create(g_ceph_context,
-				  entity_name_t::CLIENT(), "client",
-				  getpid());
+    messenger = Messenger::create_client_messenger(g_ceph_context, "client");
     messenger->set_default_policy(Messenger::Policy::lossy_client(0, 0));
     messenger->set_policy(entity_name_t::TYPE_MDS,
 			  Messenger::Policy::lossless_client(0, 0));
diff --git a/src/ceph_syn.cc b/src/ceph_syn.cc
index c3410aa..1d10fa2 100644
--- a/src/ceph_syn.cc
+++ b/src/ceph_syn.cc
@@ -65,9 +65,8 @@ int main(int argc, const char **argv, char *envp[])
 
   cout << "ceph-syn: starting " << g_conf->num_client << " syn client(s)" << std::endl;
   for (int i=0; i<g_conf->num_client; i++) {
-    messengers[i] = Messenger::create(g_ceph_context,
-				      entity_name_t(entity_name_t::TYPE_CLIENT,-1), "synclient",
-				      i * 1000000 + getpid());
+    messengers[i] = Messenger::create_client_messenger(g_ceph_context,
+						       "synclient");
     messengers[i]->bind(g_conf->public_addr);
     mclients[i] = new MonClient(g_ceph_context);
     mclients[i]->build_initial_monmap();
diff --git a/src/common/Mutex.cc b/src/common/Mutex.cc
index f1e9a55..de66655 100644
--- a/src/common/Mutex.cc
+++ b/src/common/Mutex.cc
@@ -55,6 +55,7 @@ Mutex::Mutex(const char *n, bool r, bool ld,
     pthread_mutexattr_init(&attr);
     pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_ERRORCHECK);
     pthread_mutex_init(&_m, &attr);
+    pthread_mutexattr_destroy(&attr);
     if (g_lockdep)
       _register();
   }
diff --git a/src/common/RWLock.h b/src/common/RWLock.h
index 1a70ef1..308092c 100644
--- a/src/common/RWLock.h
+++ b/src/common/RWLock.h
@@ -19,6 +19,7 @@
 
 #include <pthread.h>
 #include "lockdep.h"
+#include "include/assert.h"
 #include "include/atomic.h"
 
 class RWLock
@@ -26,7 +27,7 @@ class RWLock
   mutable pthread_rwlock_t L;
   const char *name;
   mutable int id;
-  mutable atomic_t nrlock, nwlock;
+  mutable ceph::atomic_t nrlock, nwlock;
 
 public:
   RWLock(const RWLock& other);
diff --git a/src/common/Thread.cc b/src/common/Thread.cc
index a962e06..9f09a92 100644
--- a/src/common/Thread.cc
+++ b/src/common/Thread.cc
@@ -87,11 +87,11 @@ int Thread::kill(int signal)
 int Thread::try_create(size_t stacksize)
 {
   pthread_attr_t *thread_attr = NULL;
+  pthread_attr_t thread_attr_loc;
+  
   stacksize &= CEPH_PAGE_MASK;  // must be multiple of page
   if (stacksize) {
-    thread_attr = (pthread_attr_t*) malloc(sizeof(pthread_attr_t));
-    if (!thread_attr)
-      return -ENOMEM;
+    thread_attr = &thread_attr_loc;
     pthread_attr_init(thread_attr);
     pthread_attr_setstacksize(thread_attr, stacksize);
   }
@@ -113,8 +113,10 @@ int Thread::try_create(size_t stacksize)
   r = pthread_create(&thread_id, thread_attr, _entry_func, (void*)this);
   restore_sigset(&old_sigset);
 
-  if (thread_attr)
-    free(thread_attr);
+  if (thread_attr) {
+    pthread_attr_destroy(thread_attr);	
+  }
+
   return r;
 }
 
diff --git a/src/common/Throttle.cc b/src/common/Throttle.cc
index 026d731..5c68a1f 100644
--- a/src/common/Throttle.cc
+++ b/src/common/Throttle.cc
@@ -267,6 +267,12 @@ void SimpleThrottle::end_op(int r)
   m_cond.Signal();
 }
 
+bool SimpleThrottle::pending_error() const
+{
+  Mutex::Locker l(m_lock);
+  return (m_ret < 0);
+}
+
 int SimpleThrottle::wait_for_ret()
 {
   Mutex::Locker l(m_lock);
diff --git a/src/common/Throttle.h b/src/common/Throttle.h
index 6d03988..b171e27 100644
--- a/src/common/Throttle.h
+++ b/src/common/Throttle.h
@@ -76,9 +76,10 @@ public:
   ~SimpleThrottle();
   void start_op();
   void end_op(int r);
+  bool pending_error() const;
   int wait_for_ret();
 private:
-  Mutex m_lock;
+  mutable Mutex m_lock;
   Cond m_cond;
   uint64_t m_max;
   uint64_t m_current;
diff --git a/src/common/WorkQueue.h b/src/common/WorkQueue.h
index cbf49a8..a9eaffb 100644
--- a/src/common/WorkQueue.h
+++ b/src/common/WorkQueue.h
@@ -330,7 +330,7 @@ private:
 
 public:
   ThreadPool(CephContext *cct_, string nm, int n, const char *option = NULL);
-  ~ThreadPool();
+  virtual ~ThreadPool();
 
   /// return number of threads currently running
   int get_num_threads() {
@@ -340,10 +340,12 @@ public:
   
   /// assign a work queue to this thread pool
   void add_work_queue(WorkQueue_* wq) {
+    Mutex::Locker l(_lock);
     work_queues.push_back(wq);
   }
   /// remove a work queue from this thread pool
   void remove_work_queue(WorkQueue_* wq) {
+    Mutex::Locker l(_lock);
     unsigned i = 0;
     while (work_queues[i] != wq)
       i++;
@@ -433,4 +435,35 @@ public:
   }
 };
 
+class ContextWQ : public ThreadPool::WorkQueueVal<Context *> {
+public:
+  ContextWQ(const string &name, time_t ti, ThreadPool *tp)
+    : ThreadPool::WorkQueueVal<Context *>(name, ti, 0, tp) {}
+
+  void queue(Context *ctx) {
+    ThreadPool::WorkQueueVal<Context *>::queue(ctx);
+  }
+
+protected:
+  virtual void _enqueue(Context *item) {
+    _queue.push_back(item);
+  }
+  virtual void _enqueue_front(Context *item) {
+    _queue.push_front(item);
+  }
+  virtual bool _empty() {
+    return _queue.empty();
+  }
+  virtual Context *_dequeue() {
+    Context *item = _queue.front();
+    _queue.pop_front();
+    return item;
+  }
+  virtual void _process(Context *item) {
+    item->complete(0);
+  }
+private:
+  list<Context *> _queue;
+};
+
 #endif
diff --git a/src/common/admin_socket.cc b/src/common/admin_socket.cc
index 4af2904..77048d4 100644
--- a/src/common/admin_socket.cc
+++ b/src/common/admin_socket.cc
@@ -149,6 +149,33 @@ std::string AdminSocket::create_shutdown_pipe(int *pipe_rd, int *pipe_wr)
   return "";
 }
 
+std::string AdminSocket::destroy_shutdown_pipe()
+{
+  // Send a byte to the shutdown pipe that the thread is listening to
+  char buf[1] = { 0x0 };
+  int ret = safe_write(m_shutdown_wr_fd, buf, sizeof(buf));
+
+  // Close write end
+  VOID_TEMP_FAILURE_RETRY(close(m_shutdown_wr_fd));
+  m_shutdown_wr_fd = -1;
+
+  if (ret != 0) {
+    ostringstream oss;
+    oss << "AdminSocket::destroy_shutdown_pipe error: failed to write"
+      "to thread shutdown pipe: error " << ret;
+    return oss.str();
+  }
+
+  join();
+
+  // Close read end. Doing this before join() blocks the listenter and prevents
+  // joining.
+  VOID_TEMP_FAILURE_RETRY(close(m_shutdown_rd_fd));
+  m_shutdown_rd_fd = -1;
+
+  return "";
+}
+
 std::string AdminSocket::bind_and_listen(const std::string &sock_path, int *fd)
 {
   ldout(m_cct, 5) << "bind_and_listen " << sock_path << dendl;
@@ -544,30 +571,31 @@ bool AdminSocket::init(const std::string &path)
 
 void AdminSocket::shutdown()
 {
+  std::string err;
+
+  // Under normal operation this is unlikely to occur.  However for some unit
+  // tests, some object members are not initialized and so cannot be deleted
+  // without fault.
   if (m_shutdown_wr_fd < 0)
     return;
 
   ldout(m_cct, 5) << "shutdown" << dendl;
 
-  // Send a byte to the shutdown pipe that the thread is listening to
-  char buf[1] = { 0x0 };
-  int ret = safe_write(m_shutdown_wr_fd, buf, sizeof(buf));
-  VOID_TEMP_FAILURE_RETRY(close(m_shutdown_wr_fd));
-  m_shutdown_wr_fd = -1;
-
-  if (ret == 0) {
-    join();
-  } else {
-    lderr(m_cct) << "AdminSocket::shutdown: failed to write "
-      "to thread shutdown pipe: error " << ret << dendl;
+  err = destroy_shutdown_pipe();
+  if (!err.empty()) {
+    lderr(m_cct) << "AdminSocket::shutdown: error: " << err << dendl;
   }
 
+  VOID_TEMP_FAILURE_RETRY(close(m_sock_fd));
+
   unregister_command("version");
   unregister_command("git_version");
   unregister_command("0");
   delete m_version_hook;
+
   unregister_command("help");
   delete m_help_hook;
+
   unregister_command("get_command_descriptions");
   delete m_getdescs_hook;
 
diff --git a/src/common/admin_socket.h b/src/common/admin_socket.h
index 3bc8483..5e855bc 100644
--- a/src/common/admin_socket.h
+++ b/src/common/admin_socket.h
@@ -79,6 +79,7 @@ private:
   void shutdown();
 
   std::string create_shutdown_pipe(int *pipe_rd, int *pipe_wr);
+  std::string destroy_shutdown_pipe();
   std::string bind_and_listen(const std::string &sock_path, int *fd);
 
   void *entry();
diff --git a/src/common/buffer.cc b/src/common/buffer.cc
index 35c5d36..fc00f0b 100644
--- a/src/common/buffer.cc
+++ b/src/common/buffer.cc
@@ -1060,12 +1060,23 @@ static uint32_t simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZE
 	 it != _buffers.end();
 	 ++it) {
       if (p + it->length() > o) {
-	if (p >= o && p+it->length() <= o+l)
-	  it->zero();                         // all
-	else if (p >= o) 
-	  it->zero(0, o+l-p);                 // head
-	else
-	  it->zero(o-p, it->length()-(o-p));  // tail
+        if (p >= o && p+it->length() <= o+l) {
+          // 'o'------------- l -----------|
+          //      'p'-- it->length() --|
+	  it->zero();
+        } else if (p >= o) {
+          // 'o'------------- l -----------|
+          //    'p'------- it->length() -------|
+	  it->zero(0, o+l-p);
+        } else if (p + it->length() <= o+l) {
+          //     'o'------------- l -----------|
+          // 'p'------- it->length() -------|
+	  it->zero(o-p, it->length()-(o-p));
+        } else {
+          //       'o'----------- l -----------|
+          // 'p'---------- it->length() ----------|
+          it->zero(o-p, l);
+        }
       }
       p += it->length();
       if (o+l <= p)
diff --git a/src/common/ceph_context.cc b/src/common/ceph_context.cc
index 4ebf79e..77488b6 100644
--- a/src/common/ceph_context.cc
+++ b/src/common/ceph_context.cc
@@ -265,6 +265,7 @@ CephContext::CephContext(uint32_t module_type_)
     _crypto_aes(NULL)
 {
   ceph_spin_init(&_service_thread_lock);
+  ceph_spin_init(&_associated_objs_lock);
 
   _log = new ceph::log::Log(&_conf->subsys);
   _log->start();
@@ -298,6 +299,10 @@ CephContext::~CephContext()
 {
   join_service_thread();
 
+  for (map<string, AssociatedSingletonObject*>::iterator it = _associated_objs.begin();
+       it != _associated_objs.end(); it++)
+    delete it->second;
+
   if (_conf->lockdep) {
     lockdep_unregister_ceph_context(this);
   }
@@ -335,6 +340,7 @@ CephContext::~CephContext()
 
   delete _conf;
   ceph_spin_destroy(&_service_thread_lock);
+  ceph_spin_destroy(&_associated_objs_lock);
 
   delete _crypto_none;
   delete _crypto_aes;
diff --git a/src/common/ceph_context.h b/src/common/ceph_context.h
index ba60620..7241c85 100644
--- a/src/common/ceph_context.h
+++ b/src/common/ceph_context.h
@@ -17,6 +17,7 @@
 
 #include <iostream>
 #include <stdint.h>
+#include <string>
 
 #include "include/buffer.h"
 #include "include/atomic.h"
@@ -58,6 +59,10 @@ private:
   ~CephContext();
   atomic_t nref;
 public:
+  class AssociatedSingletonObject {
+   public:
+    virtual ~AssociatedSingletonObject() {}
+  };
   CephContext *get() {
     nref.inc();
     return this;
@@ -102,6 +107,17 @@ public:
   void do_command(std::string command, cmdmap_t& cmdmap, std::string format,
 		  bufferlist *out);
 
+  template<typename T>
+  void lookup_or_create_singleton_object(T*& p, const std::string &name) {
+    ceph_spin_lock(&_associated_objs_lock);
+    if (!_associated_objs.count(name)) {
+      p = new T(this);
+      _associated_objs[name] = reinterpret_cast<AssociatedSingletonObject*>(p);
+    } else {
+      p = reinterpret_cast<T*>(_associated_objs[name]);
+    }
+    ceph_spin_unlock(&_associated_objs_lock);
+  }
   /**
    * get a crypto handler
    */
@@ -138,6 +154,9 @@ private:
 
   ceph::HeartbeatMap *_heartbeat_map;
 
+  ceph_spinlock_t _associated_objs_lock;
+  std::map<std::string, AssociatedSingletonObject*> _associated_objs;
+
   // crypto
   CryptoNone *_crypto_none;
   CryptoAES *_crypto_aes;
diff --git a/src/common/config.cc b/src/common/config.cc
index fc47083..7650b88 100644
--- a/src/common/config.cc
+++ b/src/common/config.cc
@@ -148,7 +148,7 @@ md_config_t::md_config_t()
 #undef OPTION
 #undef SUBSYS
 #undef DEFAULT_SUBSYS
-  lock("md_config_t", true)
+  lock("md_config_t", true, false)
 {
   init_subsys();
 }
diff --git a/src/common/config_opts.h b/src/common/config_opts.h
index d8ecdc7..1033a8e 100644
--- a/src/common/config_opts.h
+++ b/src/common/config_opts.h
@@ -137,6 +137,8 @@ OPTION(mon_sync_fs_threshold, OPT_INT, 5)   // sync() when writing this many obj
 OPTION(mon_compact_on_start, OPT_BOOL, false)  // compact leveldb on ceph-mon start
 OPTION(mon_compact_on_bootstrap, OPT_BOOL, false)  // trigger leveldb compaction on bootstrap
 OPTION(mon_compact_on_trim, OPT_BOOL, true)       // compact (a prefix) when we trim old states
+OPTION(mon_osd_cache_size, OPT_INT, 10)  // the size of osdmaps cache, not to rely on underlying store's cache
+
 OPTION(mon_tick_interval, OPT_INT, 5)
 OPTION(mon_subscribe_interval, OPT_DOUBLE, 300)
 OPTION(mon_delta_reset_interval, OPT_DOUBLE, 10)   // seconds of inactivity before we reset the pg delta to 0
@@ -172,7 +174,8 @@ OPTION(mon_pg_warn_min_pool_objects, OPT_INT, 1000)  // do not warn on pools bel
 OPTION(mon_cache_target_full_warn_ratio, OPT_FLOAT, .66) // position between pool cache_target_full and max where we start warning
 OPTION(mon_osd_full_ratio, OPT_FLOAT, .95) // what % full makes an OSD "full"
 OPTION(mon_osd_nearfull_ratio, OPT_FLOAT, .85) // what % full makes an OSD near full
-OPTION(mon_globalid_prealloc, OPT_INT, 100)   // how many globalids to prealloc
+OPTION(mon_allow_pool_delete, OPT_BOOL, true) // allow pool deletion
+OPTION(mon_globalid_prealloc, OPT_INT, 10000)   // how many globalids to prealloc
 OPTION(mon_osd_report_timeout, OPT_INT, 900)    // grace period before declaring unresponsive OSDs dead
 OPTION(mon_force_standby_active, OPT_BOOL, true) // should mons force standby-replay mds to be active
 OPTION(mon_warn_on_old_mons, OPT_BOOL, true) // should mons set health to WARN if part of quorum is old?
@@ -452,6 +455,7 @@ OPTION(osd_tier_default_cache_mode, OPT_STR, "writeback")
 OPTION(osd_tier_default_cache_hit_set_count, OPT_INT, 4)
 OPTION(osd_tier_default_cache_hit_set_period, OPT_INT, 1200)
 OPTION(osd_tier_default_cache_hit_set_type, OPT_STR, "bloom")
+OPTION(osd_tier_default_cache_min_read_recency_for_promote, OPT_INT, 1) // number of recent HitSets the object must appear in to be promoted (on read)
 
 OPTION(osd_map_dedup, OPT_BOOL, true)
 OPTION(osd_map_max_advance, OPT_INT, 200) // make this < cache_size!
@@ -475,14 +479,21 @@ OPTION(osd_recover_clone_overlap_limit, OPT_INT, 10)
 OPTION(osd_backfill_scan_min, OPT_INT, 64)
 OPTION(osd_backfill_scan_max, OPT_INT, 512)
 OPTION(osd_op_thread_timeout, OPT_INT, 15)
+OPTION(osd_op_thread_suicide_timeout, OPT_INT, 150)
 OPTION(osd_recovery_thread_timeout, OPT_INT, 30)
+OPTION(osd_recovery_thread_suicide_timeout, OPT_INT, 300)
 OPTION(osd_snap_trim_thread_timeout, OPT_INT, 60*60*1)
+OPTION(osd_snap_trim_thread_suicide_timeout, OPT_INT, 60*60*10)
 OPTION(osd_snap_trim_sleep, OPT_FLOAT, 0)
 OPTION(osd_scrub_thread_timeout, OPT_INT, 60)
+OPTION(osd_scrub_thread_suicide_timeout, OPT_INT, 60)
 OPTION(osd_scrub_finalize_thread_timeout, OPT_INT, 60*10)
+OPTION(osd_scrub_finalize_thread_suicide_timeout, OPT_INT, 60*10*10)
 OPTION(osd_scrub_invalid_stats, OPT_BOOL, true)
 OPTION(osd_remove_thread_timeout, OPT_INT, 60*60)
+OPTION(osd_remove_thread_suicide_timeout, OPT_INT, 10*60*60)
 OPTION(osd_command_thread_timeout, OPT_INT, 10*60)
+OPTION(osd_command_thread_suicide_timeout, OPT_INT, 15*60)
 OPTION(osd_age, OPT_FLOAT, .8)
 OPTION(osd_age_time, OPT_INT, 0)
 OPTION(osd_heartbeat_addr, OPT_ADDR, entity_addr_t())
@@ -727,6 +738,9 @@ OPTION(journal_ignore_corruption, OPT_BOOL, false) // assume journal is not corr
 OPTION(rados_mon_op_timeout, OPT_DOUBLE, 0) // how many seconds to wait for a response from the monitor before returning an error from a rados operation. 0 means on limit.
 OPTION(rados_osd_op_timeout, OPT_DOUBLE, 0) // how many seconds to wait for a response from osds before returning an error from a rados operation. 0 means no limit.
 
+OPTION(rbd_op_threads, OPT_INT, 1)
+OPTION(rbd_op_thread_timeout, OPT_INT, 60)
+OPTION(rbd_non_blocking_aio, OPT_BOOL, true) // process AIO ops from a worker thread to prevent blocking
 OPTION(rbd_cache, OPT_BOOL, false) // whether to enable caching (writeback unless rbd_cache_max_dirty is 0)
 OPTION(rbd_cache_writethrough_until_flush, OPT_BOOL, false) // whether to make writeback caching writethrough until flush is called, to be sure the user of librbd will send flushs so that writeback is safe
 OPTION(rbd_cache_size, OPT_LONGLONG, 32<<20)         // cache size in bytes
@@ -784,6 +798,7 @@ OPTION(rgw_swift_url_prefix, OPT_STR, "swift") // entry point for which a url is
 OPTION(rgw_swift_auth_url, OPT_STR, "")        // default URL to go and verify tokens for v1 auth (if not using internal swift auth)
 OPTION(rgw_swift_auth_entry, OPT_STR, "auth")  // entry point for which a url is considered a swift auth url
 OPTION(rgw_swift_tenant_name, OPT_STR, "")  // tenant name to use for swift access
+OPTION(rgw_swift_enforce_content_length, OPT_BOOL, false)  // enforce generation of Content-Length even in cost of performance or scalability
 OPTION(rgw_keystone_url, OPT_STR, "")  // url for keystone server
 OPTION(rgw_keystone_admin_token, OPT_STR, "")  // keystone admin token (shared secret)
 OPTION(rgw_keystone_admin_user, OPT_STR, "")  // keystone admin user name
diff --git a/src/common/sync_filesystem.h b/src/common/sync_filesystem.h
index eff18d2..7b392a2 100644
--- a/src/common/sync_filesystem.h
+++ b/src/common/sync_filesystem.h
@@ -42,13 +42,17 @@ inline int sync_filesystem(int fd)
     return 0;
 #endif
 
-#ifdef BTRFS_IOC_SYNC
-  if (::ioctl(fd, BTRFS_IOC_SYNC) == 0)
+#if defined(HAVE_SYS_SYNCFS) || defined(SYS_syncfs) || defined(__NR_syncfs)
+  else if (errno == ENOSYS) {
+    sync();
     return 0;
-#endif
-
+  } else {
+    return -errno;
+  }
+#else
   sync();
   return 0;
+#endif
 }
 
 #endif
diff --git a/src/crush/CrushWrapper.cc b/src/crush/CrushWrapper.cc
index b17829b..abf79af 100644
--- a/src/crush/CrushWrapper.cc
+++ b/src/crush/CrushWrapper.cc
@@ -113,6 +113,9 @@ bool CrushWrapper::_maybe_remove_last_instance(CephContext *cct, int item, bool
   if (_search_item_exists(item)) {
     return false;
   }
+  if (item < 0 && _bucket_is_in_use(cct, item)) {
+    return false;
+  }
 
   if (item < 0 && !unlink_only) {
     crush_bucket *t = get_bucket(item);
@@ -140,6 +143,9 @@ int CrushWrapper::remove_item(CephContext *cct, int item, bool unlink_only)
 		    << " items, not empty" << dendl;
       return -ENOTEMPTY;
     }
+    if (_bucket_is_in_use(cct, item)) {
+      return -EBUSY;
+    }
   }
 
   for (int i = 0; i < crush->max_buckets; i++) {
@@ -179,6 +185,22 @@ bool CrushWrapper::_search_item_exists(int item) const
   return false;
 }
 
+bool CrushWrapper::_bucket_is_in_use(CephContext *cct, int item)
+{
+  for (unsigned i = 0; i < crush->max_rules; ++i) {
+    crush_rule *r = crush->rules[i];
+    if (!r)
+      continue;
+    for (unsigned j = 0; j < r->len; ++j) {
+      if (r->steps[j].op == CRUSH_RULE_TAKE &&
+	  r->steps[j].arg1 == item) {
+	return true;
+      }
+    }
+  }
+  return false;
+}
+
 int CrushWrapper::_remove_item_under(CephContext *cct, int item, int ancestor, bool unlink_only)
 {
   ldout(cct, 5) << "_remove_item_under " << item << " under " << ancestor
@@ -214,6 +236,11 @@ int CrushWrapper::remove_item_under(CephContext *cct, int item, int ancestor, bo
 {
   ldout(cct, 5) << "remove_item_under " << item << " under " << ancestor
 		<< (unlink_only ? " unlink_only":"") << dendl;
+
+  if (!unlink_only && _bucket_is_in_use(cct, item)) {
+    return -EBUSY;
+  }
+
   int ret = _remove_item_under(cct, item, ancestor, unlink_only);
   if (ret < 0)
     return ret;
diff --git a/src/crush/CrushWrapper.h b/src/crush/CrushWrapper.h
index 9fac2fe..8e28597 100644
--- a/src/crush/CrushWrapper.h
+++ b/src/crush/CrushWrapper.h
@@ -518,6 +518,7 @@ public:
 private:
   bool _maybe_remove_last_instance(CephContext *cct, int id, bool unlink_only);
   int _remove_item_under(CephContext *cct, int id, int ancestor, bool unlink_only);
+  bool _bucket_is_in_use(CephContext *cct, int id);
 public:
   int remove_item_under(CephContext *cct, int id, int ancestor, bool unlink_only);
 
diff --git a/src/include/interval_set.h b/src/include/interval_set.h
index 7affb87..9a7d2f2 100644
--- a/src/include/interval_set.h
+++ b/src/include/interval_set.h
@@ -371,6 +371,7 @@ class interval_set {
   
   void erase(iterator &i) {
     _size -= i.get_len();
+    assert(_size >= 0);
     m.erase(i._iter);
   }
 
diff --git a/src/init-radosgw b/src/init-radosgw
index 5aa658c..7af51ef 100644
--- a/src/init-radosgw
+++ b/src/init-radosgw
@@ -66,8 +66,10 @@ case "$1" in
             fi
 
             log_file=`$RADOSGW -n $name --show-config-value log_file`
-            if [ -n "$log_file" ] && [ ! -e "$log_file" ]; then
-                touch "$log_file"
+            if [ -n "$log_file" ]; then
+                if [ ! -e "$log_file" ]; then
+                    touch "$log_file"
+                fi
                 chown $user $log_file
             fi
 
@@ -85,7 +87,15 @@ case "$1" in
         $0 start
         ;;
     stop)
-        start-stop-daemon --stop -x $RADOSGW --oknodo
+        timeout=0
+        for name in `ceph-conf --list-sections $PREFIX`;
+        do
+          t=`$RADOSGW -n $name --show-config-value rgw_exit_timeout_secs`
+          if [ $t -gt $timeout ]; then timeout=$t; fi
+        done
+
+        if [ $timeout -gt 0 ]; then TIMEOUT="-R $timeout"; fi
+        start-stop-daemon --stop -x $RADOSGW --oknodo $TIMEOUT
         ;;
     status)
         daemon_is_running $RADOSGW
diff --git a/src/init-radosgw.sysv b/src/init-radosgw.sysv
index fe56939..1bb8a89 100644
--- a/src/init-radosgw.sysv
+++ b/src/init-radosgw.sysv
@@ -73,8 +73,10 @@ case "$1" in
             fi
 
             log_file=`$RADOSGW -n $name --show-config-value log_file`
-            if [ -n "$log_file" ] && [ ! -e "$log_file" ]; then
-                touch "$log_file"
+            if [ -n "$log_file" ]; then
+                if [ ! -e "$log_file" ]; then
+                    touch "$log_file"
+                fi
                 chown $user $log_file
             fi
 
@@ -99,8 +101,19 @@ case "$1" in
         ;;
     stop)
         #start-stop-daemon --stop -x $RADOSGW --oknodo
+        timeout=0
+        for name in `ceph-conf --list-sections $PREFIX`;
+        do
+          t=`$RADOSGW -n $name --show-config-value rgw_exit_timeout_secs`
+          if [ $t -gt $timeout ]; then timeout=$t; fi
+        done
+
         killproc $RADOSGW
         echo "Stopping radosgw instance(s)..."
+        while pidof $RADOSGW >/dev/null && [ $timeout -gt 0 ] ; do
+          sleep 1
+          timeout=$(($timeout - 1))
+        done
         ;;
     status)
         daemon_is_running $RADOSGW
diff --git a/src/json_spirit/json_spirit_reader_template.h b/src/json_spirit/json_spirit_reader_template.h
index f87b593..c50f885 100644
--- a/src/json_spirit/json_spirit_reader_template.h
+++ b/src/json_spirit/json_spirit_reader_template.h
@@ -13,6 +13,8 @@
 #include "json_spirit_value.h"
 #include "json_spirit_error_position.h"
 
+#include "common/utf8.h"
+
 #define BOOST_SPIRIT_THREADSAFE  // uncomment for multithreaded use, requires linking to boost.thread
 
 #include <boost/bind.hpp>
@@ -71,18 +73,30 @@ namespace json_spirit
         return ( hex_to_num( c1 ) << 4 ) + hex_to_num( c2 );
     }       
 
-    template< class Char_type, class Iter_type >
-    Char_type unicode_str_to_char( Iter_type& begin )
+    template< class String_type, class Iter_type >
+    String_type unicode_str_to_utf8( Iter_type& begin );
+
+    template<>
+    std::string unicode_str_to_utf8( std::string::const_iterator & begin )
     {
+        typedef std::string::value_type Char_type;
+
         const Char_type c1( *( ++begin ) );
         const Char_type c2( *( ++begin ) );
         const Char_type c3( *( ++begin ) );
         const Char_type c4( *( ++begin ) );
 
-        return ( hex_to_num( c1 ) << 12 ) + 
-               ( hex_to_num( c2 ) <<  8 ) + 
-               ( hex_to_num( c3 ) <<  4 ) + 
-               hex_to_num( c4 );
+        unsigned long uc = ( hex_to_num( c1 ) << 12 ) + 
+                           ( hex_to_num( c2 ) <<  8 ) + 
+                           ( hex_to_num( c3 ) <<  4 ) + 
+                           hex_to_num( c4 );
+
+        unsigned char buf[7];  // MAX_UTF8_SZ is 6 (see src/common/utf8.c)
+        int r = encode_utf8(uc, buf);
+        if (r >= 0) {
+            return std::string(reinterpret_cast<char *>(buf), r);
+        }
+        return std::string("_");
     }
 
     template< class String_type >
@@ -116,7 +130,7 @@ namespace json_spirit
             {
                 if( end - begin >= 5 )  //  expecting "uHHHH..."
                 {
-                    s += unicode_str_to_char< Char_type >( begin );  
+                    s += unicode_str_to_utf8< String_type >( begin );
                 }
                 break;
             }
@@ -178,11 +192,15 @@ namespace json_spirit
         return get_str_< std::string >( begin, end );
     }
 
+// Need this guard else it tries to instantiate unicode_str_to_utf8 with a
+// std::wstring, which isn't presently implemented
+#if defined( JSON_SPIRIT_WMVALUE_ENABLED ) && !defined( BOOST_NO_STD_WSTRING )
     inline std::wstring get_str( std::wstring::const_iterator begin, std::wstring::const_iterator end )
     {
         return get_str_< std::wstring >( begin, end );
     }
-    
+#endif
+
     template< class String_type, class Iter_type >
     String_type get_str( Iter_type begin, Iter_type end )
     {
diff --git a/src/libcephfs.cc b/src/libcephfs.cc
index 88e86ba..a8a32f4 100644
--- a/src/libcephfs.cc
+++ b/src/libcephfs.cc
@@ -34,9 +34,8 @@
 struct ceph_mount_info
 {
 public:
-  ceph_mount_info(uint64_t msgr_nonce_, CephContext *cct_)
-    : msgr_nonce(msgr_nonce_),
-      mounted(false),
+  ceph_mount_info(CephContext *cct_)
+    : mounted(false),
       inited(false),
       client(NULL),
       monclient(NULL),
@@ -80,7 +79,7 @@ public:
       goto fail;
 
     //network connection
-    messenger = Messenger::create(cct, entity_name_t::CLIENT(), "client", msgr_nonce);
+    messenger = Messenger::create_client_messenger(cct, "client");
 
     //at last the client
     ret = -1002;
@@ -215,7 +214,6 @@ public:
   }
 
 private:
-  uint64_t msgr_nonce;
   bool mounted;
   bool inited;
   Client *client;
@@ -242,14 +240,7 @@ extern "C" const char *ceph_version(int *pmajor, int *pminor, int *ppatch)
 
 extern "C" int ceph_create_with_context(struct ceph_mount_info **cmount, CephContext *cct)
 {
-  uint64_t nonce = 0;
-
-  // 6 bytes of random and 2 bytes of pid
-  get_random_bytes((char*)&nonce, sizeof(nonce));
-  nonce &= ~0xffff;
-  nonce |= (uint64_t)getpid();
-
-  *cmount = new struct ceph_mount_info(nonce, cct);
+  *cmount = new struct ceph_mount_info(cct);
   return 0;
 }
 
diff --git a/src/librados/IoCtxImpl.cc b/src/librados/IoCtxImpl.cc
index 6fc22ad..887b390 100644
--- a/src/librados/IoCtxImpl.cc
+++ b/src/librados/IoCtxImpl.cc
@@ -815,7 +815,7 @@ int librados::IoCtxImpl::hit_set_list(uint32_t hash, AioCompletionImpl *c,
   ::ObjectOperation rd;
   rd.hit_set_ls(pls, NULL);
   object_locator_t oloc(poolid);
-  objecter->pg_read(hash, oloc, rd, NULL, 0, onack, NULL);
+  objecter->pg_read(hash, oloc, rd, NULL, 0, onack, NULL, NULL);
   return 0;
 }
 
@@ -831,7 +831,7 @@ int librados::IoCtxImpl::hit_set_get(uint32_t hash, AioCompletionImpl *c,
   ::ObjectOperation rd;
   rd.hit_set_get(utime_t(stamp, 0), pbl, 0);
   object_locator_t oloc(poolid);
-  objecter->pg_read(hash, oloc, rd, NULL, 0, onack, NULL);
+  objecter->pg_read(hash, oloc, rd, NULL, 0, onack, NULL, NULL);
   return 0;
 }
 
diff --git a/src/librados/RadosClient.cc b/src/librados/RadosClient.cc
index a5e77a5..572aa25 100644
--- a/src/librados/RadosClient.cc
+++ b/src/librados/RadosClient.cc
@@ -30,7 +30,7 @@
 
 #include "messages/MWatchNotify.h"
 #include "messages/MLog.h"
-#include "msg/SimpleMessenger.h"
+#include "msg/Messenger.h"
 
 // needed for static_cast
 #include "messages/PaxosServiceMessage.h"
@@ -52,8 +52,6 @@
 #undef dout_prefix
 #define dout_prefix *_dout << "librados: "
 
-static atomic_t rados_instance;
-
 bool librados::RadosClient::ms_get_authorizer(int dest_type,
 					      AuthAuthorizer **authorizer,
 					      bool force_new) {
@@ -206,7 +204,6 @@ int librados::RadosClient::connect()
   common_init_finish(cct);
 
   int err;
-  uint64_t nonce;
 
   // already connected?
   if (state == CONNECTING)
@@ -221,8 +218,7 @@ int librados::RadosClient::connect()
     goto out;
 
   err = -ENOMEM;
-  nonce = getpid() + (1000000 * (uint64_t)rados_instance.inc());
-  messenger = new SimpleMessenger(cct, entity_name_t::CLIENT(-1), "radosclient", nonce);
+  messenger = Messenger::create_client_messenger(cct, "radosclient");
   if (!messenger)
     goto out;
 
diff --git a/src/librados/RadosClient.h b/src/librados/RadosClient.h
index e608ced..3e12b55 100644
--- a/src/librados/RadosClient.h
+++ b/src/librados/RadosClient.h
@@ -33,7 +33,7 @@ struct md_config_t;
 class Message;
 class MWatchNotify;
 class MLog;
-class SimpleMessenger;
+class Messenger;
 
 class librados::RadosClient : public Dispatcher
 {
@@ -49,7 +49,7 @@ private:
 
   OSDMap osdmap;
   MonClient monclient;
-  SimpleMessenger *messenger;
+  Messenger *messenger;
 
   uint64_t instance_id;
 
diff --git a/src/librbd/AioCompletion.cc b/src/librbd/AioCompletion.cc
index 86b5b50..e818674 100644
--- a/src/librbd/AioCompletion.cc
+++ b/src/librbd/AioCompletion.cc
@@ -5,6 +5,7 @@
 
 #include "common/ceph_context.h"
 #include "common/dout.h"
+#include "common/errno.h"
 
 #include "librbd/AioRequest.h"
 #include "librbd/internal.h"
@@ -25,7 +26,7 @@ namespace librbd {
     building = false;
     if (!pending_count) {
       finalize(cct, rval);
-      complete();
+      complete(cct);
     }
     lock.Unlock();
   }
@@ -54,6 +55,49 @@ namespace librbd {
     }
   }
 
+  void AioCompletion::complete(CephContext *cct) {
+    utime_t elapsed;
+    assert(lock.is_locked());
+    elapsed = ceph_clock_now(cct) - start_time;
+    switch (aio_type) {
+    case AIO_TYPE_READ:
+      ictx->perfcounter->tinc(l_librbd_aio_rd_latency, elapsed); break;
+    case AIO_TYPE_WRITE:
+      ictx->perfcounter->tinc(l_librbd_aio_wr_latency, elapsed); break;
+    case AIO_TYPE_DISCARD:
+      ictx->perfcounter->tinc(l_librbd_aio_discard_latency, elapsed); break;
+    case AIO_TYPE_FLUSH:
+      ictx->perfcounter->tinc(l_librbd_aio_flush_latency, elapsed); break;
+    default:
+      lderr(cct) << "completed invalid aio_type: " << aio_type << dendl;
+      break;
+    }
+
+    if (ictx != NULL) {
+      Mutex::Locker l(ictx->aio_lock);
+      assert(ictx->pending_aio != 0);
+      --ictx->pending_aio;
+      ictx->pending_aio_cond.Signal();
+    }
+
+    if (complete_cb) {
+      complete_cb(rbd_comp, complete_arg);
+    }
+    done = true;
+    cond.Signal();
+  }
+
+  void AioCompletion::fail(CephContext *cct, int r)
+  {
+    lderr(cct) << "AioCompletion::fail() " << this << ": " << cpp_strerror(r)
+               << dendl;
+    lock.Lock();
+    assert(pending_count == 0);
+    rval = r;
+    complete(cct);
+    put_unlock();
+  }
+
   void AioCompletion::complete_request(CephContext *cct, ssize_t r)
   {
     ldout(cct, 20) << "AioCompletion::complete_request() "
@@ -70,7 +114,7 @@ namespace librbd {
     int count = --pending_count;
     if (!count && !building) {
       finalize(cct, rval);
-      complete();
+      complete(cct);
     }
     put_unlock();
   }
diff --git a/src/librbd/AioCompletion.h b/src/librbd/AioCompletion.h
index e28cd6a..4dbad52 100644
--- a/src/librbd/AioCompletion.h
+++ b/src/librbd/AioCompletion.h
@@ -101,37 +101,8 @@ namespace librbd {
       start_time = ceph_clock_now(ictx->cct);
     }
 
-    void complete() {
-      utime_t elapsed;
-      assert(lock.is_locked());
-      elapsed = ceph_clock_now(ictx->cct) - start_time;
-      switch (aio_type) {
-      case AIO_TYPE_READ:
-	ictx->perfcounter->tinc(l_librbd_aio_rd_latency, elapsed); break;
-      case AIO_TYPE_WRITE:
-	ictx->perfcounter->tinc(l_librbd_aio_wr_latency, elapsed); break;
-      case AIO_TYPE_DISCARD:
-	ictx->perfcounter->tinc(l_librbd_aio_discard_latency, elapsed); break;
-      case AIO_TYPE_FLUSH:
-	ictx->perfcounter->tinc(l_librbd_aio_flush_latency, elapsed); break;
-      default:
-	lderr(ictx->cct) << "completed invalid aio_type: " << aio_type << dendl;
-	break;
-      }
-
-      {
-        Mutex::Locker l(ictx->aio_lock);
-        assert(ictx->pending_aio != 0);
-        --ictx->pending_aio;
-        ictx->pending_aio_cond.Signal();
-      }
-
-      if (complete_cb) {
-	complete_cb(rbd_comp, complete_arg);
-      }
-      done = true;
-      cond.Signal();
-    }
+    void complete(CephContext *cct);
+    void fail(CephContext *cct, int r);
 
     void set_complete_cb(void *cb_arg, callback_t cb) {
       complete_cb = cb;
diff --git a/src/librbd/AioRequest.cc b/src/librbd/AioRequest.cc
index 5cf9a11..dee6eba 100644
--- a/src/librbd/AioRequest.cc
+++ b/src/librbd/AioRequest.cc
@@ -85,8 +85,9 @@ namespace librbd {
     return true;
   }
 
-  int AioRead::send() {
-    ldout(m_ictx->cct, 20) << "send " << this << " " << m_oid << " " << m_object_off << "~" << m_object_len << dendl;
+  void AioRead::send() {
+    ldout(m_ictx->cct, 20) << "send " << this << " " << m_oid << " "
+                           << m_object_off << "~" << m_object_len << dendl;
 
     librados::AioCompletion *rados_completion =
       librados::Rados::aio_create_completion(this, rados_req_cb, NULL);
@@ -99,10 +100,11 @@ namespace librbd {
     } else {
       op.read(m_object_off, m_object_len, &m_read_data, NULL);
     }
+
     r = m_ioctx->aio_operate(m_oid, rados_completion, &op, flags, NULL);
+    assert(r == 0);
 
     rados_completion->release();
-    return r;
   }
 
   /** write **/
@@ -224,16 +226,17 @@ namespace librbd {
     return finished;
   }
 
-  int AbstractWrite::send() {
-    ldout(m_ictx->cct, 20) << "send " << this << " " << m_oid << " " << m_object_off << "~" << m_object_len << dendl;
+  void AbstractWrite::send() {
+    ldout(m_ictx->cct, 20) << "send " << this << " " << m_oid << " "
+                           << m_object_off << "~" << m_object_len << dendl;
     librados::AioCompletion *rados_completion =
       librados::Rados::aio_create_completion(this, NULL, rados_req_cb);
     int r;
     assert(m_write.size());
     r = m_ioctx->aio_operate(m_oid, rados_completion, &m_write,
 			     m_snap_seq, m_snaps);
+    assert(r == 0);
     rados_completion->release();
-    return r;
   }
 
   void AbstractWrite::send_copyup() {
diff --git a/src/librbd/AioRequest.h b/src/librbd/AioRequest.h
index d6103f9..882b535 100644
--- a/src/librbd/AioRequest.h
+++ b/src/librbd/AioRequest.h
@@ -43,7 +43,7 @@ namespace librbd {
     }
 
     virtual bool should_complete(int r) = 0;
-    virtual int send() = 0;
+    virtual void send() = 0;
 
   protected:
     void read_from_parent(vector<pair<uint64_t,uint64_t> >& image_extents);
@@ -73,7 +73,7 @@ namespace librbd {
     }
     virtual ~AioRead() {}
     virtual bool should_complete(int r);
-    virtual int send();
+    virtual void send();
 
     ceph::bufferlist &data() {
       return m_read_data;
@@ -100,7 +100,7 @@ namespace librbd {
 		  bool hide_enoent);
     virtual ~AbstractWrite() {}
     virtual bool should_complete(int r);
-    virtual int send();
+    virtual void send();
     void guard_write();
 
     bool has_parent() const {
diff --git a/src/librbd/ImageCtx.cc b/src/librbd/ImageCtx.cc
index 1295d42..ce403a7 100644
--- a/src/librbd/ImageCtx.cc
+++ b/src/librbd/ImageCtx.cc
@@ -6,6 +6,7 @@
 #include "common/dout.h"
 #include "common/errno.h"
 #include "common/perf_counters.h"
+#include "common/WorkQueue.h"
 
 #include "librbd/internal.h"
 #include "librbd/WatchCtx.h"
@@ -27,6 +28,23 @@ using librados::snap_t;
 using librados::IoCtx;
 
 namespace librbd {
+
+namespace {
+
+class ThreadPoolSingleton : public ThreadPool {
+public:
+  ThreadPoolSingleton(CephContext *cct)
+    : ThreadPool(cct, "librbd::thread_pool", cct->_conf->rbd_op_threads,
+                 "rbd_op_threads") {
+    start();
+  }
+  virtual ~ThreadPoolSingleton() {
+    stop();
+  }
+};
+
+} // anonymous namespace
+
   ImageCtx::ImageCtx(const string &image_name, const string &image_id,
 		     const char *snap, IoCtx& p, bool ro)
     : cct((CephContext*)p.cct()),
@@ -53,7 +71,7 @@ namespace librbd {
       id(image_id), parent(NULL),
       stripe_unit(0), stripe_count(0),
       object_cacher(NULL), writeback_handler(NULL), object_set(NULL),
-      pending_aio(0)
+      pending_aio(0), aio_work_queue(NULL)
   {
     md_ctx.dup(p);
     data_ctx.dup(p);
@@ -98,6 +116,13 @@ namespace librbd {
       object_set->return_enoent = true;
       object_cacher->start();
     }
+
+    ThreadPoolSingleton *thread_pool_singleton;
+    cct->lookup_or_create_singleton_object<ThreadPoolSingleton>(
+      thread_pool_singleton, "librbd::thread_pool");
+    aio_work_queue = new ContextWQ("librbd::aio_work_queue",
+                                   cct->_conf->rbd_op_thread_timeout,
+                                   thread_pool_singleton);
   }
 
   ImageCtx::~ImageCtx() {
@@ -115,6 +140,8 @@ namespace librbd {
       object_set = NULL;
     }
     delete[] format_string;
+
+    delete aio_work_queue;
   }
 
   int ImageCtx::init() {
@@ -189,10 +216,9 @@ namespace librbd {
     if (object_cacher) {
       uint64_t obj = cct->_conf->rbd_cache_max_dirty_object;
       if (!obj) {
-        obj = cct->_conf->rbd_cache_size / (1ull << order);
-        obj = obj * 4 + 10;
+        obj = MIN(2000, MAX(10, cct->_conf->rbd_cache_size / 100 / sizeof(ObjectCacher::Object)));
       }
-      ldout(cct, 10) << " cache bytes " << cct->_conf->rbd_cache_size << " order " << (int)order
+      ldout(cct, 10) << " cache bytes " << cct->_conf->rbd_cache_size
 		     << " -> about " << obj << " objects" << dendl;
       object_cacher->set_max_objects(obj);
     }
diff --git a/src/librbd/ImageCtx.h b/src/librbd/ImageCtx.h
index 5a0d637..406192b 100644
--- a/src/librbd/ImageCtx.h
+++ b/src/librbd/ImageCtx.h
@@ -26,6 +26,7 @@
 #include "librbd/parent_types.h"
 
 class CephContext;
+class ContextWQ;
 class PerfCounters;
 
 namespace librbd {
@@ -95,6 +96,8 @@ namespace librbd {
     Cond pending_aio_cond;
     uint64_t pending_aio;
 
+    ContextWQ *aio_work_queue;
+
     /**
      * Either image_name or image_id must be set.
      * If id is not known, pass the empty std::string,
diff --git a/src/librbd/internal.cc b/src/librbd/internal.cc
index 1456012..9ed7bd9 100644
--- a/src/librbd/internal.cc
+++ b/src/librbd/internal.cc
@@ -9,6 +9,7 @@
 #include "common/dout.h"
 #include "common/errno.h"
 #include "common/Throttle.h"
+#include "common/WorkQueue.h"
 #include "cls/lock/cls_lock_client.h"
 #include "include/stringify.h"
 
@@ -1420,7 +1421,6 @@ reprotect_and_return_err:
         close_image(ictx);
         return -EBUSY;
       }
-      assert(watchers.size() == 1);
 
       ictx->md_lock.get_read();
       trim_image(ictx, 0, prog_ctx);
@@ -2013,13 +2013,7 @@ reprotect_and_return_err:
 
       Context *ctx = new C_CopyWrite(m_throttle, m_bl);
       AioCompletion *comp = aio_create_completion_internal(ctx, rbd_ctx_cb);
-      r = aio_write(m_dest, m_offset, m_bl->length(), m_bl->c_str(), comp);
-      if (r < 0) {
-	ctx->complete(r);
-	comp->release();
-	lderr(m_dest->cct) << "error writing to destination image at offset "
-			   << m_offset << ": " << cpp_strerror(r) << dendl;
-      }
+      aio_write(m_dest, m_offset, m_bl->length(), m_bl->c_str(), comp);
     }
   private:
     SimpleThrottle *m_throttle;
@@ -2052,20 +2046,15 @@ reprotect_and_return_err:
     SimpleThrottle throttle(cct->_conf->rbd_concurrent_management_ops, false);
     uint64_t period = src->get_stripe_period();
     for (uint64_t offset = 0; offset < src_size; offset += period) {
+      if (throttle.pending_error()) {
+        return throttle.wait_for_ret();
+      }
+
       uint64_t len = min(period, src_size - offset);
       bufferlist *bl = new bufferlist();
       Context *ctx = new C_CopyRead(&throttle, dest, offset, bl);
       AioCompletion *comp = aio_create_completion_internal(ctx, rbd_ctx_cb);
-      r = aio_read(src, offset, len, NULL, bl, comp);
-      if (r < 0) {
-	ctx->complete(r);
-	comp->release();
-	throttle.wait_for_ret();
-	lderr(cct) << "could not read from source image from "
-		   << offset << " to " << offset + len << ": "
-		   << cpp_strerror(r) << dendl;
-	return r;
-      }
+      aio_read(src, offset, len, NULL, bl, comp);
       prog_ctx.update_progress(offset, src_size);
     }
 
@@ -2151,6 +2140,9 @@ reprotect_and_return_err:
   void close_image(ImageCtx *ictx)
   {
     ldout(ictx->cct, 20) << "close_image " << ictx << dendl;
+
+    ictx->aio_work_queue->drain();
+
     if (ictx->object_cacher) {
       ictx->shutdown_cache(); // implicitly flushes
     } else {
@@ -2221,6 +2213,10 @@ reprotect_and_return_err:
     SimpleThrottle throttle(cct->_conf->rbd_concurrent_management_ops, false);
 
     for (uint64_t ono = 0; ono < overlap_objects; ono++) {
+      if (throttle.pending_error()) {
+        return throttle.wait_for_ret();
+      }
+
       {
 	RWLock::RLocker l(ictx->parent_lock);
 	// stop early if the parent went away - it just means
@@ -2244,12 +2240,7 @@ reprotect_and_return_err:
       Context *comp = new C_SimpleThrottle(&throttle);
       AioWrite *req = new AioWrite(ictx, oid, ono, 0, objectx, object_overlap,
 				   bl, snapc, CEPH_NOSNAP, comp);
-      r = req->send();
-      if (r < 0) {
-	lderr(cct) << "failed to flatten object " << oid << dendl;
-	goto err;
-      }
-
+      req->send();
       prog_ctx.update_progress(ono, overlap_objects);
     }
 
@@ -2439,12 +2430,7 @@ reprotect_and_return_err:
 
       Context *ctx = new C_SafeCond(&mylock, &cond, &done, &ret);
       AioCompletion *c = aio_create_completion_internal(ctx, rbd_ctx_cb);
-      r = aio_read(ictx, off, read_len, NULL, &bl, c);
-      if (r < 0) {
-	c->release();
-	delete ctx;
-	return r;
-      }
+      aio_read(ictx, off, read_len, NULL, &bl, c);
 
       mylock.Lock();
       while (!done)
@@ -2674,12 +2660,7 @@ reprotect_and_return_err:
 
     Context *ctx = new C_SafeCond(&mylock, &cond, &done, &ret);
     AioCompletion *c = aio_create_completion_internal(ctx, rbd_ctx_cb);
-    int r = aio_read(ictx, image_extents, buf, pbl, c);
-    if (r < 0) {
-      c->release();
-      delete ctx;
-      return r;
-    }
+    aio_read(ictx, image_extents, buf, pbl, c);
 
     mylock.Lock();
     while (!done)
@@ -2708,12 +2689,7 @@ reprotect_and_return_err:
 
     Context *ctx = new C_SafeCond(&mylock, &cond, &done, &ret);
     AioCompletion *c = aio_create_completion_internal(ctx, rbd_ctx_cb);
-    r = aio_write(ictx, off, mylen, buf, c);
-    if (r < 0) {
-      c->release();
-      delete ctx;
-      return r;
-    }
+    aio_write(ictx, off, mylen, buf, c);
 
     mylock.Lock();
     while (!done)
@@ -2744,12 +2720,7 @@ reprotect_and_return_err:
 
     Context *ctx = new C_SafeCond(&mylock, &cond, &done, &ret);
     AioCompletion *c = aio_create_completion_internal(ctx, rbd_ctx_cb);
-    int r = aio_discard(ictx, off, len, c);
-    if (r < 0) {
-      c->release();
-      delete ctx;
-      return r;
-    }
+    aio_discard(ictx, off, len, c);
 
     mylock.Lock();
     while (!done)
@@ -2867,18 +2838,20 @@ reprotect_and_return_err:
     return 0;
   }
 
-  int aio_flush(ImageCtx *ictx, AioCompletion *c)
+  void aio_flush(ImageCtx *ictx, AioCompletion *c)
   {
     CephContext *cct = ictx->cct;
     ldout(cct, 20) << "aio_flush " << ictx << " completion " << c <<  dendl;
 
+    c->get();
     int r = ictx_check(ictx);
-    if (r < 0)
-      return r;
+    if (r < 0) {
+      c->fail(cct, r);
+      return;
+    }
 
     ictx->user_flushed();
 
-    c->get();
     c->add_request();
     c->init_time(ictx, AIO_TYPE_FLUSH);
     C_AioWrite *req_comp = new C_AioWrite(cct, c);
@@ -2893,8 +2866,6 @@ reprotect_and_return_err:
     c->finish_adding_requests(cct);
     c->put();
     ictx->perfcounter->inc(l_librbd_aio_flush);
-
-    return 0;
   }
 
   int flush(ImageCtx *ictx)
@@ -2942,21 +2913,26 @@ reprotect_and_return_err:
     return ictx->invalidate_cache();
   }
 
-  int aio_write(ImageCtx *ictx, uint64_t off, size_t len, const char *buf,
-		AioCompletion *c)
+  void aio_write(ImageCtx *ictx, uint64_t off, size_t len, const char *buf,
+		 AioCompletion *c)
   {
     CephContext *cct = ictx->cct;
     ldout(cct, 20) << "aio_write " << ictx << " off = " << off << " len = "
 		   << len << " buf = " << (void*)buf << dendl;
 
+    c->get();
     int r = ictx_check(ictx);
-    if (r < 0)
-      return r;
+    if (r < 0) {
+      c->fail(cct, r);
+      return;
+    }
 
     uint64_t mylen = len;
     r = clip_io(ictx, off, &mylen);
-    if (r < 0)
-      return r;
+    if (r < 0) {
+      c->fail(cct, r);
+      return;
+    }
 
     ictx->snap_lock.get_read();
     snapid_t snap_id = ictx->snap_id;
@@ -2967,8 +2943,10 @@ reprotect_and_return_err:
     ictx->parent_lock.put_read();
     ictx->snap_lock.put_read();
 
-    if (snap_id != CEPH_NOSNAP || ictx->read_only)
-      return -EROFS;
+    if (snap_id != CEPH_NOSNAP || ictx->read_only) {
+      c->fail(cct, -EROFS);
+      return;
+    }
 
     ldout(cct, 20) << "  parent overlap " << overlap << dendl;
 
@@ -2979,7 +2957,6 @@ reprotect_and_return_err:
 			       &ictx->layout, off, mylen, 0, extents);
     }
 
-    c->get();
     c->init_time(ictx, AIO_TYPE_WRITE);
     for (vector<ObjectExtent>::iterator p = extents.begin(); p != extents.end(); ++p) {
       ldout(cct, 20) << " oid " << p->oid << " " << p->offset << "~" << p->length
@@ -3008,35 +2985,35 @@ reprotect_and_return_err:
 				     objectx, object_overlap,
 				     bl, snapc, snap_id, req_comp);
 	c->add_request();
-	r = req->send();
-	if (r < 0)
-	  goto done;
+	req->send();
       }
     }
-  done:
+
     c->finish_adding_requests(ictx->cct);
     c->put();
 
     ictx->perfcounter->inc(l_librbd_aio_wr);
     ictx->perfcounter->inc(l_librbd_aio_wr_bytes, mylen);
-
-    /* FIXME: cleanup all the allocated stuff */
-    return r;
   }
 
-  int aio_discard(ImageCtx *ictx, uint64_t off, uint64_t len, AioCompletion *c)
+  void aio_discard(ImageCtx *ictx, uint64_t off, uint64_t len, AioCompletion *c)
   {
     CephContext *cct = ictx->cct;
     ldout(cct, 20) << "aio_discard " << ictx << " off = " << off << " len = "
 		   << len << dendl;
 
+    c->get();
     int r = ictx_check(ictx);
-    if (r < 0)
-      return r;
+    if (r < 0) {
+      c->fail(cct, r);
+      return;
+    }
 
     r = clip_io(ictx, off, &len);
-    if (r < 0)
-      return r;
+    if (r < 0) {
+      c->fail(cct, r);
+      return;
+    }
 
     // TODO: check for snap
     ictx->snap_lock.get_read();
@@ -3048,8 +3025,10 @@ reprotect_and_return_err:
     ictx->parent_lock.put_read();
     ictx->snap_lock.put_read();
 
-    if (snap_id != CEPH_NOSNAP || ictx->read_only)
-      return -EROFS;
+    if (snap_id != CEPH_NOSNAP || ictx->read_only) {
+      c->fail(cct, -EROFS);
+      return;
+    }
 
     // map
     vector<ObjectExtent> extents;
@@ -3058,7 +3037,6 @@ reprotect_and_return_err:
 			       &ictx->layout, off, len, 0, extents);
     }
 
-    c->get();
     c->init_time(ictx, AIO_TYPE_DISCARD);
     for (vector<ObjectExtent>::iterator p = extents.begin(); p != extents.end(); ++p) {
       ldout(cct, 20) << " oid " << p->oid << " " << p->offset << "~" << p->length
@@ -3089,12 +3067,10 @@ reprotect_and_return_err:
 			  snapc, snap_id, req_comp);
       }
 
-      r = req->send();
-      if (r < 0)
-	goto done;
+      req->send();
     }
+
     r = 0;
-  done:
     if (ictx->object_cacher) {
       Mutex::Locker l(ictx->cache_lock);
       ictx->object_cacher->discard_set(ictx->object_set, extents);
@@ -3105,9 +3081,6 @@ reprotect_and_return_err:
 
     ictx->perfcounter->inc(l_librbd_aio_discard);
     ictx->perfcounter->inc(l_librbd_aio_discard_bytes, len);
-
-    /* FIXME: cleanup all the allocated stuff */
-    return r;
   }
 
   void rbd_req_cb(completion_t cb, void *arg)
@@ -3117,23 +3090,27 @@ reprotect_and_return_err:
     req->complete(comp->get_return_value());
   }
 
-  int aio_read(ImageCtx *ictx, uint64_t off, size_t len,
+  void aio_read(ImageCtx *ictx, uint64_t off, size_t len,
 	       char *buf, bufferlist *bl,
 	       AioCompletion *c)
   {
     vector<pair<uint64_t,uint64_t> > image_extents(1);
     image_extents[0] = make_pair(off, len);
-    return aio_read(ictx, image_extents, buf, bl, c);
+    aio_read(ictx, image_extents, buf, bl, c);
   }
 
-  int aio_read(ImageCtx *ictx, const vector<pair<uint64_t,uint64_t> >& image_extents,
-	       char *buf, bufferlist *pbl, AioCompletion *c)
+  void aio_read(ImageCtx *ictx, const vector<pair<uint64_t,uint64_t> >& image_extents,
+	        char *buf, bufferlist *pbl, AioCompletion *c)
   {
-    ldout(ictx->cct, 20) << "aio_read " << ictx << " completion " << c << " " << image_extents << dendl;
+    CephContext *cct = ictx->cct;
+    ldout(cct, 20) << "aio_read " << ictx << " completion " << c << " " << image_extents << dendl;
 
+    c->get();
     int r = ictx_check(ictx);
-    if (r < 0)
-      return r;
+    if (r < 0) {
+      c->fail(cct, r);
+      return;
+    }
 
     ictx->snap_lock.get_read();
     snap_t snap_id = ictx->snap_id;
@@ -3148,8 +3125,10 @@ reprotect_and_return_err:
 	 ++p) {
       uint64_t len = p->second;
       r = clip_io(ictx, p->first, &len);
-      if (r < 0)
-	return r;
+      if (r < 0) {
+        c->fail(cct, r);
+	return;
+      }
       if (len == 0)
 	continue;
 
@@ -3158,13 +3137,10 @@ reprotect_and_return_err:
       buffer_ofs += len;
     }
 
-    int64_t ret;
-
     c->read_buf = buf;
     c->read_buf_len = buffer_ofs;
     c->read_bl = pbl;
 
-    c->get();
     c->init_time(ictx, AIO_TYPE_READ);
     for (map<object_t,vector<ObjectExtent> >::iterator p = object_extents.begin(); p != object_extents.end(); ++p) {
       for (vector<ObjectExtent>::iterator q = p->second.begin(); q != p->second.end(); ++q) {
@@ -3185,25 +3161,16 @@ reprotect_and_return_err:
 				    q->length, q->offset,
 				    cache_comp);
 	} else {
-	  r = req->send();
-	  if (r < 0 && r == -ENOENT)
-	    r = 0;
-	  if (r < 0) {
-	    ret = r;
-	    goto done;
-	  }
+	  req->send();
 	}
       }
     }
-    ret = buffer_ofs;
-  done:
+
     c->finish_adding_requests(ictx->cct);
     c->put();
 
     ictx->perfcounter->inc(l_librbd_aio_rd);
     ictx->perfcounter->inc(l_librbd_aio_rd_bytes, buffer_ofs);
-
-    return ret;
   }
 
   AioCompletion *aio_create_completion() {
diff --git a/src/librbd/internal.h b/src/librbd/internal.h
index 1e9fd9a..7712a39 100644
--- a/src/librbd/internal.h
+++ b/src/librbd/internal.h
@@ -179,14 +179,15 @@ namespace librbd {
 	       char *buf, bufferlist *pbl);
   ssize_t write(ImageCtx *ictx, uint64_t off, size_t len, const char *buf);
   int discard(ImageCtx *ictx, uint64_t off, uint64_t len);
-  int aio_write(ImageCtx *ictx, uint64_t off, size_t len, const char *buf,
-		AioCompletion *c);
-  int aio_discard(ImageCtx *ictx, uint64_t off, uint64_t len, AioCompletion *c);
-  int aio_read(ImageCtx *ictx, uint64_t off, size_t len,
-	       char *buf, bufferlist *pbl, AioCompletion *c);
-  int aio_read(ImageCtx *ictx, const vector<pair<uint64_t,uint64_t> >& image_extents,
-	       char *buf, bufferlist *pbl, AioCompletion *c);
-  int aio_flush(ImageCtx *ictx, AioCompletion *c);
+
+  void aio_write(ImageCtx *ictx, uint64_t off, size_t len, const char *buf,
+		 AioCompletion *c);
+  void aio_discard(ImageCtx *ictx, uint64_t off, uint64_t len, AioCompletion *c);
+  void aio_read(ImageCtx *ictx, uint64_t off, size_t len,
+	        char *buf, bufferlist *pbl, AioCompletion *c);
+  void aio_read(ImageCtx *ictx, const vector<pair<uint64_t,uint64_t> >& image_extents,
+	        char *buf, bufferlist *pbl, AioCompletion *c);
+  void aio_flush(ImageCtx *ictx, AioCompletion *c);
   int flush(ImageCtx *ictx);
   int _flush(ImageCtx *ictx);
   int invalidate_cache(ImageCtx *ictx);
diff --git a/src/librbd/librbd.cc b/src/librbd/librbd.cc
index 658f24b..244a5a0 100644
--- a/src/librbd/librbd.cc
+++ b/src/librbd/librbd.cc
@@ -20,6 +20,7 @@
 #include "common/errno.h"
 #include "common/snap_types.h"
 #include "common/perf_counters.h"
+#include "common/WorkQueue.h"
 #include "include/Context.h"
 #include "include/rbd/librbd.hpp"
 #include "osdc/ObjectCacher.h"
@@ -45,6 +46,117 @@ using ceph::bufferlist;
 using librados::snap_t;
 using librados::IoCtx;
 
+namespace {
+
+class C_AioReadWQ : public Context {
+public:
+  C_AioReadWQ(librbd::ImageCtx *ictx, uint64_t off, size_t len,
+              char *buf, bufferlist *pbl, librbd::AioCompletion *c)
+    : m_ictx(ictx), m_off(off), m_len(len), m_buf(buf), m_pbl(pbl), m_comp(c) {
+  }
+protected:
+  virtual void finish(int r) {
+    librbd::aio_read(m_ictx, m_off, m_len, m_buf, m_pbl, m_comp);
+  }
+private:
+  librbd::ImageCtx *m_ictx;
+  uint64_t m_off;
+  uint64_t m_len;
+  char *m_buf;
+  bufferlist *m_pbl;
+  librbd::AioCompletion *m_comp;
+};
+
+class C_AioWriteWQ : public Context {
+public:
+  C_AioWriteWQ(librbd::ImageCtx *ictx, uint64_t off, size_t len,
+               const char *buf, librbd::AioCompletion *c)
+    : m_ictx(ictx), m_off(off), m_len(len), m_buf(buf), m_comp(c) {
+  }
+protected:
+  virtual void finish(int r) {
+    librbd::aio_write(m_ictx, m_off, m_len, m_buf, m_comp);
+  }
+private:
+  librbd::ImageCtx *m_ictx;
+  uint64_t m_off;
+  uint64_t m_len;
+  const char *m_buf;
+  librbd::AioCompletion *m_comp;
+};
+
+class C_AioDiscardWQ : public Context {
+public:
+  C_AioDiscardWQ(librbd::ImageCtx *ictx, uint64_t off, uint64_t len,
+                 librbd::AioCompletion *c)
+    : m_ictx(ictx), m_off(off), m_len(len), m_comp(c) {
+  }
+protected:
+  virtual void finish(int r) {
+    librbd::aio_discard(m_ictx, m_off, m_len, m_comp);
+  }
+private:
+  librbd::ImageCtx *m_ictx;
+  uint64_t m_off;
+  uint64_t m_len;
+  librbd::AioCompletion *m_comp;
+};
+
+class C_AioFlushWQ : public Context {
+public:
+  C_AioFlushWQ(librbd::ImageCtx *ictx, librbd::AioCompletion *c)
+    : m_ictx(ictx), m_comp(c) {
+  }
+protected:
+  virtual void finish(int r) {
+    librbd::aio_flush(m_ictx, m_comp);
+  }
+private:
+  librbd::ImageCtx *m_ictx;
+  librbd::AioCompletion *m_comp;
+};
+
+void submit_aio_read(librbd::ImageCtx *ictx, uint64_t off, size_t len,
+                     char *buf, bufferlist *pbl, librbd::AioCompletion *c) {
+  if (ictx->cct->_conf->rbd_non_blocking_aio) {
+    ictx->aio_work_queue->queue(new C_AioReadWQ(ictx, off, len, buf, pbl, c));
+  } else {
+    librbd::aio_read(ictx, off, len, buf, pbl, c);
+  }
+}
+
+void submit_aio_write(librbd::ImageCtx *ictx, uint64_t off, size_t len,
+                      const char *buf, librbd::AioCompletion *c) {
+  if (ictx->cct->_conf->rbd_non_blocking_aio) {
+    ictx->aio_work_queue->queue(new C_AioWriteWQ(ictx, off, len, buf, c));
+  } else {
+    librbd::aio_write(ictx, off, len, buf, c);
+  }
+}
+
+void submit_aio_discard(librbd::ImageCtx *ictx, uint64_t off, uint64_t len,
+                        librbd::AioCompletion *c) {
+  if (ictx->cct->_conf->rbd_non_blocking_aio) {
+    ictx->aio_work_queue->queue(new C_AioDiscardWQ(ictx, off, len, c));
+  } else {
+    librbd::aio_discard(ictx, off, len, c);
+  }
+}
+
+void submit_aio_flush(librbd::ImageCtx *ictx, librbd::AioCompletion *c) {
+  if (ictx->cct->_conf->rbd_non_blocking_aio) {
+    ictx->aio_work_queue->queue(new C_AioFlushWQ(ictx, c));
+  } else {
+    librbd::aio_flush(ictx, c);
+  }
+}
+
+librbd::AioCompletion* get_aio_completion(librbd::RBD::AioCompletion *comp) {
+  return reinterpret_cast<librbd::AioCompletion *>(comp->pc);
+}
+
+} // anonymous namespace
+
 namespace librbd {
   ProgressContext::~ProgressContext()
   {
@@ -483,14 +595,15 @@ namespace librbd {
     ImageCtx *ictx = (ImageCtx *)ctx;
     if (bl.length() < len)
       return -EINVAL;
-    return librbd::aio_write(ictx, off, len, bl.c_str(),
-			     (librbd::AioCompletion *)c->pc);
+    submit_aio_write(ictx, off, len, bl.c_str(), get_aio_completion(c));
+    return 0;
   }
 
   int Image::aio_discard(uint64_t off, uint64_t len, RBD::AioCompletion *c)
   {
     ImageCtx *ictx = (ImageCtx *)ctx;
-    return librbd::aio_discard(ictx, off, len, (librbd::AioCompletion *)c->pc);
+    submit_aio_discard(ictx, off, len, get_aio_completion(c));
+    return 0;
   }
 
   int Image::aio_read(uint64_t off, size_t len, bufferlist& bl,
@@ -499,7 +612,8 @@ namespace librbd {
     ImageCtx *ictx = (ImageCtx *)ctx;
     ldout(ictx->cct, 10) << "Image::aio_read() buf=" << (void *)bl.c_str() << "~"
 			 << (void *)(bl.c_str() + len - 1) << dendl;
-    return librbd::aio_read(ictx, off, len, NULL, &bl, (librbd::AioCompletion *)c->pc);
+    submit_aio_read(ictx, off, len, NULL, &bl, get_aio_completion(c));
+    return 0;
   }
 
   int Image::flush()
@@ -511,7 +625,8 @@ namespace librbd {
   int Image::aio_flush(RBD::AioCompletion *c)
   {
     ImageCtx *ictx = (ImageCtx *)ctx;
-    return librbd::aio_flush(ictx, (librbd::AioCompletion *)c->pc);
+    submit_aio_flush(ictx, get_aio_completion(c));
+    return 0;
   }
 
   int Image::invalidate_cache()
@@ -1102,8 +1217,8 @@ extern "C" int rbd_aio_write(rbd_image_t image, uint64_t off, size_t len,
 {
   librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
   librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c;
-  return librbd::aio_write(ictx, off, len, buf,
-			   (librbd::AioCompletion *)comp->pc);
+  submit_aio_write(ictx, off, len, buf, get_aio_completion(comp));
+  return 0;
 }
 
 extern "C" int rbd_aio_discard(rbd_image_t image, uint64_t off, uint64_t len,
@@ -1111,7 +1226,8 @@ extern "C" int rbd_aio_discard(rbd_image_t image, uint64_t off, uint64_t len,
 {
   librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
   librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c;
-  return librbd::aio_discard(ictx, off, len, (librbd::AioCompletion *)comp->pc);
+  submit_aio_discard(ictx, off, len, get_aio_completion(comp));
+  return 0;
 }
 
 extern "C" int rbd_aio_read(rbd_image_t image, uint64_t off, size_t len,
@@ -1119,8 +1235,8 @@ extern "C" int rbd_aio_read(rbd_image_t image, uint64_t off, size_t len,
 {
   librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
   librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c;
-  return librbd::aio_read(ictx, off, len, buf, NULL,
-			  (librbd::AioCompletion *)comp->pc);
+  submit_aio_read(ictx, off, len, buf, NULL, get_aio_completion(comp));
+  return 0;
 }
 
 extern "C" int rbd_flush(rbd_image_t image)
@@ -1133,7 +1249,8 @@ extern "C" int rbd_aio_flush(rbd_image_t image, rbd_completion_t c)
 {
   librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
   librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c;
-  return librbd::aio_flush(ictx, (librbd::AioCompletion *)comp->pc);
+  submit_aio_flush(ictx, get_aio_completion(comp));
+  return 0;
 }
 
 extern "C" int rbd_invalidate_cache(rbd_image_t image)
diff --git a/src/log/Log.cc b/src/log/Log.cc
index 37bb4ef..392b2c0 100644
--- a/src/log/Log.cc
+++ b/src/log/Log.cc
@@ -112,6 +112,7 @@ void Log::set_log_file(string fn)
 
 void Log::reopen_log_file()
 {
+  pthread_mutex_lock(&m_flush_mutex);
   if (m_fd >= 0)
     VOID_TEMP_FAILURE_RETRY(::close(m_fd));
   if (m_log_file.length()) {
@@ -119,6 +120,7 @@ void Log::reopen_log_file()
   } else {
     m_fd = -1;
   }
+  pthread_mutex_unlock(&m_flush_mutex);
 }
 
 void Log::set_syslog_level(int log, int crash)
diff --git a/src/mds/MDSUtility.cc b/src/mds/MDSUtility.cc
index 09be280..65cf5c6 100644
--- a/src/mds/MDSUtility.cc
+++ b/src/mds/MDSUtility.cc
@@ -25,7 +25,7 @@ MDSUtility::MDSUtility() :
   waiting_for_mds_map(NULL)
 {
   monc = new MonClient(g_ceph_context);
-  messenger = Messenger::create(g_ceph_context, entity_name_t::CLIENT(), "mds", getpid());
+  messenger = Messenger::create_client_messenger(g_ceph_context, "mds");
   mdsmap = new MDSMap();
   osdmap = new OSDMap();
   objecter = new Objecter(g_ceph_context, messenger, monc, osdmap, lock, timer, 0, 0);
diff --git a/src/messages/MOSDBoot.h b/src/messages/MOSDBoot.h
index bfe7775..e59b03c 100644
--- a/src/messages/MOSDBoot.h
+++ b/src/messages/MOSDBoot.h
@@ -22,7 +22,7 @@
 
 class MOSDBoot : public PaxosServiceMessage {
 
-  static const int HEAD_VERSION = 5;
+  static const int HEAD_VERSION = 6;
   static const int COMPAT_VERSION = 2;
 
  public:
@@ -31,21 +31,24 @@ class MOSDBoot : public PaxosServiceMessage {
   entity_addr_t cluster_addr;
   epoch_t boot_epoch;  // last epoch this daemon was added to the map (if any)
   map<string,string> metadata; ///< misc metadata about this osd
+  uint64_t osd_features;
 
   MOSDBoot()
     : PaxosServiceMessage(MSG_OSD_BOOT, 0, HEAD_VERSION, COMPAT_VERSION),
-      boot_epoch(0)
+      boot_epoch(0), osd_features(0)
   { }
   MOSDBoot(OSDSuperblock& s, epoch_t be,
 	   const entity_addr_t& hb_back_addr_ref,
 	   const entity_addr_t& hb_front_addr_ref,
-           const entity_addr_t& cluster_addr_ref)
+           const entity_addr_t& cluster_addr_ref,
+	   uint64_t feat)
     : PaxosServiceMessage(MSG_OSD_BOOT, s.current_epoch, HEAD_VERSION, COMPAT_VERSION),
       sb(s),
       hb_back_addr(hb_back_addr_ref),
       hb_front_addr(hb_front_addr_ref),
       cluster_addr(cluster_addr_ref),
-      boot_epoch(be)
+      boot_epoch(be),
+      osd_features(feat)
   { }
   
 private:
@@ -54,7 +57,9 @@ private:
 public:
   const char *get_type_name() const { return "osd_boot"; }
   void print(ostream& out) const {
-    out << "osd_boot(osd." << sb.whoami << " booted " << boot_epoch << " v" << version << ")";
+    out << "osd_boot(osd." << sb.whoami << " booted " << boot_epoch
+	<< " features " << osd_features
+	<< " v" << version << ")";
   }
   
   void encode_payload(uint64_t features) {
@@ -65,6 +70,7 @@ public:
     ::encode(boot_epoch, payload);
     ::encode(hb_front_addr, payload);
     ::encode(metadata, payload);
+    ::encode(osd_features, payload);
   }
   void decode_payload() {
     bufferlist::iterator p = payload.begin();
@@ -79,6 +85,10 @@ public:
       ::decode(hb_front_addr, p);
     if (header.version >= 5)
       ::decode(metadata, p);
+    if (header.version >= 6)
+      ::decode(osd_features, p);
+    else
+      osd_features = 0;
   }
 };
 
diff --git a/src/mon/AuthMonitor.cc b/src/mon/AuthMonitor.cc
index 359c2e1..9e074e2 100644
--- a/src/mon/AuthMonitor.cc
+++ b/src/mon/AuthMonitor.cc
@@ -339,7 +339,8 @@ uint64_t AuthMonitor::assign_global_id(MAuth *m, bool should_increase_max)
 
   // bump the max?
   while (mon->is_leader() &&
-	 next_global_id >= max_global_id - g_conf->mon_globalid_prealloc / 2) {
+	 (max_global_id < g_conf->mon_globalid_prealloc ||
+	  next_global_id >= max_global_id - g_conf->mon_globalid_prealloc / 2)) {
     increase_max_global_id();
   }
 
diff --git a/src/mon/Elector.cc b/src/mon/Elector.cc
index 2ee10f7..debe58c 100644
--- a/src/mon/Elector.cc
+++ b/src/mon/Elector.cc
@@ -78,8 +78,15 @@ void Elector::start()
   init();
   
   // start by trying to elect me
-  if (epoch % 2 == 0) 
+  if (epoch % 2 == 0) {
     bump_epoch(epoch+1);  // odd == election cycle
+  } else {
+    // do a trivial db write just to ensure it is writeable.
+    MonitorDBStore::Transaction t;
+    t.put(Monitor::MONITOR_NAME, "election_writeable_test", rand());
+    int r = mon->store->apply_transaction(t);
+    assert(r >= 0);
+  }
   start_stamp = ceph_clock_now(g_ceph_context);
   electing_me = true;
   acked_me[mon->rank] = CEPH_FEATURES_ALL;
diff --git a/src/mon/MDSMonitor.cc b/src/mon/MDSMonitor.cc
index 12755b7..b05b8e8 100644
--- a/src/mon/MDSMonitor.cc
+++ b/src/mon/MDSMonitor.cc
@@ -208,17 +208,16 @@ bool MDSMonitor::preprocess_beacon(MMDSBeacon *m)
 
   // check privileges, ignore if fails
   MonSession *session = m->get_session();
-  if (!session)
-    goto out;
+  assert(session);
   if (!session->is_capable("mds", MON_CAP_X)) {
     dout(0) << "preprocess_beacon got MMDSBeacon from entity with insufficient privileges "
 	    << session->caps << dendl;
-    goto out;
+    goto ignore;
   }
 
   if (m->get_fsid() != mon->monmap->fsid) {
     dout(0) << "preprocess_beacon on fsid " << m->get_fsid() << " != " << mon->monmap->fsid << dendl;
-    goto out;
+    goto ignore;
   }
 
   dout(12) << "preprocess_beacon " << *m
@@ -229,13 +228,13 @@ bool MDSMonitor::preprocess_beacon(MMDSBeacon *m)
   // make sure the address has a port
   if (m->get_orig_source_addr().get_port() == 0) {
     dout(1) << " ignoring boot message without a port" << dendl;
-    goto out;
+    goto ignore;
   }
 
   // check compat
   if (!m->get_compat().writeable(mdsmap.compat)) {
     dout(1) << " mds " << m->get_source_inst() << " can't write to mdsmap " << mdsmap.compat << dendl;
-    goto out;
+    goto ignore;
   }
 
   // fw to leader?
@@ -244,7 +243,7 @@ bool MDSMonitor::preprocess_beacon(MMDSBeacon *m)
 
   if (pending_mdsmap.test_flag(CEPH_MDSMAP_DOWN)) {
     dout(7) << " mdsmap DOWN flag set, ignoring mds " << m->get_source_inst() << " beacon" << dendl;
-    goto out;
+    goto ignore;
   }
 
   // booted, but not in map?
@@ -252,7 +251,8 @@ bool MDSMonitor::preprocess_beacon(MMDSBeacon *m)
     if (state != MDSMap::STATE_BOOT) {
       dout(7) << "mds_beacon " << *m << " is not in mdsmap" << dendl;
       mon->send_reply(m, new MMDSMap(mon->monmap->fsid, &mdsmap));
-      goto out;
+      m->put();
+      return true;
     } else {
       return false;  // not booted yet.
     }
@@ -262,13 +262,13 @@ bool MDSMonitor::preprocess_beacon(MMDSBeacon *m)
   // old seq?
   if (info.state_seq > seq) {
     dout(7) << "mds_beacon " << *m << " has old seq, ignoring" << dendl;
-    goto out;
+    goto ignore;
   }
 
   if (mdsmap.get_epoch() != m->get_last_epoch_seen()) {
     dout(10) << "mds_beacon " << *m
 	     << " ignoring requested state, because mds hasn't seen latest map" << dendl;
-    goto ignore;
+    goto reply;
   }
 
   if (info.laggy()) {
@@ -277,17 +277,17 @@ bool MDSMonitor::preprocess_beacon(MMDSBeacon *m)
   }
   if (state == MDSMap::STATE_BOOT) {
     // ignore, already booted.
-    goto out;
+    goto ignore;
   }
   // is there a state change here?
-  if (info.state != state) {    
+  if (info.state != state) {
     // legal state change?
     if ((info.state == MDSMap::STATE_STANDBY ||
 	 info.state == MDSMap::STATE_STANDBY_REPLAY ||
 	 info.state == MDSMap::STATE_ONESHOT_REPLAY) && state > 0) {
       dout(10) << "mds_beacon mds can't activate itself (" << ceph_mds_state_name(info.state)
 	       << " -> " << ceph_mds_state_name(state) << ")" << dendl;
-      goto ignore;
+      goto reply;
     }
     
     if (info.state == MDSMap::STATE_STANDBY &&
@@ -299,21 +299,24 @@ bool MDSMonitor::preprocess_beacon(MMDSBeacon *m)
       dout(10) << "mds_beacon can't standby-replay mds." << m->get_standby_for_rank() << " at this time (cluster degraded, or mds not active)" << dendl;
       dout(10) << "pending_mdsmap.is_degraded()==" << pending_mdsmap.is_degraded()
           << " rank state: " << ceph_mds_state_name(pending_mdsmap.get_state(m->get_standby_for_rank())) << dendl;
-      goto ignore;
+      goto reply;
     }
     _note_beacon(m);
     return false;  // need to update map
   }
 
- ignore:
+ reply:
   // note time and reply
   _note_beacon(m);
   mon->send_reply(m,
 		  new MMDSBeacon(mon->monmap->fsid, m->get_global_id(), m->get_name(),
 				 mdsmap.get_epoch(), state, seq));
-  
-  // done
- out:
+  m->put();
+  return true;
+
+ ignore:
+  // I won't reply this beacon, drop it.
+  mon->no_reply(m);
   m->put();
   return true;
 }
@@ -525,8 +528,14 @@ void MDSMonitor::_updated(MMDSBeacon *m)
   if (m->get_state() == MDSMap::STATE_STOPPED) {
     // send the map manually (they're out of the map, so they won't get it automatic)
     mon->send_reply(m, new MMDSMap(mon->monmap->fsid, &mdsmap));
+  } else {
+    mon->send_reply(m, new MMDSBeacon(mon->monmap->fsid,
+				      m->get_global_id(),
+				      m->get_name(),
+				      mdsmap.get_epoch(),
+				      m->get_state(),
+				      m->get_seq()));
   }
-
   m->put();
 }
 
diff --git a/src/mon/MDSMonitor.h b/src/mon/MDSMonitor.h
index 901e93e..230f0ac 100644
--- a/src/mon/MDSMonitor.h
+++ b/src/mon/MDSMonitor.h
@@ -56,10 +56,12 @@ class MDSMonitor : public PaxosService {
     void finish(int r) {
       if (r >= 0)
 	mm->_updated(m);   // success
-      else if (r == -ECANCELED)
+      else if (r == -ECANCELED) {
+	mm->mon->no_reply(m);
 	m->put();
-      else
+      } else {
 	mm->dispatch((PaxosServiceMessage*)m);        // try again
+      }
     }
   };
 
diff --git a/src/mon/MonClient.cc b/src/mon/MonClient.cc
index 3a6dda4..0272dfd 100644
--- a/src/mon/MonClient.cc
+++ b/src/mon/MonClient.cc
@@ -12,7 +12,7 @@
  * 
  */
 
-#include "msg/SimpleMessenger.h"
+#include "msg/Messenger.h"
 #include "messages/MMonGetMap.h"
 #include "messages/MMonGetVersion.h"
 #include "messages/MMonGetVersionReply.h"
@@ -114,11 +114,9 @@ int MonClient::get_monmap_privately()
   Mutex::Locker l(monc_lock);
 
   bool temp_msgr = false;
-  SimpleMessenger* smessenger = NULL;
+  Messenger* smessenger = NULL;
   if (!messenger) {
-    messenger = smessenger = new SimpleMessenger(cct,
-                                                 entity_name_t::CLIENT(-1),
-                                                 "temp_mon_client", getpid());
+    messenger = smessenger = Messenger::create_client_messenger(cct, "temp_mon_client");
     messenger->add_dispatcher_head(this);
     smessenger->start();
     temp_msgr = true;
@@ -213,9 +211,7 @@ int MonClient::ping_monitor(const string &mon_id, string *result_reply)
 
   MonClientPinger *pinger = new MonClientPinger(cct, result_reply);
 
-  Messenger *smsgr = new SimpleMessenger(cct,
-                                         entity_name_t::CLIENT(-1),
-                                         "temp_ping_client", getpid());
+  Messenger *smsgr = Messenger::create_client_messenger(cct, "temp_ping_client");
   smsgr->add_dispatcher_head(pinger);
   smsgr->start();
 
diff --git a/src/mon/MonCommands.h b/src/mon/MonCommands.h
index 461b3f2..d0908cb 100644
--- a/src/mon/MonCommands.h
+++ b/src/mon/MonCommands.h
@@ -564,11 +564,11 @@ COMMAND("osd pool rename " \
 	"rename <srcpool> to <destpool>", "osd", "rw", "cli,rest")
 COMMAND("osd pool get " \
 	"name=pool,type=CephPoolname " \
-	"name=var,type=CephChoices,strings=size|min_size|crash_replay_interval|pg_num|pgp_num|crush_ruleset|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|auid|target_max_objects|target_max_bytes|cache_target_dirty_ratio|cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|erasure_code_profile", \
+	"name=var,type=CephChoices,strings=size|min_size|crash_replay_interval|pg_num|pgp_num|crush_ruleset|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|auid|target_max_objects|target_max_bytes|cache_target_dirty_ratio|cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|erasure_code_profile|min_read_recency_for_promote", \
 	"get pool parameter <var>", "osd", "r", "cli,rest")
 COMMAND("osd pool set " \
 	"name=pool,type=CephPoolname " \
-	"name=var,type=CephChoices,strings=size|min_size|crash_replay_interval|pg_num|pgp_num|crush_ruleset|hashpspool|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|debug_fake_ec_pool|target_max_bytes|target_max_objects|cache_target_dirty_ratio|cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|auid " \
+	"name=var,type=CephChoices,strings=size|min_size|crash_replay_interval|pg_num|pgp_num|crush_ruleset|hashpspool|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|debug_fake_ec_pool|target_max_bytes|target_max_objects|cache_target_dirty_ratio|cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|auid|min_read_recency_for_promote " \
 	"name=val,type=CephString " \
 	"name=force,type=CephChoices,strings=--yes-i-really-mean-it,req=false", \
 	"set pool parameter <var> to <val>", "osd", "rw", "cli,rest")
diff --git a/src/mon/Monitor.cc b/src/mon/Monitor.cc
index 2240e41..339fd3e 100644
--- a/src/mon/Monitor.cc
+++ b/src/mon/Monitor.cc
@@ -642,7 +642,7 @@ void Monitor::refresh_from_paxos(bool *need_bootstrap)
     paxos_service[i]->refresh(need_bootstrap);
   }
   for (int i = 0; i < PAXOS_NUM; ++i) {
-    paxos_service[i]->post_paxos_update();
+    paxos_service[i]->post_refresh();
   }
 }
 
@@ -1925,30 +1925,6 @@ void Monitor::get_health(string& status, bufferlist *detailbl, Formatter *f)
 
   health_monitor->get_health(f, summary, (detailbl ? &detail : NULL));
 
-  if (f)
-    f->open_array_section("summary");
-  stringstream ss;
-  health_status_t overall = HEALTH_OK;
-  if (!summary.empty()) {
-    ss << ' ';
-    while (!summary.empty()) {
-      if (overall > summary.front().first)
-	overall = summary.front().first;
-      ss << summary.front().second;
-      if (f) {
-        f->open_object_section("item");
-        f->dump_stream("severity") <<  summary.front().first;
-        f->dump_string("summary", summary.front().second);
-        f->close_section();
-      }
-      summary.pop_front();
-      if (!summary.empty())
-	ss << "; ";
-    }
-  }
-  if (f)
-    f->close_section();
-
   if (f) {
     f->open_object_section("timechecks");
     f->dump_int("epoch", get_epoch());
@@ -1957,6 +1933,8 @@ void Monitor::get_health(string& status, bufferlist *detailbl, Formatter *f)
       << ((timecheck_round%2) ? "on-going" : "finished");
   }
 
+  stringstream ss;
+  health_status_t overall = HEALTH_OK;
   if (!timecheck_skews.empty()) {
     list<string> warns;
     if (f)
@@ -2003,6 +1981,7 @@ void Monitor::get_health(string& status, bufferlist *detailbl, Formatter *f)
         if (!warns.empty())
           ss << ",";
       }
+      summary.push_back(make_pair(HEALTH_WARN, "Monitor clock skew detected "));
     }
     if (f)
       f->close_section();
@@ -2010,6 +1989,28 @@ void Monitor::get_health(string& status, bufferlist *detailbl, Formatter *f)
   if (f)
     f->close_section();
 
+  if (f)
+    f->open_array_section("summary");
+  if (!summary.empty()) {
+    ss << ' ';
+    while (!summary.empty()) {
+      if (overall > summary.front().first)
+	overall = summary.front().first;
+      ss << summary.front().second;
+      if (f) {
+        f->open_object_section("item");
+        f->dump_stream("severity") <<  summary.front().first;
+        f->dump_string("summary", summary.front().second);
+        f->close_section();
+      }
+      summary.pop_front();
+      if (!summary.empty())
+        ss << "; ";
+    }
+  }
+  if (f)
+    f->close_section();
+
   stringstream fss;
   fss << overall;
   status = fss.str() + ss.str();
@@ -2577,7 +2578,8 @@ void Monitor::forward_request_leader(PaxosServiceMessage *req)
     routed_requests[rr->tid] = rr;
     session->routed_request_tids.insert(rr->tid);
     
-    dout(10) << "forward_request " << rr->tid << " request " << *req << dendl;
+    dout(10) << "forward_request " << rr->tid << " request " << *req
+	     << " features " << rr->con_features << dendl;
 
     MForward *forward = new MForward(rr->tid, req,
 				     rr->con_features,
@@ -2934,12 +2936,11 @@ bool Monitor::_ms_dispatch(Message *m)
     dout(20) << "ms_dispatch existing session " << s << " for " << s->inst << dendl;
   }
 
-  if (s) {
-    if (s->auth_handler) {
-      s->entity_name = s->auth_handler->get_entity_name();
-    }
-    dout(20) << " caps " << s->caps.get_str() << dendl;
+  assert(s);
+  if (s->auth_handler) {
+    s->entity_name = s->auth_handler->get_entity_name();
   }
+  dout(20) << " caps " << s->caps.get_str() << dendl;
 
   if (is_synchronizing() && !src_is_mon) {
     waitlist_or_zap_client(m);
@@ -2947,11 +2948,7 @@ bool Monitor::_ms_dispatch(Message *m)
   }
 
   ret = dispatch(s, m, src_is_mon);
-
-  if (s) {
-    s->put();
-  }
-
+  s->put();
   return ret;
 }
 
diff --git a/src/mon/MonitorDBStore.h b/src/mon/MonitorDBStore.h
index 1576db7..f16770c 100644
--- a/src/mon/MonitorDBStore.h
+++ b/src/mon/MonitorDBStore.h
@@ -283,6 +283,8 @@ class MonitorDBStore
 	  db->compact_range_async(compact.front().first, compact.front().second.first, compact.front().second.second);
 	compact.pop_front();
       }
+    } else {
+      assert(0 == "failed to write to db");
     }
     return r;
   }
@@ -390,11 +392,15 @@ class MonitorDBStore
 
     virtual pair<string,string> get_next_key() {
       assert(iter->valid());
-      pair<string,string> r = iter->raw_key();
-      do {
-	iter->next();
-      } while (iter->valid() && sync_prefixes.count(iter->raw_key().first) == 0);
-      return r;
+
+      for (; iter->valid(); iter->next()) {
+        pair<string,string> r = iter->raw_key();
+        if (sync_prefixes.count(r.first) > 0) {
+          iter->next();
+          return r;
+        }
+      }
+      return pair<string,string>();
     }
 
     virtual bool _is_valid() {
@@ -506,7 +512,8 @@ class MonitorDBStore
     for (iter = prefixes.begin(); iter != prefixes.end(); ++iter) {
       dbt->rmkeys_by_prefix((*iter));
     }
-    db->submit_transaction_sync(dbt);
+    int r = db->submit_transaction_sync(dbt);
+    assert(r >= 0);
   }
 
   void init_options() {
diff --git a/src/mon/MonitorStore.cc b/src/mon/MonitorStore.cc
index db21a94..afaddab 100644
--- a/src/mon/MonitorStore.cc
+++ b/src/mon/MonitorStore.cc
@@ -437,7 +437,13 @@ void MonitorStore::put_bl_sn_map(const char *a,
     derr << "failed to open " << dir << ": " << cpp_strerror(err) << dendl;
     assert(0 == "failed to open temp file");
   }
-  sync_filesystem(dirfd);
+
+  err = sync_filesystem(dirfd);
+  if (err < 0) {
+    derr << "sync_filesystem error " << cpp_strerror(err) << dendl;
+    assert(0 == "failed to sync_filesystem");
+  }
+
   close_err = TEMP_FAILURE_RETRY(::close(dirfd));
   assert (0 == close_err);
     
@@ -481,7 +487,13 @@ void MonitorStore::sync()
 	 << ": " << cpp_strerror(err) << dendl;
     assert(0 == "failed to open dir for syncing");
   }
-  sync_filesystem(dirfd);
+
+  int ret = sync_filesystem(dirfd);
+  if (ret < 0) {
+    derr << __func__ << " sync_filesystem error " << cpp_strerror(ret) << dendl;
+    assert(0 == "failed to sync_filesystem");
+  }
+
   int close_err = TEMP_FAILURE_RETRY(::close(dirfd));
   assert (0 == close_err);
 }
diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc
index 510b727..599de08 100644
--- a/src/mon/OSDMonitor.cc
+++ b/src/mon/OSDMonitor.cc
@@ -66,6 +66,12 @@ static ostream& _prefix(std::ostream *_dout, Monitor *mon, OSDMap& osdmap) {
 		<< ").osd e" << osdmap.get_epoch() << " ";
 }
 
+OSDMonitor::OSDMonitor(Monitor *mn, Paxos *p, string service_name)
+  : PaxosService(mn, p, service_name),
+    inc_osd_cache(g_conf->mon_osd_cache_size),
+    full_osd_cache(g_conf->mon_osd_cache_size),
+    thrash_map(0), thrash_last_up_osd(-1) { }
+
 bool OSDMonitor::_have_pending_crush()
 {
   return pending_inc.crush.length();
@@ -1349,8 +1355,13 @@ bool OSDMonitor::prepare_boot(MOSDBoot *m)
 	xi.laggy_probability * (1.0 - g_conf->mon_osd_laggy_weight);
       dout(10) << " laggy, now xi " << xi << dendl;
     }
+
     // set features shared by the osd
-    xi.features = m->get_connection()->get_features();
+    if (m->osd_features)
+      xi.features = m->osd_features;
+    else
+      xi.features = m->get_connection()->get_features();
+
     pending_inc.new_xinfo[from] = xi;
 
     // wait
@@ -1765,6 +1776,29 @@ void OSDMonitor::send_incremental(epoch_t first, entity_inst_t& dest, bool oneti
   }
 }
 
+int OSDMonitor::get_version(version_t ver, bufferlist& bl)
+{
+    if (inc_osd_cache.lookup(ver, &bl)) {
+      return 0;
+    }
+    int ret = PaxosService::get_version(ver, bl);
+    if (!ret) {
+      inc_osd_cache.add(ver, bl);
+    }
+    return ret;
+}
+
+int OSDMonitor::get_version_full(version_t ver, bufferlist& bl)
+{
+    if (full_osd_cache.lookup(ver, &bl)) {
+      return 0;
+    }
+    int ret = PaxosService::get_version_full(ver, bl);
+    if (!ret) {
+      full_osd_cache.add(ver, bl);
+    }
+    return ret;
+}
 
 
 
@@ -2476,8 +2510,6 @@ bool OSDMonitor::preprocess_command(MMonCommand *m)
   } else if (prefix == "osd crush get-tunable") {
     string tunable;
     cmd_getval(g_ceph_context, cmdmap, "tunable", tunable);
-    int value;
-    cmd_getval(g_ceph_context, cmdmap, "value", value);
     ostringstream rss;
     if (f)
       f->open_object_section("tunable");
@@ -2587,6 +2619,8 @@ bool OSDMonitor::preprocess_command(MMonCommand *m)
         f->dump_unsigned("cache_min_evict_age", p->cache_min_evict_age);
       } else if (var == "erasure_code_profile") {
        f->dump_string("erasure_code_profile", p->erasure_code_profile);
+      } else if (var == "min_read_recency_for_promote") {
+	f->dump_int("min_read_recency_for_promote", p->min_read_recency_for_promote);
       }
 
       f->close_section();
@@ -2636,6 +2670,8 @@ bool OSDMonitor::preprocess_command(MMonCommand *m)
         ss << "cache_min_evict_age: " << p->cache_min_evict_age;
       } else if (var == "erasure_code_profile") {
        ss << "erasure_code_profile: " << p->erasure_code_profile;
+      } else if (var == "min_read_recency_for_promote") {
+	ss << "min_read_recency_for_promote: " << p->min_read_recency_for_promote;
       }
 
       rdata.append(ss);
@@ -3039,7 +3075,7 @@ void OSDMonitor::get_pools_health(
       } else if (warn_threshold > 0 &&
 		 sum.num_bytes >= pool.quota_max_bytes*warn_threshold) {
         ss << "pool '" << pool_name
-           << "' has " << si_t(sum.num_bytes) << " objects"
+           << "' has " << si_t(sum.num_bytes) << " bytes"
            << " (max " << si_t(pool.quota_max_bytes) << ")";
         status = HEALTH_WARN;
       }
@@ -3752,6 +3788,12 @@ int OSDMonitor::prepare_command_pool_set(map<string,cmd_vartype> &cmdmap,
       return -EINVAL;
     }
     p.cache_min_evict_age = n;
+  } else if (var == "min_read_recency_for_promote") {
+    if (interr.length()) {
+      ss << "error parsing integer value '" << val << "': " << interr;
+      return -EINVAL;
+    }
+    p.min_read_recency_for_promote = n;
   } else {
     ss << "unrecognized variable '" << var << "'";
     return -EINVAL;
@@ -4834,7 +4876,7 @@ bool OSDMonitor::prepare_command_impl(MMonCommand *m,
     }
     if (osdmap.exists(id)) {
       pending_inc.new_weight[id] = ww;
-      ss << "reweighted osd." << id << " to " << w << " (" << ios::hex << ww << ios::dec << ")";
+      ss << "reweighted osd." << id << " to " << w << " (" << std::hex << ww << std::dec << ")";
       getline(ss, rs);
       wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs,
 						get_last_committed() + 1));
@@ -5310,6 +5352,17 @@ done:
       err = -ENOTEMPTY;
       goto reply;
     }
+    if (tp->ec_pool()) {
+      ss << "tier pool '" << tierpoolstr
+	 << "' is an ec pool, which cannot be a tier";
+      err = -ENOTSUP;
+      goto reply;
+    }
+    if (!tp->removed_snaps.empty() || !tp->snaps.empty()) {
+      ss << "tier pool '" << tierpoolstr << "' has snapshot state; it cannot be added as a tier without breaking the pool";
+      err = -ENOTEMPTY;
+      goto reply;
+    }
     // go
     pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
     pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
@@ -5637,6 +5690,7 @@ done:
     ntp->cache_mode = mode;
     ntp->hit_set_count = g_conf->osd_tier_default_cache_hit_set_count;
     ntp->hit_set_period = g_conf->osd_tier_default_cache_hit_set_period;
+    ntp->min_read_recency_for_promote = g_conf->osd_tier_default_cache_min_read_recency_for_promote;
     ntp->hit_set_params = hsp;
     ntp->target_max_bytes = size;
     ss << "pool '" << tierpoolstr << "' is now (or already was) a cache tier of '" << poolstr << "'";
@@ -5990,6 +6044,12 @@ int OSDMonitor::_check_remove_pool(int64_t pool, const pg_pool_t *p,
     }
     return -EBUSY;
   }
+
+  if (!g_conf->mon_allow_pool_delete) {
+    *ss << "pool deletion is disabled; you must first set the mon_allow_pool_delete config option to true before you can destroy a pool";
+    return -EPERM;
+  }
+
   *ss << "pool '" << poolstr << "' removed";
   return 0;
 }
diff --git a/src/mon/OSDMonitor.h b/src/mon/OSDMonitor.h
index fbce5fe..6428820 100644
--- a/src/mon/OSDMonitor.h
+++ b/src/mon/OSDMonitor.h
@@ -26,6 +26,7 @@
 using namespace std;
 
 #include "include/types.h"
+#include "common/simple_cache.hpp"
 #include "msg/Messenger.h"
 
 #include "osd/OSDMap.h"
@@ -139,6 +140,8 @@ private:
    * optimization to try to avoid sending the same inc maps twice.
    */
   map<int,epoch_t> osd_epoch;
+  SimpleLRU<version_t, bufferlist> inc_osd_cache;
+  SimpleLRU<version_t, bufferlist> full_osd_cache;
 
   void check_failures(utime_t now);
   bool check_failure(utime_t now, int target_osd, failure_info_t& fi);
@@ -358,9 +361,7 @@ private:
   bool prepare_remove_snaps(struct MRemoveSnaps *m);
 
  public:
-  OSDMonitor(Monitor *mn, Paxos *p, string service_name)
-  : PaxosService(mn, p, service_name),
-    thrash_map(0), thrash_last_up_osd(-1) { }
+  OSDMonitor(Monitor *mn, Paxos *p, string service_name);
 
   void tick();  // check state, take actions
 
@@ -384,6 +385,9 @@ private:
     send_incremental(m, start);
   }
 
+  int get_version(version_t ver, bufferlist& bl);
+  int get_version_full(version_t ver, bufferlist& bl);
+
   epoch_t blacklist(const entity_addr_t& a, utime_t until);
 
   void dump_info(Formatter *f);
diff --git a/src/mon/PGMonitor.cc b/src/mon/PGMonitor.cc
index 3e0523b..c85d55b 100644
--- a/src/mon/PGMonitor.cc
+++ b/src/mon/PGMonitor.cc
@@ -1000,8 +1000,8 @@ bool PGMonitor::register_new_pgs()
        ++p) {
     int64_t poolid = p->first;
     pg_pool_t &pool = p->second;
-    int ruleno = pool.get_crush_ruleset();
-    if (!osdmap->crush->rule_exists(ruleno)) 
+    int ruleno = osdmap->crush->find_rule(pool.get_crush_ruleset(), pool.get_type(), pool.get_size());
+    if (ruleno < 0 || !osdmap->crush->rule_exists(ruleno))
       continue;
 
     if (pool.get_last_change() <= pg_map.last_pg_scan ||
@@ -1991,7 +1991,7 @@ void PGMonitor::get_health(list<pair<health_status_t,string> >& summary,
       ((1000000 - p->second.cache_target_full_ratio_micro) *
        g_conf->mon_cache_target_full_warn_ratio);
     if (p->second.target_max_objects && (uint64_t)st.stats.sum.num_objects >
-	p->second.target_max_objects * ratio / 1000000) {
+	p->second.target_max_objects * (ratio / 1000000.0)) {
       nearfull = true;
       if (detail) {
 	ostringstream ss;
@@ -2003,7 +2003,7 @@ void PGMonitor::get_health(list<pair<health_status_t,string> >& summary,
       }
     }
     if (p->second.target_max_bytes && (uint64_t)st.stats.sum.num_bytes >
-	p->second.target_max_bytes * ratio / 1000000) {
+	p->second.target_max_bytes * (ratio / 1000000.0)) {
       nearfull = true;
       if (detail) {
 	ostringstream ss;
diff --git a/src/mon/PaxosService.cc b/src/mon/PaxosService.cc
index 1b21689..7ba8e9c 100644
--- a/src/mon/PaxosService.cc
+++ b/src/mon/PaxosService.cc
@@ -127,6 +127,16 @@ void PaxosService::refresh(bool *need_bootstrap)
   update_from_paxos(need_bootstrap);
 }
 
+void PaxosService::post_refresh()
+{
+  dout(10) << __func__ << dendl;
+
+  post_paxos_update();
+
+  if (mon->is_peon() && !waiting_for_finished_proposal.empty()) {
+    finish_contexts(g_ceph_context, waiting_for_finished_proposal, -EAGAIN);
+  }
+}
 
 void PaxosService::remove_legacy_versions()
 {
diff --git a/src/mon/PaxosService.h b/src/mon/PaxosService.h
index 5321beb..6affd5c 100644
--- a/src/mon/PaxosService.h
+++ b/src/mon/PaxosService.h
@@ -322,6 +322,7 @@ public:
   bool dispatch(PaxosServiceMessage *m);
 
   void refresh(bool *need_bootstrap);
+  void post_refresh();
 
   /**
    * @defgroup PaxosService_h_override_funcs Functions that should be
@@ -857,7 +858,7 @@ public:
    * @param bl The bufferlist to be populated
    * @return 0 on success; <0 otherwise
    */
-  int get_version(version_t ver, bufferlist& bl) {
+  virtual int get_version(version_t ver, bufferlist& bl) {
     return mon->store->get(get_service_name(), ver, bl);
   }
   /**
@@ -867,7 +868,7 @@ public:
    * @param bl The bufferlist to be populated
    * @returns 0 on success; <0 otherwise
    */
-  int get_version_full(version_t ver, bufferlist& bl) {
+  virtual int get_version_full(version_t ver, bufferlist& bl) {
     string key = mon->store->combine_strings(full_prefix_name, ver);
     return mon->store->get(get_service_name(), key, bl);
   }
diff --git a/src/msg/Messenger.cc b/src/msg/Messenger.cc
index b80782d..5dc69a6 100644
--- a/src/msg/Messenger.cc
+++ b/src/msg/Messenger.cc
@@ -4,6 +4,13 @@
 
 #include "SimpleMessenger.h"
 
+Messenger *Messenger::create_client_messenger(CephContext *cct, string lname)
+{
+  uint64_t nonce = 0;
+  get_random_bytes((char*)&nonce, sizeof(nonce));
+  return Messenger::create(cct, entity_name_t::CLIENT(), lname, nonce);
+}
+
 Messenger *Messenger::create(CephContext *cct,
 			     entity_name_t name,
 			     string lname,
diff --git a/src/msg/Messenger.h b/src/msg/Messenger.h
index 42feaf2..82ac8e6 100644
--- a/src/msg/Messenger.h
+++ b/src/msg/Messenger.h
@@ -148,6 +148,21 @@ public:
                            uint64_t nonce);
 
   /**
+   * create a new messenger
+   *
+   * Create a new messenger instance.
+   * Same as the above, but a slightly simpler interface for clients:
+   * - Generate a random nonce
+   * - use the default feature bits
+   * - get the messenger type from cct
+   * - use the client entity_type
+   *
+   * @param cct context
+   * @param lname logical name of the messenger in this process (e.g., "client")
+   */
+  static Messenger *create_client_messenger(CephContext *cct, string lname);
+
+  /**
    * @defgroup Accessors
    * @{
    */
diff --git a/src/os/FileJournal.cc b/src/os/FileJournal.cc
index b1d2db1..3d2fc8a 100644
--- a/src/os/FileJournal.cc
+++ b/src/os/FileJournal.cc
@@ -27,6 +27,7 @@
 #include <limits.h>
 #include <sstream>
 #include <stdio.h>
+#include <stdlib.h>
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <sys/mount.h>
@@ -296,24 +297,28 @@ int FileJournal::_open_file(int64_t oldsize, blksize_t blksize,
   if (create && g_conf->journal_zero_on_create) {
     derr << "FileJournal::_open_file : zeroing journal" << dendl;
     uint64_t write_size = 1 << 20;
-    char *buf = new char[write_size];
+    char *buf;
+    ret = ::posix_memalign((void **)&buf, block_size, write_size);
+    if (ret != 0) {
+      return ret;
+    }
     memset(static_cast<void*>(buf), 0, write_size);
     uint64_t i = 0;
     for (; (i + write_size) <= (unsigned)max_size; i += write_size) {
       ret = ::pwrite(fd, static_cast<void*>(buf), write_size, i);
       if (ret < 0) {
-	delete [] buf;
+	free(buf);
 	return -errno;
       }
     }
     if (i < (unsigned)max_size) {
       ret = ::pwrite(fd, static_cast<void*>(buf), max_size - i, i);
       if (ret < 0) {
-	delete [] buf;
+	free(buf);
 	return -errno;
       }
     }
-    delete [] buf;
+    free(buf);
   }
       
 
diff --git a/src/os/FileStore.cc b/src/os/FileStore.cc
index dd28f6a..a2528a5 100644
--- a/src/os/FileStore.cc
+++ b/src/os/FileStore.cc
@@ -1901,7 +1901,11 @@ void FileStore::_set_global_replay_guard(coll_t cid,
     return;
 
   // sync all previous operations on this sequencer
-  sync_filesystem(basedir_fd);
+  int ret = sync_filesystem(basedir_fd);
+  if (ret < 0) {
+    derr << __func__ << " :sync_filesytem error " << cpp_strerror(ret) << dendl;
+    assert(0 == "_set_global_replay_guard failed");
+  }
 
   char fn[PATH_MAX];
   get_cdir(cid, fn, sizeof(fn));
diff --git a/src/os/WBThrottle.cc b/src/os/WBThrottle.cc
index 0354ceb..62b2ddc 100644
--- a/src/os/WBThrottle.cc
+++ b/src/os/WBThrottle.cc
@@ -246,6 +246,7 @@ void WBThrottle::clear_object(const ghobject_t &hoid)
 
   pending_wbs.erase(i);
   remove_object(hoid);
+  cond.Signal();
 }
 
 void WBThrottle::throttle()
diff --git a/src/os/chain_xattr.cc b/src/os/chain_xattr.cc
index c020c9d..e90462a 100644
--- a/src/os/chain_xattr.cc
+++ b/src/os/chain_xattr.cc
@@ -138,6 +138,10 @@ int chain_getxattr(const char *fn, const char *name, void *val, size_t size)
     size -= chunk_size;
 
     r = sys_getxattr(fn, raw_name, (char *)val + pos, chunk_size);
+    if (i && r == -ENODATA) {
+      ret = pos;
+      break;
+    }
     if (r < 0) {
       ret = r;
       break;
@@ -201,6 +205,10 @@ int chain_fgetxattr(int fd, const char *name, void *val, size_t size)
     size -= chunk_size;
 
     r = sys_fgetxattr(fd, raw_name, (char *)val + pos, chunk_size);
+    if (i && r == -ENODATA) {
+      ret = pos;
+      break;
+    }
     if (r < 0) {
       ret = r;
       break;
diff --git a/src/osd/ECBackend.cc b/src/osd/ECBackend.cc
index 39e3429..5235d4d 100644
--- a/src/osd/ECBackend.cc
+++ b/src/osd/ECBackend.cc
@@ -81,7 +81,7 @@ ostream &operator<<(ostream &lhs, const ECBackend::read_result_t &rhs)
   lhs << "read_result_t(r=" << rhs.r
       << ", errors=" << rhs.errors;
   if (rhs.attrs) {
-    lhs << ", attrs=" << rhs.attrs;
+    lhs << ", attrs=" << rhs.attrs.get();
   } else {
     lhs << ", noattrs";
   }
diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc
index c0f4bdd..0befa92 100644
--- a/src/osd/OSD.cc
+++ b/src/osd/OSD.cc
@@ -940,8 +940,16 @@ OSD::OSD(CephContext *cct_, ObjectStore *store_,
   finished_lock("OSD::finished_lock"),
   op_tracker(cct, cct->_conf->osd_enable_op_tracker),
   test_ops_hook(NULL),
-  op_wq(this, cct->_conf->osd_op_thread_timeout, &op_tp),
-  peering_wq(this, cct->_conf->osd_op_thread_timeout, &op_tp),
+  op_wq(
+    this,
+    cct->_conf->osd_op_thread_timeout,
+    cct->_conf->osd_op_thread_suicide_timeout,
+    &op_tp),
+  peering_wq(
+    this,
+    cct->_conf->osd_op_thread_timeout,
+    cct->_conf->osd_op_thread_suicide_timeout,
+    &op_tp),
   map_lock("OSD::map_lock"),
   peer_map_epoch_lock("OSD::peer_map_epoch_lock"),
   debug_drop_pg_create_probability(cct->_conf->osd_debug_drop_pg_create_probability),
@@ -953,15 +961,42 @@ OSD::OSD(CephContext *cct_, ObjectStore *store_,
   pg_stat_queue_lock("OSD::pg_stat_queue_lock"),
   osd_stat_updated(false),
   pg_stat_tid(0), pg_stat_tid_flushed(0),
-  command_wq(this, cct->_conf->osd_command_thread_timeout, &command_tp),
+  command_wq(
+    this,
+    cct->_conf->osd_command_thread_timeout,
+    cct->_conf->osd_command_thread_suicide_timeout,
+    &command_tp),
   recovery_ops_active(0),
-  recovery_wq(this, cct->_conf->osd_recovery_thread_timeout, &recovery_tp),
+  recovery_wq(
+    this,
+    cct->_conf->osd_recovery_thread_timeout,
+    cct->_conf->osd_recovery_thread_suicide_timeout,
+    &recovery_tp),
   replay_queue_lock("OSD::replay_queue_lock"),
-  snap_trim_wq(this, cct->_conf->osd_snap_trim_thread_timeout, &disk_tp),
-  scrub_wq(this, cct->_conf->osd_scrub_thread_timeout, &disk_tp),
-  scrub_finalize_wq(cct->_conf->osd_scrub_finalize_thread_timeout, &op_tp),
-  rep_scrub_wq(this, cct->_conf->osd_scrub_thread_timeout, &disk_tp),
-  remove_wq(store, cct->_conf->osd_remove_thread_timeout, &disk_tp),
+  snap_trim_wq(
+    this,
+    cct->_conf->osd_snap_trim_thread_timeout,
+    cct->_conf->osd_snap_trim_thread_suicide_timeout,
+    &disk_tp),
+  scrub_wq(
+    this,
+    cct->_conf->osd_scrub_thread_timeout,
+    cct->_conf->osd_scrub_thread_suicide_timeout,
+    &disk_tp),
+  scrub_finalize_wq(
+    cct->_conf->osd_scrub_finalize_thread_timeout, 
+    cct->_conf->osd_scrub_finalize_thread_suicide_timeout, 
+    &op_tp),
+  rep_scrub_wq(
+    this,
+    cct->_conf->osd_scrub_thread_timeout,
+    cct->_conf->osd_scrub_thread_suicide_timeout,
+    &disk_tp),
+  remove_wq(
+    store,
+    cct->_conf->osd_remove_thread_timeout,
+    cct->_conf->osd_remove_thread_suicide_timeout,
+    &disk_tp),
   next_removal_seq(0),
   service(this)
 {
@@ -3771,7 +3806,9 @@ void OSD::_send_boot()
     dout(10) << " assuming hb_front_addr ip matches client_addr" << dendl;
   }
 
-  MOSDBoot *mboot = new MOSDBoot(superblock, boot_epoch, hb_back_addr, hb_front_addr, cluster_addr);
+  MOSDBoot *mboot = new MOSDBoot(superblock, boot_epoch,
+                                 hb_back_addr, hb_front_addr, cluster_addr,
+                                 CEPH_FEATURES_ALL);
   dout(10) << " client_addr " << client_messenger->get_myaddr()
 	   << ", cluster_addr " << cluster_addr
 	   << ", hb_back_addr " << hb_back_addr
@@ -3949,8 +3986,8 @@ void OSDService::send_message_osd_cluster(int peer, Message *m, epoch_t from_epo
     return;
   }
   const entity_inst_t& peer_inst = next_osdmap->get_cluster_inst(peer);
-  Connection *peer_con = osd->cluster_messenger->get_connection(peer_inst).get();
-  osd->_share_map_outgoing(peer, peer_con, next_osdmap);
+  ConnectionRef peer_con = osd->cluster_messenger->get_connection(peer_inst);
+  osd->_share_map_outgoing(peer, peer_con.get(), next_osdmap);
   osd->cluster_messenger->send_message(m, peer_inst);
 }
 
diff --git a/src/osd/OSD.h b/src/osd/OSD.h
index daa18ca..d06f0e7 100644
--- a/src/osd/OSD.h
+++ b/src/osd/OSD.h
@@ -1147,9 +1147,9 @@ private:
     map<PG*, list<OpRequestRef> > pg_for_processing;
     OSD *osd;
     PrioritizedQueue<pair<PGRef, OpRequestRef>, entity_inst_t > pqueue;
-    OpWQ(OSD *o, time_t ti, ThreadPool *tp)
+    OpWQ(OSD *o, time_t ti, time_t si, ThreadPool *tp)
       : ThreadPool::WorkQueueVal<pair<PGRef, OpRequestRef>, PGRef >(
-	"OSD::OpWQ", ti, ti*10, tp),
+	"OSD::OpWQ", ti, si, tp),
 	qlock("OpWQ::qlock"),
 	osd(o),
 	pqueue(o->cct->_conf->osd_op_pq_max_tokens_per_priority,
@@ -1211,9 +1211,9 @@ private:
     list<PG*> peering_queue;
     OSD *osd;
     set<PG*> in_use;
-    PeeringWQ(OSD *o, time_t ti, ThreadPool *tp)
+    PeeringWQ(OSD *o, time_t ti, time_t si, ThreadPool *tp)
       : ThreadPool::BatchWorkQueue<PG>(
-	"OSD::PeeringWQ", ti, ti*10, tp), osd(o) {}
+	"OSD::PeeringWQ", ti, si, tp), osd(o) {}
 
     void _dequeue(PG *pg) {
       for (list<PG*>::iterator i = peering_queue.begin();
@@ -1599,8 +1599,8 @@ protected:
   list<Command*> command_queue;
   struct CommandWQ : public ThreadPool::WorkQueue<Command> {
     OSD *osd;
-    CommandWQ(OSD *o, time_t ti, ThreadPool *tp)
-      : ThreadPool::WorkQueue<Command>("OSD::CommandWQ", ti, 0, tp), osd(o) {}
+    CommandWQ(OSD *o, time_t ti, time_t si, ThreadPool *tp)
+      : ThreadPool::WorkQueue<Command>("OSD::CommandWQ", ti, si, tp), osd(o) {}
 
     bool _empty() {
       return osd->command_queue.empty();
@@ -1653,8 +1653,8 @@ protected:
 
   struct RecoveryWQ : public ThreadPool::WorkQueue<PG> {
     OSD *osd;
-    RecoveryWQ(OSD *o, time_t ti, ThreadPool *tp)
-      : ThreadPool::WorkQueue<PG>("OSD::RecoveryWQ", ti, ti*10, tp), osd(o) {}
+    RecoveryWQ(OSD *o, time_t ti, time_t si, ThreadPool *tp)
+      : ThreadPool::WorkQueue<PG>("OSD::RecoveryWQ", ti, si, tp), osd(o) {}
 
     bool _empty() {
       return osd->recovery_queue.empty();
@@ -1711,8 +1711,8 @@ protected:
   
   struct SnapTrimWQ : public ThreadPool::WorkQueue<PG> {
     OSD *osd;
-    SnapTrimWQ(OSD *o, time_t ti, ThreadPool *tp)
-      : ThreadPool::WorkQueue<PG>("OSD::SnapTrimWQ", ti, 0, tp), osd(o) {}
+    SnapTrimWQ(OSD *o, time_t ti, time_t si, ThreadPool *tp)
+      : ThreadPool::WorkQueue<PG>("OSD::SnapTrimWQ", ti, si, tp), osd(o) {}
 
     bool _empty() {
       return osd->snap_trim_queue.empty();
@@ -1756,8 +1756,8 @@ protected:
 
   struct ScrubWQ : public ThreadPool::WorkQueue<PG> {
     OSD *osd;
-    ScrubWQ(OSD *o, time_t ti, ThreadPool *tp)
-      : ThreadPool::WorkQueue<PG>("OSD::ScrubWQ", ti, 0, tp), osd(o) {}
+    ScrubWQ(OSD *o, time_t ti, time_t si, ThreadPool *tp)
+      : ThreadPool::WorkQueue<PG>("OSD::ScrubWQ", ti, si, tp), osd(o) {}
 
     bool _empty() {
       return osd->scrub_queue.empty();
@@ -1802,8 +1802,8 @@ protected:
     xlist<PG*> scrub_finalize_queue;
 
   public:
-    ScrubFinalizeWQ(time_t ti, ThreadPool *tp)
-      : ThreadPool::WorkQueue<PG>("OSD::ScrubFinalizeWQ", ti, ti*10, tp) {}
+    ScrubFinalizeWQ(time_t ti, time_t si, ThreadPool *tp)
+      : ThreadPool::WorkQueue<PG>("OSD::ScrubFinalizeWQ", ti, si, tp) {}
 
     bool _empty() {
       return scrub_finalize_queue.empty();
@@ -1847,8 +1847,8 @@ protected:
     list<MOSDRepScrub*> rep_scrub_queue;
 
   public:
-    RepScrubWQ(OSD *o, time_t ti, ThreadPool *tp)
-      : ThreadPool::WorkQueue<MOSDRepScrub>("OSD::RepScrubWQ", ti, 0, tp), osd(o) {}
+    RepScrubWQ(OSD *o, time_t ti, time_t si, ThreadPool *tp)
+      : ThreadPool::WorkQueue<MOSDRepScrub>("OSD::RepScrubWQ", ti, si, tp), osd(o) {}
 
     bool _empty() {
       return rep_scrub_queue.empty();
@@ -1901,9 +1901,9 @@ protected:
     public ThreadPool::WorkQueueVal<pair<PGRef, DeletingStateRef> > {
     ObjectStore *&store;
     list<pair<PGRef, DeletingStateRef> > remove_queue;
-    RemoveWQ(ObjectStore *&o, time_t ti, ThreadPool *tp)
+    RemoveWQ(ObjectStore *&o, time_t ti, time_t si, ThreadPool *tp)
       : ThreadPool::WorkQueueVal<pair<PGRef, DeletingStateRef> >(
-	"OSD::RemoveWQ", ti, 0, tp),
+	"OSD::RemoveWQ", ti, si, tp),
 	store(o) {}
 
     bool _empty() {
diff --git a/src/osd/OSDMap.cc b/src/osd/OSDMap.cc
index 63dc1da..810c530 100644
--- a/src/osd/OSDMap.cc
+++ b/src/osd/OSDMap.cc
@@ -1183,13 +1183,6 @@ int OSDMap::apply_incremental(const Incremental &inc)
   if (inc.new_pool_max != -1)
     pool_max = inc.new_pool_max;
 
-  for (set<int64_t>::const_iterator p = inc.old_pools.begin();
-       p != inc.old_pools.end();
-       ++p) {
-    pools.erase(*p);
-    name_pool.erase(pool_name[*p]);
-    pool_name.erase(*p);
-  }
   for (map<int64_t,pg_pool_t>::const_iterator p = inc.new_pools.begin();
        p != inc.new_pools.end();
        ++p) {
@@ -1204,6 +1197,13 @@ int OSDMap::apply_incremental(const Incremental &inc)
     pool_name[p->first] = p->second;
     name_pool[p->second] = p->first;
   }
+  for (set<int64_t>::const_iterator p = inc.old_pools.begin();
+       p != inc.old_pools.end();
+       ++p) {
+    pools.erase(*p);
+    name_pool.erase(pool_name[*p]);
+    pool_name.erase(*p);
+  }
 
   for (map<int32_t,uint32_t>::const_iterator i = inc.new_weight.begin();
        i != inc.new_weight.end();
diff --git a/src/osd/PG.cc b/src/osd/PG.cc
index b9e9b1c..4b5ab6c 100644
--- a/src/osd/PG.cc
+++ b/src/osd/PG.cc
@@ -1874,7 +1874,7 @@ void PG::mark_clean()
 {
   // only mark CLEAN if we have the desired number of replicas AND we
   // are not remapped.
-  if (acting.size() == get_osdmap()->get_pg_size(info.pgid.pgid) &&
+  if (actingset.size() == get_osdmap()->get_pg_size(info.pgid.pgid) &&
       up == acting)
     state_set(PG_STATE_CLEAN);
 
@@ -5323,12 +5323,12 @@ void PG::handle_advance_map(
 	   << dendl;
   update_osdmap_ref(osdmap);
   pool.update(osdmap);
-  if (pool.info.last_change == osdmap_ref->get_epoch())
-    on_pool_change();
   AdvMap evt(
     osdmap, lastmap, newup, up_primary,
     newacting, acting_primary);
   recovery_state.handle_event(evt, rctx);
+  if (pool.info.last_change == osdmap_ref->get_epoch())
+    on_pool_change();
 }
 
 void PG::handle_activate_map(RecoveryCtx *rctx)
diff --git a/src/osd/PGLog.cc b/src/osd/PGLog.cc
index f081055..71526a5 100644
--- a/src/osd/PGLog.cc
+++ b/src/osd/PGLog.cc
@@ -185,6 +185,18 @@ void PGLog::proc_replica_log(
   dout(10) << "proc_replica_log for osd." << from << ": "
 	   << oinfo << " " << olog << " " << omissing << dendl;
 
+  if (olog.head < log.tail) {
+    dout(10) << __func__ << ": osd." << from << " does not overlap, not looking "
+	     << "for divergent objects" << dendl;
+    return;
+  }
+  if (olog.head == log.head) {
+    dout(10) << __func__ << ": osd." << from << " same log head, not looking "
+	     << "for divergent objects" << dendl;
+    return;
+  }
+  assert(olog.head >= log.tail);
+
   /*
     basically what we're doing here is rewinding the remote log,
     dropping divergent entries, until we find something that matches
@@ -202,48 +214,54 @@ void PGLog::proc_replica_log(
 	     << " have " << i->second.have << dendl;
   }
 
-  list<pg_log_entry_t>::const_iterator fromiter = log.log.end();
-  eversion_t lower_bound = log.tail;
+  list<pg_log_entry_t>::const_reverse_iterator first_non_divergent =
+    log.log.rbegin();
   while (1) {
-    if (fromiter == log.log.begin())
+    if (first_non_divergent == log.log.rend())
       break;
-    --fromiter;
-    if (fromiter->version <= olog.head) {
-      dout(20) << "merge_log cut point (usually last shared) is "
-	       << *fromiter << dendl;
-      lower_bound = fromiter->version;
-      ++fromiter;
+    if (first_non_divergent->version <= olog.head) {
+      dout(20) << "merge_log point (usually last shared) is "
+	       << *first_non_divergent << dendl;
       break;
     }
+    ++first_non_divergent;
   }
 
+  /* Because olog.head >= log.tail, we know that both pgs must at least have
+   * the event represented by log.tail.  Thus, lower_bound >= log.tail.  It's
+   * possible that olog/log contain no actual events between olog.head and
+   * log.tail, however, since they might have been split out.  Thus, if
+   * we cannot find an event e such that log.tail <= e.version <= log.head,
+   * the last_update must actually be log.tail.
+   */
+  eversion_t lu =
+    (first_non_divergent == log.log.rend() ||
+     first_non_divergent->version < log.tail) ?
+    log.tail :
+    first_non_divergent->version;
+
   list<pg_log_entry_t> divergent;
   list<pg_log_entry_t>::const_iterator pp = olog.log.end();
-  eversion_t lu(oinfo.last_update);
   while (true) {
-    if (pp == olog.log.begin()) {
-      if (pp != olog.log.end())   // no last_update adjustment if we discard nothing!
-	lu = olog.tail;
+    if (pp == olog.log.begin())
       break;
-    }
+
     --pp;
     const pg_log_entry_t& oe = *pp;
 
     // don't continue past the tail of our log.
     if (oe.version <= log.tail) {
-      lu = oe.version;
       ++pp;
       break;
     }
 
-    if (oe.version <= lower_bound) {
-      lu = oe.version;
+    if (oe.version <= lu) {
       ++pp;
       break;
     }
 
     divergent.push_front(oe);
-  }    
+  }
 
 
   IndexedLog folog;
@@ -560,6 +578,7 @@ void PGLog::merge_log(ObjectStore::Transaction& t,
     dout(10) << "merge_log extending tail to " << olog.tail << dendl;
     list<pg_log_entry_t>::iterator from = olog.log.begin();
     list<pg_log_entry_t>::iterator to;
+    eversion_t last;
     for (to = from;
 	 to != olog.log.end();
 	 ++to) {
@@ -567,12 +586,10 @@ void PGLog::merge_log(ObjectStore::Transaction& t,
 	break;
       log.index(*to);
       dout(15) << *to << dendl;
+      last = to->version;
     }
-      
-    if (to == olog.log.end())
-      mark_dirty_to(oinfo.last_update);
-    else
-      mark_dirty_to(to->version);
+    mark_dirty_to(last);
+
     // splice into our log.
     log.log.splice(log.log.begin(),
 		   olog.log, from, to);
@@ -794,7 +811,7 @@ void PGLog::_write_log(
 
   map<string,bufferlist> keys;
   for (list<pg_log_entry_t>::iterator p = log.log.begin();
-       p != log.log.end() && p->version < dirty_to;
+       p != log.log.end() && p->version <= dirty_to;
        ++p) {
     bufferlist bl(sizeof(*p) * 2);
     p->encode_with_checksum(bl);
diff --git a/src/osd/PGLog.h b/src/osd/PGLog.h
index c1563f8..508b295 100644
--- a/src/osd/PGLog.h
+++ b/src/osd/PGLog.h
@@ -212,9 +212,9 @@ protected:
 
   /// Log is clean on [dirty_to, dirty_from)
   bool touched_log;
-  eversion_t dirty_to;         ///< must clear/writeout all keys up to dirty_to
-  eversion_t dirty_from;       ///< must clear/writeout all keys past dirty_from
-  eversion_t writeout_from;    ///< must writout keys past writeout_from
+  eversion_t dirty_to;         ///< must clear/writeout all keys <= dirty_to
+  eversion_t dirty_from;       ///< must clear/writeout all keys >= dirty_from
+  eversion_t writeout_from;    ///< must writout keys >= writeout_from
   set<eversion_t> trimmed;     ///< must clear keys in trimmed
   bool dirty_divergent_priors;
   CephContext *cct;
@@ -397,6 +397,19 @@ public:
     missing.split_into(child_pgid, split_bits, &(opg_log->missing));
     opg_log->mark_dirty_to(eversion_t::max());
     mark_dirty_to(eversion_t::max());
+
+    unsigned mask = ~((~0)<<split_bits);
+    for (map<eversion_t, hobject_t>::iterator i = divergent_priors.begin();
+	 i != divergent_priors.end();
+	 ) {
+      if ((i->second.hash & mask) == child_pgid.m_seed) {
+	opg_log->add_divergent_prior(i->first, i->second);
+	divergent_priors.erase(i++);
+	dirty_divergent_priors = true;
+      } else {
+	++i;
+      }
+    }
   }
 
   void recover_got(hobject_t oid, eversion_t v, pg_info_t &info) {
diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc
index c8fb01e..750638a 100644
--- a/src/osd/ReplicatedPG.cc
+++ b/src/osd/ReplicatedPG.cc
@@ -1353,7 +1353,10 @@ void ReplicatedPG::do_op(OpRequestRef op)
     }
   }
 
+  bool in_hit_set = false;
   if (hit_set) {
+    if (missing_oid != hobject_t() && hit_set->contains(missing_oid))
+      in_hit_set = true;
     hit_set->insert(oid);
     if (hit_set->is_full() ||
 	hit_set_start_stamp + pool.info.hit_set_period <= m->get_recv_stamp()) {
@@ -1366,7 +1369,7 @@ void ReplicatedPG::do_op(OpRequestRef op)
   }
 
   if ((m->get_flags() & CEPH_OSD_FLAG_IGNORE_CACHE) == 0 &&
-      maybe_handle_cache(op, write_ordered, obc, r, missing_oid, false))
+      maybe_handle_cache(op, write_ordered, obc, r, missing_oid, false, in_hit_set))
     return;
 
   if (r) {
@@ -1535,6 +1538,10 @@ void ReplicatedPG::do_op(OpRequestRef op)
     return;
   }
 
+  if (m->get_flags() & CEPH_OSD_FLAG_IGNORE_CACHE) {
+    ctx->ignore_cache = true;
+  }
+
   if ((op->may_read()) && (obc->obs.oi.is_lost())) {
     // This object is lost. Reading from it returns an error.
     dout(20) << __func__ << ": object " << obc->obs.oi.soid
@@ -1561,7 +1568,8 @@ bool ReplicatedPG::maybe_handle_cache(OpRequestRef op,
 				      bool write_ordered,
 				      ObjectContextRef obc,
                                       int r, const hobject_t& missing_oid,
-				      bool must_promote)
+				      bool must_promote,
+				      bool in_hit_set)
 {
   if (obc)
     dout(25) << __func__ << " " << obc->obs.oi << " "
@@ -1606,7 +1614,43 @@ bool ReplicatedPG::maybe_handle_cache(OpRequestRef op,
     if (!must_promote && can_skip_promote(op, obc)) {
       return false;
     }
-    promote_object(op, obc, missing_oid);
+    if (op->may_write() || write_ordered || must_promote || !hit_set) {
+      promote_object(op, obc, missing_oid);
+    } else {
+      switch (pool.info.min_read_recency_for_promote) {
+      case 0:
+        promote_object(op, obc, missing_oid);
+        break;
+      case 1:
+        // Check if in the current hit set
+        if (in_hit_set) {
+          promote_object(op, obc, missing_oid);
+        } else {
+          do_cache_redirect(op, obc);
+        }
+        break;
+      default:
+        if (in_hit_set) {
+          promote_object(op, obc, missing_oid);
+        } else {
+          // Check if in other hit sets
+          map<time_t,HitSetRef>::iterator itor;
+          bool in_other_hit_sets = false;
+          for (itor = agent_state->hit_set_map.begin(); itor != agent_state->hit_set_map.end(); itor++) {
+            if (itor->second->contains(missing_oid)) {
+              in_other_hit_sets = true;
+              break;
+            }
+          }
+          if (in_other_hit_sets) {
+            promote_object(op, obc, missing_oid);
+          } else {
+            do_cache_redirect(op, obc);
+          }
+        }
+        break;
+      }
+    }
     return true;
 
   case pg_pool_t::CACHEMODE_FORWARD:
@@ -3701,6 +3745,10 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
 	}
 
 	if (!obs.exists) {
+	  if (pool.info.require_rollback() && op.extent.offset) {
+	    result = -EOPNOTSUPP;
+	    break;
+	  }
 	  ctx->mod_desc.create();
 	} else if (op.extent.offset == oi.size) {
 	  ctx->mod_desc.append(oi.size);
@@ -3948,7 +3996,7 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
 	// Cannot delete an object with watchers
 	result = -EBUSY;
       } else {
-	result = _delete_oid(ctx, false);
+	result = _delete_oid(ctx, ctx->ignore_cache);
       }
       break;
 
@@ -5065,7 +5113,10 @@ void ReplicatedPG::do_osd_op_effects(OpContext *ctx)
   for (list<OpContext::NotifyAck>::iterator p = ctx->notify_acks.begin();
        p != ctx->notify_acks.end();
        ++p) {
-    dout(10) << "notify_ack " << make_pair(p->watch_cookie, p->notify_id) << dendl;
+    if (p->watch_cookie)
+      dout(10) << "notify_ack " << make_pair(p->watch_cookie.get(), p->notify_id) << dendl;
+    else
+      dout(10) << "notify_ack " << make_pair("NULL", p->notify_id) << dendl;
     for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator i =
 	   ctx->obc->watchers.begin();
 	 i != ctx->obc->watchers.end();
@@ -5147,6 +5198,7 @@ void ReplicatedPG::finish_ctx(OpContext *ctx, int log_op_type, bool maintain_ssc
   dout(20) << __func__ << " " << soid << " " << ctx
 	   << " op " << pg_log_entry_t::get_op_name(log_op_type)
 	   << dendl;
+  utime_t now = ceph_clock_now(cct);
 
   // snapset
   bufferlist bss;
@@ -5208,6 +5260,7 @@ void ReplicatedPG::finish_ctx(OpContext *ctx, int log_op_type, bool maintain_ssc
       ctx->snapset_obc->obs.oi.version = ctx->at_version;
       ctx->snapset_obc->obs.oi.last_reqid = ctx->reqid;
       ctx->snapset_obc->obs.oi.mtime = ctx->mtime;
+      ctx->snapset_obc->obs.oi.local_mtime = now;
 
       bufferlist bv(sizeof(ctx->new_obs.oi));
       ::encode(ctx->snapset_obc->obs.oi, bv);
@@ -5248,6 +5301,7 @@ void ReplicatedPG::finish_ctx(OpContext *ctx, int log_op_type, bool maintain_ssc
     if (ctx->mtime != utime_t()) {
       ctx->new_obs.oi.mtime = ctx->mtime;
       dout(10) << " set mtime to " << ctx->new_obs.oi.mtime << dendl;
+      ctx->new_obs.oi.local_mtime = now;
     } else {
       dout(10) << " mtime unchanged at " << ctx->new_obs.oi.mtime << dendl;
     }
@@ -6040,6 +6094,10 @@ void ReplicatedPG::finish_promote(int r, OpRequestRef op,
   simple_repop_submit(repop);
 
   osd->logger->inc(l_osd_tier_promote);
+
+  assert(agent_state);
+  if (agent_state->is_idle())
+    agent_choose_mode();
 }
 
 void ReplicatedPG::cancel_copy(CopyOpRef cop, bool requeue)
@@ -10952,8 +11010,10 @@ void ReplicatedPG::hit_set_persist()
   info.hit_set.current_info.end = now;
   dout(20) << __func__ << " archive " << oid << dendl;
 
-  if (agent_state)
+  if (agent_state) {
     agent_state->add_hit_set(info.hit_set.current_info.begin, hit_set);
+    hit_set_in_memory_trim();
+  }
 
   // hold a ref until it is flushed to disk
   hit_set_flushing[info.hit_set.current_info.begin] = hit_set;
@@ -11089,8 +11149,6 @@ void ReplicatedPG::hit_set_trim(RepGather *repop, unsigned max)
       repop->ctx->op_t->remove(oid);
       repop->ctx->log.back().mod_desc.mark_unrollbackable();
     }
-    if (agent_state)
-      agent_state->remove_oldest_hit_set();
     updated_hit_set_hist.history.pop_front();
 
     ObjectContextRef obc = get_object_context(oid, false);
@@ -11101,6 +11159,19 @@ void ReplicatedPG::hit_set_trim(RepGather *repop, unsigned max)
   }
 }
 
+void ReplicatedPG::hit_set_in_memory_trim()
+{
+  unsigned max = pool.info.hit_set_count;
+  unsigned max_in_memory = pool.info.min_read_recency_for_promote > 0 ? pool.info.min_read_recency_for_promote - 1 : 0;
+
+  if (max_in_memory > max) {
+    max_in_memory = max;
+  }
+  while (agent_state->hit_set_map.size() > max_in_memory) {
+    agent_state->remove_oldest_hit_set();
+  }
+}
+
 
 // =======================================
 // cache agent
@@ -11293,6 +11364,9 @@ bool ReplicatedPG::agent_work(int start_max)
   else
     agent_state->position = next;
 
+  // Discard old in memory HitSets
+  hit_set_in_memory_trim();
+
   if (need_delay) {
     assert(agent_state->delaying == false);
     agent_delay();
@@ -11307,7 +11381,6 @@ bool ReplicatedPG::agent_work(int start_max)
 void ReplicatedPG::agent_load_hit_sets()
 {
   if (agent_state->evict_mode == TierAgentState::EVICT_MODE_IDLE) {
-    agent_state->discard_hit_sets();
     return;
   }
 
@@ -11378,7 +11451,14 @@ bool ReplicatedPG::agent_maybe_flush(ObjectContextRef& obc)
   }
 
   utime_t now = ceph_clock_now(NULL);
-  if (obc->obs.oi.mtime + utime_t(pool.info.cache_min_flush_age, 0) > now) {
+  utime_t ob_local_mtime;
+  if (obc->obs.oi.local_mtime != utime_t()) {
+    ob_local_mtime = obc->obs.oi.local_mtime;
+  } else {
+    ob_local_mtime = obc->obs.oi.mtime;
+  }
+  bool evict_mode_full = (agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL);
+  if (!evict_mode_full && (ob_local_mtime + utime_t(pool.info.cache_min_flush_age, 0) > now)) {
     dout(20) << __func__ << " skip (too young) " << obc->obs.oi << dendl;
     osd->logger->inc(l_osd_agent_skip);
     return false;
diff --git a/src/osd/ReplicatedPG.h b/src/osd/ReplicatedPG.h
index 4b0d1d6..c6de400 100644
--- a/src/osd/ReplicatedPG.h
+++ b/src/osd/ReplicatedPG.h
@@ -439,6 +439,7 @@ public:
     bool user_modify;     // user-visible modification
     bool undirty;         // user explicitly un-dirtying this object
     bool cache_evict;     ///< true if this is a cache eviction
+    bool ignore_cache;    ///< true if IGNORE_CACHE flag is set
 
     // side effects
     list<watch_info_t> watch_connects;
@@ -541,6 +542,7 @@ public:
       op(_op), reqid(_reqid), ops(_ops), obs(_obs), snapset(0),
       new_obs(_obs->oi, _obs->exists),
       modify(false), user_modify(false), undirty(false), cache_evict(false),
+      ignore_cache(false),
       bytes_written(0), bytes_read(0), user_at_version(0),
       current_osd_subop_num(0),
       op_t(NULL),
@@ -800,6 +802,7 @@ protected:
   void hit_set_persist();   ///< persist hit info
   bool hit_set_apply_log(); ///< apply log entries to update in-memory HitSet
   void hit_set_trim(RepGather *repop, unsigned max); ///< discard old HitSets
+  void hit_set_in_memory_trim();                     ///< discard old in memory HitSets
 
   hobject_t get_hit_set_current_object(utime_t stamp);
   hobject_t get_hit_set_archive_object(utime_t start, utime_t end);
@@ -1054,7 +1057,8 @@ protected:
 				 bool write_ordered,
 				 ObjectContextRef obc, int r,
 				 const hobject_t& missing_oid,
-				 bool must_promote);
+				 bool must_promote,
+				 bool in_hit_set = false);
   /**
    * This helper function tells the client to redirect their request elsewhere.
    */
diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc
index d08e9b7..5f4d660 100644
--- a/src/osd/osd_types.cc
+++ b/src/osd/osd_types.cc
@@ -807,6 +807,7 @@ void pg_pool_t::dump(Formatter *f) const
   f->close_section(); // hit_set_params
   f->dump_unsigned("hit_set_period", hit_set_period);
   f->dump_unsigned("hit_set_count", hit_set_count);
+  f->dump_unsigned("min_read_recency_for_promote", min_read_recency_for_promote);
   f->dump_unsigned("stripe_width", get_stripe_width());
 }
 
@@ -1058,8 +1059,56 @@ void pg_pool_t::encode(bufferlist& bl, uint64_t features) const
     return;
   }
 
-  __u8 encode_compat = 5;
-  ENCODE_START(15, encode_compat, bl);
+  if ((features & CEPH_FEATURE_OSD_POOLRESEND) == 0) {
+    // we simply added last_force_op_resend here, which is a fully
+    // backward compatible change.  however, encoding the same map
+    // differently between monitors triggers scrub noise (even though
+    // they are decodable without the feature), so let's be pendantic
+    // about it.
+    ENCODE_START(14, 5, bl);
+    ::encode(type, bl);
+    ::encode(size, bl);
+    ::encode(crush_ruleset, bl);
+    ::encode(object_hash, bl);
+    ::encode(pg_num, bl);
+    ::encode(pgp_num, bl);
+    __u32 lpg_num = 0, lpgp_num = 0;  // tell old code that there are no localized pgs.
+    ::encode(lpg_num, bl);
+    ::encode(lpgp_num, bl);
+    ::encode(last_change, bl);
+    ::encode(snap_seq, bl);
+    ::encode(snap_epoch, bl);
+    ::encode(snaps, bl, features);
+    ::encode(removed_snaps, bl);
+    ::encode(auid, bl);
+    ::encode(flags, bl);
+    ::encode(crash_replay_interval, bl);
+    ::encode(min_size, bl);
+    ::encode(quota_max_bytes, bl);
+    ::encode(quota_max_objects, bl);
+    ::encode(tiers, bl);
+    ::encode(tier_of, bl);
+    __u8 c = cache_mode;
+    ::encode(c, bl);
+    ::encode(read_tier, bl);
+    ::encode(write_tier, bl);
+    ::encode(properties, bl);
+    ::encode(hit_set_params, bl);
+    ::encode(hit_set_period, bl);
+    ::encode(hit_set_count, bl);
+    ::encode(stripe_width, bl);
+    ::encode(target_max_bytes, bl);
+    ::encode(target_max_objects, bl);
+    ::encode(cache_target_dirty_ratio_micro, bl);
+    ::encode(cache_target_full_ratio_micro, bl);
+    ::encode(cache_min_flush_age, bl);
+    ::encode(cache_min_evict_age, bl);
+    ::encode(erasure_code_profile, bl);
+    ENCODE_FINISH(bl);
+    return;
+  }
+
+  ENCODE_START(16, 5, bl);
   ::encode(type, bl);
   ::encode(size, bl);
   ::encode(crush_ruleset, bl);
@@ -1099,12 +1148,13 @@ void pg_pool_t::encode(bufferlist& bl, uint64_t features) const
   ::encode(cache_min_evict_age, bl);
   ::encode(erasure_code_profile, bl);
   ::encode(last_force_op_resend, bl);
+  ::encode(min_read_recency_for_promote, bl);
   ENCODE_FINISH(bl);
 }
 
 void pg_pool_t::decode(bufferlist::iterator& bl)
 {
-  DECODE_START_LEGACY_COMPAT_LEN(15, 5, 5, bl);
+  DECODE_START_LEGACY_COMPAT_LEN(16, 5, 5, bl);
   ::decode(type, bl);
   ::decode(size, bl);
   ::decode(crush_ruleset, bl);
@@ -1206,6 +1256,12 @@ void pg_pool_t::decode(bufferlist::iterator& bl)
   } else {
     last_force_op_resend = 0;
   }
+  if (struct_v >= 16) {
+    ::decode(min_read_recency_for_promote, bl);
+  } else {
+    pg_pool_t def;
+    min_read_recency_for_promote = def.min_read_recency_for_promote;
+  }
   DECODE_FINISH(bl);
   calc_pg_masks();
 }
@@ -1251,6 +1307,7 @@ void pg_pool_t::generate_test_instances(list<pg_pool_t*>& o)
   a.hit_set_params = HitSet::Params(new BloomHitSet::Params);
   a.hit_set_period = 3600;
   a.hit_set_count = 8;
+  a.min_read_recency_for_promote = 1;
   a.set_stripe_width(12345);
   a.target_max_bytes = 1238132132;
   a.target_max_objects = 1232132;
@@ -1303,6 +1360,8 @@ ostream& operator<<(ostream& out, const pg_pool_t& p)
 	<< " " << p.hit_set_period << "s"
 	<< " x" << p.hit_set_count;
   }
+  if (p.min_read_recency_for_promote)
+    out << " min_read_recency_for_promote " << p.min_read_recency_for_promote;
   out << " stripe_width " << p.get_stripe_width();
   return out;
 }
@@ -2196,6 +2255,62 @@ void pg_interval_t::generate_test_instances(list<pg_interval_t*>& o)
   o.back()->maybe_went_rw = true;
 }
 
+bool pg_interval_t::is_new_interval(
+  int old_acting_primary,
+  int new_acting_primary,
+  const vector<int> &old_acting,
+  const vector<int> &new_acting,
+  int old_up_primary,
+  int new_up_primary,
+  const vector<int> &old_up,
+  const vector<int> &new_up,
+  int old_size,
+  int new_size,
+  int old_min_size,
+  int new_min_size,
+  unsigned old_pg_num,
+  unsigned new_pg_num,
+  pg_t pgid) {
+  return old_acting_primary != new_acting_primary ||
+    new_acting != old_acting ||
+    old_up_primary != new_up_primary ||
+    new_up != old_up ||
+    old_min_size != new_min_size ||
+    old_size != new_size ||
+    pgid.is_split(old_pg_num, new_pg_num, 0);
+}
+
+bool pg_interval_t::is_new_interval(
+  int old_acting_primary,
+  int new_acting_primary,
+  const vector<int> &old_acting,
+  const vector<int> &new_acting,
+  int old_up_primary,
+  int new_up_primary,
+  const vector<int> &old_up,
+  const vector<int> &new_up,
+  OSDMapRef osdmap,
+  OSDMapRef lastmap,
+  int64_t pool_id,
+  pg_t pgid) {
+  return !(lastmap->get_pools().count(pgid.pool())) ||
+    is_new_interval(old_acting_primary,
+		    new_acting_primary,
+		    old_acting,
+		    new_acting,
+		    old_up_primary,
+		    new_up_primary,
+		    old_up,
+		    new_up,
+		    lastmap->get_pools().find(pgid.pool())->second.size,
+		    osdmap->get_pools().find(pgid.pool())->second.size,
+		    lastmap->get_pools().find(pgid.pool())->second.min_size,
+		    osdmap->get_pools().find(pgid.pool())->second.min_size,
+		    lastmap->get_pg_num(pgid.pool()),
+		    osdmap->get_pg_num(pgid.pool()),
+		    pgid);
+}
+
 bool pg_interval_t::check_new_interval(
   int old_acting_primary,
   int new_acting_primary,
@@ -2218,15 +2333,19 @@ bool pg_interval_t::check_new_interval(
   //  NOTE: a change in the up set primary triggers an interval
   //  change, even though the interval members in the pg_interval_t
   //  do not change.
-  if (old_acting_primary != new_acting_primary ||
-      new_acting != old_acting ||
-      old_up_primary != new_up_primary ||
-      new_up != old_up ||
-      (!(lastmap->get_pools().count(pool_id))) ||
-      (lastmap->get_pools().find(pool_id)->second.min_size !=
-       osdmap->get_pools().find(pool_id)->second.min_size)  ||
-      pgid.is_split(lastmap->get_pg_num(pgid.pool()),
-        osdmap->get_pg_num(pgid.pool()), 0)) {
+  if (is_new_interval(
+	old_acting_primary,
+	new_acting_primary,
+	old_acting,
+	new_acting,
+	old_up_primary,
+	new_up_primary,
+	old_up,
+	new_up,
+	osdmap,
+	lastmap,
+	pool_id,
+	pgid)) {
     pg_interval_t& i = (*past_intervals)[same_interval_since];
     i.first = same_interval_since;
     i.last = osdmap->get_epoch() - 1;
@@ -3613,6 +3732,7 @@ void object_info_t::copy_user_bits(const object_info_t& other)
   // these bits are copied from head->clone.
   size = other.size;
   mtime = other.mtime;
+  local_mtime = other.local_mtime;
   last_reqid = other.last_reqid;
   truncate_seq = other.truncate_seq;
   truncate_size = other.truncate_size;
@@ -3644,7 +3764,7 @@ void object_info_t::encode(bufferlist& bl) const
        ++i) {
     old_watchers.insert(make_pair(i->first.second, i->second));
   }
-  ENCODE_START(13, 8, bl);
+  ENCODE_START(14, 8, bl);
   ::encode(soid, bl);
   ::encode(myoloc, bl);	//Retained for compatibility
   ::encode(category, bl);
@@ -3669,6 +3789,7 @@ void object_info_t::encode(bufferlist& bl) const
   ::encode(watchers, bl);
   __u32 _flags = flags;
   ::encode(_flags, bl);
+  ::encode(local_mtime, bl);
   ENCODE_FINISH(bl);
 }
 
@@ -3747,6 +3868,11 @@ void object_info_t::decode(bufferlist::iterator& bl)
     ::decode(_flags, bl);
     flags = (flag_t)_flags;
   }
+  if (struct_v >= 14) {
+    ::decode(local_mtime, bl);
+  } else {
+    local_mtime = utime_t();
+  }
   DECODE_FINISH(bl);
 }
 
@@ -3762,6 +3888,7 @@ void object_info_t::dump(Formatter *f) const
   f->dump_unsigned("user_version", user_version);
   f->dump_unsigned("size", size);
   f->dump_stream("mtime") << mtime;
+  f->dump_stream("local_mtime") << local_mtime;
   f->dump_unsigned("lost", (int)is_lost());
   f->dump_unsigned("flags", (int)flags);
   f->dump_stream("wrlock_by") << wrlock_by;
diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h
index a296df0..5653039 100644
--- a/src/osd/osd_types.h
+++ b/src/osd/osd_types.h
@@ -973,6 +973,7 @@ public:
   HitSet::Params hit_set_params; ///< The HitSet params to use on this pool
   uint32_t hit_set_period;      ///< periodicity of HitSet segments (seconds)
   uint32_t hit_set_count;       ///< number of periods to retain
+  uint32_t min_read_recency_for_promote;   ///< minimum number of HitSet to check before promote
 
   uint32_t stripe_width;        ///< erasure coded stripe size in bytes
 
@@ -997,6 +998,7 @@ public:
       hit_set_params(),
       hit_set_period(0),
       hit_set_count(0),
+      min_read_recency_for_promote(0),
       stripe_width(0)
   { }
 
@@ -1753,12 +1755,51 @@ struct pg_interval_t {
   static void generate_test_instances(list<pg_interval_t*>& o);
 
   /**
+   * Determines whether there is an interval change
+   */
+  static bool is_new_interval(
+    int old_acting_primary,
+    int new_acting_primary,
+    const vector<int> &old_acting,
+    const vector<int> &new_acting,
+    int old_up_primary,
+    int new_up_primary,
+    const vector<int> &old_up,
+    const vector<int> &new_up,
+    int old_size,
+    int new_size,
+    int old_min_size,
+    int new_min_size,
+    unsigned old_pg_num,
+    unsigned new_pg_num,
+    pg_t pgid
+    );
+
+  /**
+   * Determines whether there is an interval change
+   */
+  static bool is_new_interval(
+    int old_acting_primary,                     ///< [in] primary as of lastmap
+    int new_acting_primary,                     ///< [in] primary as of lastmap
+    const vector<int> &old_acting,              ///< [in] acting as of lastmap
+    const vector<int> &new_acting,              ///< [in] acting as of osdmap
+    int old_up_primary,                         ///< [in] up primary of lastmap
+    int new_up_primary,                         ///< [in] up primary of osdmap
+    const vector<int> &old_up,                  ///< [in] up as of lastmap
+    const vector<int> &new_up,                  ///< [in] up as of osdmap
+    ceph::shared_ptr<const OSDMap> osdmap,  ///< [in] current map
+    ceph::shared_ptr<const OSDMap> lastmap, ///< [in] last map
+    int64_t poolid,                             ///< [in] pool for pg
+    pg_t pgid                                   ///< [in] pgid for pg
+    );
+
+  /**
    * Integrates a new map into *past_intervals, returns true
    * if an interval was closed out.
    */
   static bool check_new_interval(
     int old_acting_primary,                     ///< [in] primary as of lastmap
-    int new_acting_primary,                     ///< [in] primary as of lastmap
+    int new_acting_primary,                     ///< [in] primary as of osdmap
     const vector<int> &old_acting,              ///< [in] acting as of lastmap
     const vector<int> &new_acting,              ///< [in] acting as of osdmap
     int old_up_primary,                         ///< [in] up primary of lastmap
@@ -2621,6 +2662,7 @@ struct object_info_t {
 
   uint64_t size;
   utime_t mtime;
+  utime_t local_mtime; // local mtime
 
   // note: these are currently encoded into a total 16 bits; see
   // encode()/decode() for the weirdness.
diff --git a/src/osdc/Objecter.cc b/src/osdc/Objecter.cc
index 649c61c..9649e27 100644
--- a/src/osdc/Objecter.cc
+++ b/src/osdc/Objecter.cc
@@ -1220,7 +1220,7 @@ public:
   }
 };
 
-ceph_tid_t Objecter::op_submit(Op *op)
+ceph_tid_t Objecter::op_submit(Op *op, int *ctx_budget)
 {
   assert(client_lock.is_locked());
   assert(initialized);
@@ -1236,7 +1236,14 @@ ceph_tid_t Objecter::op_submit(Op *op)
 
   // throttle.  before we look at any state, because
   // take_op_budget() may drop our lock while it blocks.
-  take_op_budget(op);
+  if (!op->ctx_budgeted || (ctx_budget && (*ctx_budget == -1))) {
+    int op_budget = take_op_budget(op);
+    // take and pass out the budget for the first OP
+    // in the context session
+    if (ctx_budget && (*ctx_budget == -1)) {
+      *ctx_budget = op_budget;
+    }
+  }
 
   return _op_submit(op);
 }
@@ -1439,7 +1446,7 @@ int64_t Objecter::get_object_pg_hash_position(int64_t pool, const string& key,
   return p->raw_hash_to_pg(p->hash_key(key, ns));
 }
 
-int Objecter::calc_target(op_target_t *t, bool any_change)
+int Objecter::calc_target(op_target_t *t, epoch_t *last_force_resend, bool any_change)
 {
   bool is_read = t->flags & CEPH_OSD_FLAG_READ;
   bool is_write = t->flags & CEPH_OSD_FLAG_WRITE;
@@ -1447,9 +1454,15 @@ int Objecter::calc_target(op_target_t *t, bool any_change)
   const pg_pool_t *pi = osdmap->get_pg_pool(t->base_oloc.pool);
   bool force_resend = false;
   bool need_check_tiering = false;
+
   if (pi && osdmap->get_epoch() == pi->last_force_op_resend) {
-    force_resend = true;
+    if (last_force_resend && *last_force_resend < pi->last_force_op_resend) {
+	*last_force_resend = pi->last_force_op_resend;
+        force_resend = true;
+    } else if (last_force_resend == 0)
+      force_resend = true;
   }
+
   if (t->target_oid.name.empty() || force_resend) {
     t->target_oid = t->base_oid;
     need_check_tiering = true;
@@ -1483,9 +1496,33 @@ int Objecter::calc_target(op_target_t *t, bool any_change)
     if (ret == -ENOENT)
       return RECALC_OP_TARGET_POOL_DNE;
   }
-  int primary;
-  vector<int> acting;
-  osdmap->pg_to_acting_osds(pgid, &acting, &primary);
+
+  int size = pi->size;
+  int min_size = pi->min_size;
+  unsigned pg_num = pi->get_pg_num();
+  int up_primary, acting_primary;
+  vector<int> up, acting;
+  osdmap->pg_to_up_acting_osds(pgid, &up, &up_primary,
+			       &acting, &acting_primary);
+  unsigned prev_seed = ceph_stable_mod(pgid.ps(), t->pg_num, t->pg_num_mask);
+  if (any_change && pg_interval_t::is_new_interval(
+          t->acting_primary,
+	  acting_primary,
+	  t->acting,
+	  acting,
+	  t->up_primary,
+	  up_primary,
+	  t->up,
+	  up,
+	  t->size,
+	  size,
+	  t->min_size,
+	  min_size,
+	  t->pg_num,
+	  pg_num,
+	  pg_t(prev_seed, pgid.pool(), pgid.preferred()))) {
+    force_resend = true;
+  }
 
   bool need_resend = false;
 
@@ -1497,15 +1534,22 @@ int Objecter::calc_target(op_target_t *t, bool any_change)
 
   if (t->pgid != pgid ||
       is_pg_changed(
-	t->primary, t->acting, primary, acting, t->used_replica || any_change) ||
+	t->acting_primary, t->acting, acting_primary, acting,
+	t->used_replica || any_change) ||
       force_resend) {
     t->pgid = pgid;
     t->acting = acting;
-    t->primary = primary;
-    ldout(cct, 10) << __func__ << " pgid " << pgid
-		   << " acting " << acting << dendl;
+    t->acting_primary = acting_primary;
+    t->up_primary = up_primary;
+    t->up = up;
+    t->size = size;
+    t->min_size = min_size;
+    t->pg_num = pg_num;
+    t->pg_num_mask = pi->get_pg_num_mask();
+    ldout(cct, 10) << __func__ << " "
+		   << " pgid " << pgid << " acting " << acting << dendl;
     t->used_replica = false;
-    if (primary == -1) {
+    if (acting_primary == -1) {
       t->osd = -1;
     } else {
       int osd;
@@ -1541,7 +1585,7 @@ int Objecter::calc_target(op_target_t *t, bool any_change)
 	assert(best >= 0);
 	osd = acting[best];
       } else {
-	osd = primary;
+	osd = acting_primary;
       }
       t->osd = osd;
     }
@@ -1555,7 +1599,7 @@ int Objecter::calc_target(op_target_t *t, bool any_change)
 
 int Objecter::recalc_op_target(Op *op)
 {
-  int r = calc_target(&op->target);
+  int r = calc_target(&op->target, &op->last_force_resend);
   if (r == RECALC_OP_TARGET_NEED_RESEND) {
     OSDSession *s = NULL;
     if (op->target.osd >= 0)
@@ -1576,7 +1620,7 @@ int Objecter::recalc_op_target(Op *op)
 
 bool Objecter::recalc_linger_op_target(LingerOp *linger_op)
 {
-  int r = calc_target(&linger_op->target, true);
+  int r = calc_target(&linger_op->target, &linger_op->last_force_resend, true);
   if (r == RECALC_OP_TARGET_NEED_RESEND) {
     ldout(cct, 10) << "recalc_linger_op_target tid " << linger_op->linger_id
 		   << " pgid " << linger_op->target.pgid
@@ -1610,7 +1654,7 @@ void Objecter::finish_op(Op *op)
   ldout(cct, 15) << "finish_op " << op->tid << dendl;
 
   op->session_item.remove_myself();
-  if (op->budgeted)
+  if (!op->ctx_budgeted && op->budgeted)
     put_op_budget(op);
 
   ops.erase(op->tid);
@@ -1915,6 +1959,10 @@ void Objecter::list_objects(ListContext *list_context, Context *onfinish)
     }
   }
   if (list_context->at_end_of_pool) {
+    // release the listing context's budget once all
+    // OPs (in the session) are finished
+    put_list_context_budget(list_context);
+
     onfinish->complete(0);
     return;
   }
@@ -1943,7 +1991,7 @@ void Objecter::list_objects(ListContext *list_context, Context *onfinish)
   C_List *onack = new C_List(list_context, onfinish, this);
   object_locator_t oloc(list_context->pool_id, list_context->nspace);
   pg_read(list_context->current_pg, oloc, op,
-	  &list_context->bl, 0, onack, &onack->epoch);
+	  &list_context->bl, 0, onack, &onack->epoch, &list_context->ctx_budget);
 }
 
 void Objecter::_list_reply(ListContext *list_context, int r,
@@ -1989,6 +2037,9 @@ void Objecter::_list_reply(ListContext *list_context, int r,
   }
   if (!list_context->list.empty()) {
     ldout(cct, 20) << " returning results so far" << dendl;
+    // release the listing context's budget once all
+    // OPs (in the session) are finished
+    put_list_context_budget(list_context);
     final_finish->complete(0);
     return;
   }
@@ -1997,6 +2048,13 @@ void Objecter::_list_reply(ListContext *list_context, int r,
   list_objects(list_context, final_finish);
 }
 
+void Objecter::put_list_context_budget(ListContext *list_context) {
+    if (list_context->ctx_budget >= 0) {
+      ldout(cct, 10) << " release listing context's budget " << list_context->ctx_budget << dendl;
+      put_op_budget_bytes(list_context->ctx_budget);
+      list_context->ctx_budget = -1;
+    }
+  }
 
 
 //snapshots
diff --git a/src/osdc/Objecter.h b/src/osdc/Objecter.h
index 2ede888..b0739a1 100644
--- a/src/osdc/Objecter.h
+++ b/src/osdc/Objecter.h
@@ -1068,12 +1068,18 @@ public:
     object_t target_oid;
     object_locator_t target_oloc;
 
-    bool precalc_pgid;   ///< true if we are directed at base_pgid, not base_oid
-    pg_t base_pgid;      ///< explciti pg target, if any
-
-    pg_t pgid;           ///< last pg we mapped to
-    vector<int> acting;  ///< acting for last pg we mapped to
-    int primary;         ///< primary for last pg we mapped to
+    bool precalc_pgid;    ///< true if we are directed at base_pgid, not base_oid
+    pg_t base_pgid;       ///< explciti pg target, if any
+
+    pg_t pgid;            ///< last pg we mapped to
+    unsigned pg_num;      ///< last pg_num we mapped to
+    unsigned pg_num_mask; ///< last pg_num_mask we mapped to
+    vector<int> up;       ///< set of up osds for last pg we mapped to
+    vector<int> acting;   ///< set of acting osds for last pg we mapped to
+    int up_primary;       ///< primary for last pg we mapped to based on the up set
+    int acting_primary;   ///< primary for last pg we mapped to based on the acting set
+    int size;             ///< the size of the pool when were were last mapped
+    int min_size;         ///< the min size of the pool when were were last mapped
 
     bool used_replica;
     bool paused;
@@ -1085,7 +1091,12 @@ public:
 	base_oid(oid),
 	base_oloc(oloc),
 	precalc_pgid(false),
-	primary(-1),
+	pg_num(0),
+        pg_num_mask(0),
+	up_primary(-1),
+	acting_primary(-1),
+	size(-1),
+	min_size(-1),
 	used_replica(false),
 	paused(false),
 	osd(-1)
@@ -1133,6 +1144,13 @@ public:
     /// true if we should resend this message on failure
     bool should_resend;
 
+    epoch_t last_force_resend;
+
+    /// true if the throttle budget is get/put on a series of OPs, instead of
+    /// per OP basis, when this flag is set, the budget is acquired before sending
+    /// the very first OP of the series and released upon receiving the last OP reply.
+    bool ctx_budgeted;
+
     Op(const object_t& o, const object_locator_t& ol, vector<OSDOp>& op,
        int f, Context *ac, Context *co, version_t *ov) :
       session(NULL), session_item(this), incarnation(0),
@@ -1146,7 +1164,9 @@ public:
       objver(ov), reply_epoch(NULL),
       map_dne_bound(0),
       budgeted(false),
-      should_resend(true) {
+      should_resend(true),
+      last_force_resend(0),
+      ctx_budgeted(false) {
       ops.swap(op);
       
       /* initialize out_* to match op vector */
@@ -1249,11 +1269,24 @@ public:
 
     bufferlist extra_info;
 
+    // The budget associated with this context, once it is set (>= 0),
+    // the budget is not get/released on OP basis, instead the budget
+    // is acquired before sending the first OP and released upon receiving
+    // the last op reply.
+    int ctx_budget;
+
     ListContext() : current_pg(0), current_pg_epoch(0), starting_pg_num(0),
 		    at_end_of_pool(false),
 		    at_end_of_pg(false),
 		    pool_id(0),
-		    pool_snap_seq(0), max_entries(0) {}
+		    pool_snap_seq(0),
+                    max_entries(0),
+                    nspace(),
+                    bl(),
+                    list(),
+                    filter(),
+                    extra_info(),
+                    ctx_budget(-1) {}
 
     bool at_end() const {
       return at_end_of_pool;
@@ -1372,6 +1405,7 @@ public:
 
     ceph_tid_t register_tid;
     epoch_t map_dne_bound;
+    epoch_t last_force_resend;
 
     LingerOp() : linger_id(0),
 		 target(object_t(), object_locator_t(), 0),
@@ -1381,7 +1415,8 @@ public:
 		 on_reg_ack(NULL), on_reg_commit(NULL),
 		 session(NULL), session_item(this),
 		 register_tid(0),
-		 map_dne_bound(0) {}
+		 map_dne_bound(0),
+                 last_force_resend(0) {}
 
     // no copy!
     const LingerOp &operator=(const LingerOp& r);
@@ -1480,7 +1515,7 @@ public:
   bool osdmap_full_flag() const;
   bool target_should_be_paused(op_target_t *op);
 
-  int calc_target(op_target_t *t, bool any_change=false);
+  int calc_target(op_target_t *t, epoch_t *last_force_resend=0, bool any_change=false);
   int recalc_op_target(Op *op);
   bool recalc_linger_op_target(LingerOp *op);
 
@@ -1517,7 +1552,7 @@ public:
    */
   int calc_op_budget(Op *op);
   void throttle_op(Op *op, int op_size=0);
-  void take_op_budget(Op *op) {
+  int take_op_budget(Op *op) {
     int op_budget = calc_op_budget(op);
     if (keep_balanced_budget) {
       throttle_op(op, op_budget);
@@ -1526,13 +1561,19 @@ public:
       op_throttle_ops.take(1);
     }
     op->budgeted = true;
+    return op_budget;
+  }
+  void put_op_budget_bytes(int op_budget) {
+    assert(op_budget >= 0);
+    op_throttle_bytes.put(op_budget);
+    op_throttle_ops.put(1);
   }
   void put_op_budget(Op *op) {
     assert(op->budgeted);
     int op_budget = calc_op_budget(op);
-    op_throttle_bytes.put(op_budget);
-    op_throttle_ops.put(1);
+    put_op_budget_bytes(op_budget);
   }
+  void put_list_context_budget(ListContext *list_context);
   Throttle op_throttle_bytes, op_throttle_ops;
 
  public:
@@ -1603,7 +1644,7 @@ private:
 
   // public interface
 public:
-  ceph_tid_t op_submit(Op *op);
+  ceph_tid_t op_submit(Op *op, int *ctx_budget = NULL);
   bool is_active() {
     return !(ops.empty() && linger_ops.empty() && poolstat_ops.empty() && statfs_ops.empty());
   }
@@ -1707,7 +1748,8 @@ public:
 		ObjectOperation& op,
 		bufferlist *pbl, int flags,
 		Context *onack,
-		epoch_t *reply_epoch) {
+		epoch_t *reply_epoch,
+                int *ctx_budget) {
     Op *o = new Op(object_t(), oloc,
 		   op.ops, flags | global_op_flags | CEPH_OSD_FLAG_READ,
 		   onack, NULL, NULL);
@@ -1720,7 +1762,11 @@ public:
     o->out_handler.swap(op.out_handler);
     o->out_rval.swap(op.out_rval);
     o->reply_epoch = reply_epoch;
-    return op_submit(o);
+    if (ctx_budget) {
+      // budget is tracked by listing context
+      o->ctx_budgeted = true;
+    }
+    return op_submit(o, ctx_budget);
   }
   ceph_tid_t linger_mutate(const object_t& oid, const object_locator_t& oloc,
 		      ObjectOperation& op,
diff --git a/src/rgw/logrotate.conf b/src/rgw/logrotate.conf
index ec47f00..7e527e8 100644
--- a/src/rgw/logrotate.conf
+++ b/src/rgw/logrotate.conf
@@ -7,7 +7,7 @@
         if which invoke-rc.d > /dev/null 2>&1 && [ -x `which invoke-rc.d` ]; then
             invoke-rc.d radosgw reload >/dev/null
         elif which service > /dev/null 2>&1 && [ -x `which service` ]; then
-            service radosgw reload >/dev/null
+            service ceph-radosgw reload >/dev/null
         fi
         # Possibly reload twice, but depending on ceph.conf the reload above may be a no-op
         if which initctl > /dev/null 2>&1 && [ -x `which initctl` ]; then
diff --git a/src/rgw/rgw_civetweb.cc b/src/rgw/rgw_civetweb.cc
index 7f246d1..5c15bcf 100644
--- a/src/rgw/rgw_civetweb.cc
+++ b/src/rgw/rgw_civetweb.cc
@@ -11,13 +11,18 @@ int RGWMongoose::write_data(const char *buf, int len)
 {
   if (!header_done) {
     header_data.append(buf, len);
-    return 0;
+    return len;
   }
   if (!sent_header) {
     data.append(buf, len);
-    return 0;
+    return len;
+  }
+  int r = mg_write(conn, buf, len);
+  if (r == 0) {
+    /* didn't send anything, error out */
+    return -EIO;
   }
-  return mg_write(conn, buf, len);
+  return r;
 }
 
 RGWMongoose::RGWMongoose(mg_connection *_conn, int _port) : conn(_conn), port(_port), header_done(false), sent_header(false), has_content_length(false),
diff --git a/src/rgw/rgw_client_io.cc b/src/rgw/rgw_client_io.cc
index 193f44e..32d99dc 100644
--- a/src/rgw/rgw_client_io.cc
+++ b/src/rgw/rgw_client_io.cc
@@ -54,7 +54,12 @@ int RGWClientIO::write(const char *buf, int len)
     return ret;
 
   if (account)
-    bytes_sent += len;
+    bytes_sent += ret;
+
+  if (ret < len) {
+    /* sent less than tried to send, error out */
+    return -EIO;
+  }
 
   return 0;
 }
diff --git a/src/rgw/rgw_main.cc b/src/rgw/rgw_main.cc
index 8c0b40d..1c8720c 100644
--- a/src/rgw/rgw_main.cc
+++ b/src/rgw/rgw_main.cc
@@ -371,7 +371,7 @@ void RGWLoadGenProcess::run()
   int num_buckets;
   conf->get_val("num_buckets", 1, &num_buckets);
 
-  string buckets[num_buckets];
+  vector<string> buckets(num_buckets);
 
   atomic_t failed;
 
diff --git a/src/rgw/rgw_op.cc b/src/rgw/rgw_op.cc
index ec64777..804917c 100644
--- a/src/rgw/rgw_op.cc
+++ b/src/rgw/rgw_op.cc
@@ -1798,6 +1798,12 @@ void RGWPostObj::execute()
     goto done;
   }
 
+  ret = store->check_quota(s->bucket_owner.get_id(), s->bucket,
+                           user_quota, bucket_quota, s->content_length);
+  if (ret < 0) {
+    goto done;
+  }
+
   processor = select_processor();
 
   ret = processor->prepare(store, s->obj_ctx, NULL);
@@ -1833,6 +1839,12 @@ void RGWPostObj::execute()
 
   s->obj_size = ofs;
 
+  ret = store->check_quota(s->bucket_owner.get_id(), s->bucket,
+                           user_quota, bucket_quota, s->obj_size);
+  if (ret < 0) {
+    goto done;
+  }
+
   hash.Final(m);
   buf_to_hex(m, CEPH_CRYPTO_MD5_DIGESTSIZE, calc_md5);
 
@@ -2070,6 +2082,7 @@ int RGWCopyObj::verify_permission()
 
   if (src_bucket_name.compare(dest_bucket_name) == 0) { /* will only happen if s->local_source */
     dest_bucket_info = src_bucket_info;
+    dest_attrs = src_attrs;
   } else {
     ret = store->get_bucket_info(s->obj_ctx, dest_bucket_name, dest_bucket_info, NULL, &dest_attrs);
     if (ret < 0)
diff --git a/src/rgw/rgw_rados.cc b/src/rgw/rgw_rados.cc
index 25923e1..482b609 100644
--- a/src/rgw/rgw_rados.cc
+++ b/src/rgw/rgw_rados.cc
@@ -3534,6 +3534,31 @@ int RGWRados::copy_obj_data(void *ctx,
 }
 
 /**
+  * Check to see if the bucket metadata could be synced
+  * bucket: the bucket to check
+  * Returns false is the bucket is not synced
+  */
+bool RGWRados::is_syncing_bucket_meta(rgw_bucket& bucket)
+{
+  /* region is not master region */
+  if (!region.is_master) {
+    return false;
+  }
+
+  /* single region and a single zone */
+  if (region_map.regions.size() == 1 && region.zones.size() == 1) {
+    return false;
+  }
+
+  /* zone is not master */
+  if (region.master_zone.compare(zone_name) != 0) {
+    return false;
+  }
+
+  return true;
+}
+  
+/**
  * Delete a bucket.
  * bucket: the name of the bucket to delete
  * Returns 0 on success, -ERR# otherwise.
@@ -3572,6 +3597,16 @@ int RGWRados::delete_bucket(rgw_bucket& bucket, RGWObjVersionTracker& objv_track
   if (r < 0)
     return r;
 
+  /* if the bucked is not synced we can remove the meta file */
+  if (!is_syncing_bucket_meta(bucket)) {
+    RGWObjVersionTracker objv_tracker;
+    string entry;
+    get_bucket_instance_entry(bucket, entry);
+    r= rgw_bucket_instance_remove_entry(this, entry, &objv_tracker);
+    if (r < 0) {
+      return r;
+    }
+  }
   return 0;
 }
 
diff --git a/src/rgw/rgw_rados.h b/src/rgw/rgw_rados.h
index 2281f9a..5eee430 100644
--- a/src/rgw/rgw_rados.h
+++ b/src/rgw/rgw_rados.h
@@ -269,6 +269,12 @@ public:
       ::decode(rules, bl);
     } else {
       explicit_objs = true;
+      if (!objs.empty()) {
+        map<uint64_t, RGWObjManifestPart>::iterator iter = objs.begin();
+        head_obj = iter->second.loc;
+        head_size = iter->second.size;
+        max_head_size = head_size;
+      }
     }
 
     if (struct_v >= 4) {
@@ -1595,6 +1601,11 @@ public:
    */
   virtual int delete_bucket(rgw_bucket& bucket, RGWObjVersionTracker& objv_tracker);
 
+  /**
+   * Check to see if the bucket metadata is synced
+   */
+  bool is_syncing_bucket_meta(rgw_bucket& bucket);
+  
   int set_bucket_owner(rgw_bucket& bucket, ACLOwner& owner);
   int set_buckets_enabled(std::vector<rgw_bucket>& buckets, bool enabled);
   int bucket_suspended(rgw_bucket& bucket, bool *suspended);
diff --git a/src/rgw/rgw_rest.cc b/src/rgw/rgw_rest.cc
index 768ca09..13d1643 100644
--- a/src/rgw/rgw_rest.cc
+++ b/src/rgw/rgw_rest.cc
@@ -294,8 +294,11 @@ void dump_bucket_from_state(struct req_state *s)
 {
   int expose_bucket = g_conf->rgw_expose_bucket;
   if (expose_bucket) {
-    if (!s->bucket_name_str.empty())
-      s->cio->print("Bucket: \"%s\"\r\n", s->bucket_name_str.c_str());
+    if (!s->bucket_name_str.empty()) {
+      string b;
+      url_encode(s->bucket_name_str, b);
+      s->cio->print("Bucket: %s\r\n", b.c_str());
+    }
   }
 }
 
@@ -427,7 +430,8 @@ void dump_start(struct req_state *s)
   }
 }
 
-void end_header(struct req_state *s, RGWOp *op, const char *content_type)
+void end_header(struct req_state *s, RGWOp *op, const char *content_type, const int64_t proposed_content_length,
+		bool force_content_type)
 {
   string ctype;
 
@@ -435,7 +439,13 @@ void end_header(struct req_state *s, RGWOp *op, const char *content_type)
     dump_access_control(s, op);
   }
 
-  if (!content_type || s->err.is_err()) {
+  if (s->prot_flags & RGW_REST_SWIFT && !content_type) {
+    force_content_type = true;
+  }
+
+  /* do not send content type if content length is zero
+     and the content type was not set by the user */
+  if (force_content_type || (!content_type &&  s->formatter->get_len()  != 0) || s->err.is_err()){
     switch (s->format) {
     case RGW_FORMAT_XML:
       ctype = "application/xml";
@@ -460,10 +470,18 @@ void end_header(struct req_state *s, RGWOp *op, const char *content_type)
       s->formatter->dump_string("Message", s->err.message);
     s->formatter->close_section();
     dump_content_length(s, s->formatter->get_len());
+  } else {
+    if (proposed_content_length != NO_CONTENT_LENGTH) {
+      dump_content_length(s, proposed_content_length);
+    }
   }
-  int r = s->cio->print("Content-type: %s\r\n", content_type);
-  if (r < 0) {
-    ldout(s->cct, 0) << "ERROR: s->cio->print() returned err=" << r << dendl;
+
+  int r;
+  if (content_type) {
+      r = s->cio->print("Content-Type: %s\r\n", content_type);
+      if (r < 0) {
+	ldout(s->cct, 0) << "ERROR: s->cio->print() returned err=" << r << dendl;
+      }
   }
   r = s->cio->complete_header();
   if (r < 0) {
@@ -1244,7 +1262,7 @@ int RGWREST::preprocess(struct req_state *s, RGWClientIO *cio)
       s->content_length = 0;
     } else {
       string err;
-      s->content_length = strict_strtol(s->length, 10, &err);
+      s->content_length = strict_strtoll(s->length, 10, &err);
       if (!err.empty()) {
         ldout(s->cct, 10) << "bad content length, aborting" << dendl;
         return -EINVAL;
diff --git a/src/rgw/rgw_rest.h b/src/rgw/rgw_rest.h
index a6108f4..0624e4f 100644
--- a/src/rgw/rgw_rest.h
+++ b/src/rgw/rgw_rest.h
@@ -354,10 +354,16 @@ public:
   }
 };
 
+static const int64_t NO_CONTENT_LENGTH = -1;
+
 extern void set_req_state_err(struct req_state *s, int err_no);
 extern void dump_errno(struct req_state *s);
 extern void dump_errno(struct req_state *s, int ret);
-extern void end_header(struct req_state *s, RGWOp *op = NULL, const char *content_type = NULL);
+extern void end_header(struct req_state *s,
+                       RGWOp *op = NULL,
+                       const char *content_type = NULL,
+                       const int64_t proposed_content_length = NO_CONTENT_LENGTH,
+		       bool force_content_type = false);
 extern void dump_start(struct req_state *s);
 extern void list_all_buckets_start(struct req_state *s);
 extern void dump_owner(struct req_state *s, string& id, string& name, const char *section = NULL);
diff --git a/src/rgw/rgw_rest_swift.cc b/src/rgw/rgw_rest_swift.cc
index 2702472..4e4475a 100644
--- a/src/rgw/rgw_rest_swift.cc
+++ b/src/rgw/rgw_rest_swift.cc
@@ -51,8 +51,11 @@ void RGWListBuckets_ObjStore_SWIFT::send_response_begin(bool has_buckets)
     ret = STATUS_NO_CONTENT;
     set_req_state_err(s, ret);
   }
-  dump_errno(s);
-  end_header(s, NULL);
+
+  if (!g_conf->rgw_swift_enforce_content_length) {
+    dump_errno(s);
+    end_header(s, NULL, NULL, NO_CONTENT_LENGTH, true);
+  }
 
   if (!ret) {
     dump_start(s);
@@ -79,7 +82,9 @@ void RGWListBuckets_ObjStore_SWIFT::send_response_data(RGWUserBuckets& buckets)
       s->formatter->dump_int("bytes", obj.size);
     }
     s->formatter->close_section();
-    rgw_flush_formatter(s, s->formatter);
+    if (!g_conf->rgw_swift_enforce_content_length) {
+      rgw_flush_formatter(s, s->formatter);
+    }
   }
 }
 
@@ -87,6 +92,14 @@ void RGWListBuckets_ObjStore_SWIFT::send_response_end()
 {
   if (sent_data) {
     s->formatter->close_section();
+  }
+
+  if (g_conf->rgw_swift_enforce_content_length) {
+    dump_errno(s);
+    end_header(s, NULL, NULL, s->formatter->get_len(), true);
+  }
+
+  if (sent_data || g_conf->rgw_swift_enforce_content_length) {
     rgw_flush_formatter_and_reset(s, s->formatter);
   }
 }
@@ -204,14 +217,19 @@ next:
 
   s->formatter->close_section();
 
-  if (!ret && s->formatter->get_len() == 0)
-    ret = STATUS_NO_CONTENT;
-  else if (ret > 0)
+  int64_t content_len = 0;
+  if (!ret) {
+    content_len = s->formatter->get_len();
+    if (content_len == 0) {
+      ret = STATUS_NO_CONTENT;
+    }
+  } else if (ret > 0) {
     ret = 0;
+  }
 
   set_req_state_err(s, ret);
   dump_errno(s);
-  end_header(s, this);
+  end_header(s, this, NULL, content_len);
   if (ret < 0) {
     return;
   }
@@ -277,7 +295,8 @@ void RGWStatAccount_ObjStore_SWIFT::send_response()
   set_req_state_err(s, ret);
   dump_errno(s);
 
-  end_header(s, NULL);
+  end_header(s, NULL, NULL, 0,  true);
+
   dump_start(s);
 }
 
@@ -291,7 +310,7 @@ void RGWStatBucket_ObjStore_SWIFT::send_response()
   set_req_state_err(s, ret);
   dump_errno(s);
 
-  end_header(s, this);
+  end_header(s, this,NULL,0, true);
   dump_start(s);
 }
 
@@ -371,7 +390,8 @@ void RGWCreateBucket_ObjStore_SWIFT::send_response()
     ret = STATUS_ACCEPTED;
   set_req_state_err(s, ret);
   dump_errno(s);
-  end_header(s, NULL);
+  /* Propose ending HTTP header with 0 Content-Length header. */
+  end_header(s, NULL, NULL, 0);
   rgw_flush_formatter_and_reset(s, s->formatter);
 }
 
@@ -383,7 +403,7 @@ void RGWDeleteBucket_ObjStore_SWIFT::send_response()
 
   set_req_state_err(s, r);
   dump_errno(s);
-  end_header(s, this);
+  end_header(s, this, NULL, 0);
   rgw_flush_formatter_and_reset(s, s->formatter);
 }
 
@@ -597,6 +617,7 @@ int RGWGetObj_ObjStore_SWIFT::send_response_data(bufferlist& bl, off_t bl_ofs, o
 
   dump_content_length(s, total_len);
   dump_last_modified(s, lastmod);
+  s->cio->print("X-Timestamp: %lld\r\n", (long long)lastmod);
 
   if (!ret) {
     map<string, bufferlist>::iterator iter = attrs.find(RGW_ATTR_ETAG);
diff --git a/src/rgw/rgw_swift.cc b/src/rgw/rgw_swift.cc
index 1b724ed..6418f5b 100644
--- a/src/rgw/rgw_swift.cc
+++ b/src/rgw/rgw_swift.cc
@@ -520,6 +520,11 @@ int RGWSwift::validate_keystone_token(RGWRados *store, const string& token, stru
   if (ret < 0)
     return ret;
 
+  if (t.expired()) {
+    ldout(cct, 0) << "got expired token: " << t.token.tenant.name << ":" << t.user.name << " expired: " << t.token.expires << dendl;
+    return -EPERM;
+  }
+
   keystone_token_cache->add(token_id, t);
 
   ret = update_user_info(store, info, rgw_user);
diff --git a/src/test/Makefile.am b/src/test/Makefile.am
index 69f9e84..9ede275 100644
--- a/src/test/Makefile.am
+++ b/src/test/Makefile.am
@@ -646,7 +646,7 @@ bin_DEBUGPROGRAMS += ceph_test_librbd
 if LINUX
 ceph_test_librbd_fsx_SOURCES = test/librbd/fsx.c
 ceph_test_librbd_fsx_LDADD = $(LIBRBD) $(LIBRADOS) -lm
-ceph_test_librbd_fsx_CFLAGS = ${AM_CFLAGS} -Wno-format
+ceph_test_librbd_fsx_CFLAGS = ${AM_CFLAGS}
 bin_DEBUGPROGRAMS += ceph_test_librbd_fsx
 endif
 
diff --git a/src/test/bufferlist.cc b/src/test/bufferlist.cc
index ff87238..9190d91 100644
--- a/src/test/bufferlist.cc
+++ b/src/test/bufferlist.cc
@@ -2093,6 +2093,17 @@ TEST(BufferList, zero) {
     bl.zero((unsigned)3, (unsigned)3);
     EXPECT_EQ(0, ::memcmp("ABC\0\0\0GHIKLM", bl.c_str(), 9));
   }
+  {
+    bufferlist bl;
+    bufferptr ptr1(4);
+    bufferptr ptr2(4);
+    memset(ptr1.c_str(), 'a', 4);
+    memset(ptr2.c_str(), 'b', 4);
+    bl.append(ptr1);
+    bl.append(ptr2);
+    bl.zero((unsigned)2, (unsigned)4);
+    EXPECT_EQ(0, ::memcmp("aa\0\0\0\0bb", bl.c_str(), 8));
+  }
 }
 
 TEST(BufferList, EmptyAppend) {
diff --git a/src/test/librados/TestCase.cc b/src/test/librados/TestCase.cc
index 7f072fd..3747683 100644
--- a/src/test/librados/TestCase.cc
+++ b/src/test/librados/TestCase.cc
@@ -91,7 +91,14 @@ void RadosTestPP::cleanup_default_namespace(librados::IoCtx ioctx)
   for (ObjectIterator it = ioctx.objects_begin();
        it != ioctx.objects_end(); ++it) {
     ioctx.locator_set_key(it->second);
-    ASSERT_EQ(0, ioctx.remove(it->first));
+    ObjectWriteOperation op;
+    op.remove();
+    librados::AioCompletion *completion = s_cluster.aio_create_completion();
+    ASSERT_EQ(0, ioctx.aio_operate(it->first, completion, &op,
+				   librados::OPERATION_IGNORE_CACHE));
+    completion->wait_for_safe();
+    ASSERT_EQ(0, completion->get_return_value());
+    completion->release();
   }
 }
 
diff --git a/src/test/librados/tier.cc b/src/test/librados/tier.cc
index a89d68b..9f3df30 100644
--- a/src/test/librados/tier.cc
+++ b/src/test/librados/tier.cc
@@ -75,16 +75,15 @@ protected:
   static void SetUpTestCase() {
     pool_name = get_temp_pool_name();
     ASSERT_EQ("", create_one_pool_pp(pool_name, s_cluster));
-    cache_pool_name = get_temp_pool_name();
-    ASSERT_EQ(0, s_cluster.pool_create(cache_pool_name.c_str()));
   }
   static void TearDownTestCase() {
-    ASSERT_EQ(0, s_cluster.pool_delete(cache_pool_name.c_str()));
     ASSERT_EQ(0, destroy_one_pool_pp(pool_name, s_cluster));
   }
   static std::string cache_pool_name;
 
   virtual void SetUp() {
+    cache_pool_name = get_temp_pool_name();
+    ASSERT_EQ(0, s_cluster.pool_create(cache_pool_name.c_str()));
     RadosTestPP::SetUp();
     ASSERT_EQ(0, cluster.ioctx_create(cache_pool_name.c_str(), cache_ioctx));
     cache_ioctx.set_namespace(ns);
@@ -112,6 +111,7 @@ protected:
     cleanup_default_namespace(cache_ioctx);
 
     cache_ioctx.close();
+    ASSERT_EQ(0, s_cluster.pool_delete(cache_pool_name.c_str()));
   }
   librados::IoCtx cache_ioctx;
 };
@@ -624,7 +624,24 @@ TEST_F(LibRadosTwoPoolsPP, Whiteout) {
     ASSERT_TRUE(it == cache_ioctx.objects_end());
   }
 
+  // delete a whiteout and verify it goes away
   ASSERT_EQ(-ENOENT, ioctx.remove("foo"));
+  {
+    ObjectWriteOperation op;
+    op.remove();
+    librados::AioCompletion *completion = cluster.aio_create_completion();
+    ASSERT_EQ(0, ioctx.aio_operate("bar", completion, &op,
+				   librados::OPERATION_IGNORE_CACHE));
+    completion->wait_for_safe();
+    ASSERT_EQ(0, completion->get_return_value());
+    completion->release();
+
+    ObjectIterator it = cache_ioctx.objects_begin();
+    ASSERT_TRUE(it != cache_ioctx.objects_end());
+    ASSERT_TRUE(it->first == string("foo"));
+    ++it;
+    ASSERT_TRUE(it == cache_ioctx.objects_end());
+  }
 
   // recreate an object and verify we can read it
   {
@@ -2154,6 +2171,91 @@ TEST_F(LibRadosTwoPoolsPP, HitSetTrim) {
   }
 }
 
+TEST_F(LibRadosTwoPoolsPP, PromoteOn2ndRead) {
+  // create object
+  {
+    bufferlist bl;
+    bl.append("hi there");
+    ObjectWriteOperation op;
+    op.write_full(bl);
+    ASSERT_EQ(0, ioctx.operate("foo", &op));
+  }
+
+  // configure cache
+  bufferlist inbl;
+  ASSERT_EQ(0, cluster.mon_command(
+    "{\"prefix\": \"osd tier add\", \"pool\": \"" + pool_name +
+    "\", \"tierpool\": \"" + cache_pool_name +
+    "\", \"force_nonempty\": \"--force-nonempty\" }",
+    inbl, NULL, NULL));
+  ASSERT_EQ(0, cluster.mon_command(
+    "{\"prefix\": \"osd tier set-overlay\", \"pool\": \"" + pool_name +
+    "\", \"overlaypool\": \"" + cache_pool_name + "\"}",
+    inbl, NULL, NULL));
+  ASSERT_EQ(0, cluster.mon_command(
+    "{\"prefix\": \"osd tier cache-mode\", \"pool\": \"" + cache_pool_name +
+    "\", \"mode\": \"writeback\"}",
+    inbl, NULL, NULL));
+
+  // enable hitset tracking for this pool
+  ASSERT_EQ(0, cluster.mon_command(
+    set_pool_str(cache_pool_name, "hit_set_count", 2),
+    inbl, NULL, NULL));
+  ASSERT_EQ(0, cluster.mon_command(
+    set_pool_str(cache_pool_name, "hit_set_period", 600),
+    inbl, NULL, NULL));
+  ASSERT_EQ(0, cluster.mon_command(
+    set_pool_str(cache_pool_name, "hit_set_type", "bloom"),
+    inbl, NULL, NULL));
+  ASSERT_EQ(0, cluster.mon_command(
+    set_pool_str(cache_pool_name, "min_read_recency_for_promote", 1),
+    inbl, NULL, NULL));
+
+  // wait for maps to settle
+  cluster.wait_for_latest_osdmap();
+
+  // 1st read, don't trigger a promote
+  {
+    bufferlist bl;
+    ASSERT_EQ(1, ioctx.read("foo", bl, 1, 0));
+  }
+
+  // verify the object is NOT present in the cache tier
+  {
+    ObjectIterator it = cache_ioctx.objects_begin();
+    ASSERT_TRUE(it == cache_ioctx.objects_end());
+  }
+
+  // Read until the object is present in the cache tier
+  while (true) {
+    bufferlist bl;
+    ASSERT_EQ(1, ioctx.read("foo", bl, 1, 0));
+
+    ObjectIterator it = cache_ioctx.objects_begin();
+    if (it != cache_ioctx.objects_end()) {
+      ASSERT_TRUE(it->first == string("foo"));
+      ++it;
+      ASSERT_TRUE(it == cache_ioctx.objects_end());
+      break;
+    }
+
+    sleep(1);
+  }
+
+  // tear down tiers
+  ASSERT_EQ(0, cluster.mon_command(
+    "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
+    "\"}",
+    inbl, NULL, NULL));
+  ASSERT_EQ(0, cluster.mon_command(
+    "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
+    "\", \"tierpool\": \"" + cache_pool_name + "\"}",
+    inbl, NULL, NULL));
+
+  // wait for maps to settle before next test
+  cluster.wait_for_latest_osdmap();
+}
+
 class LibRadosTwoPoolsECPP : public RadosTestECPP
 {
 public:
@@ -2163,16 +2265,15 @@ protected:
   static void SetUpTestCase() {
     pool_name = get_temp_pool_name();
     ASSERT_EQ("", create_one_ec_pool_pp(pool_name, s_cluster));
-    cache_pool_name = get_temp_pool_name();
-    ASSERT_EQ(0, s_cluster.pool_create(cache_pool_name.c_str()));
   }
   static void TearDownTestCase() {
-    ASSERT_EQ(0, s_cluster.pool_delete(cache_pool_name.c_str()));
     ASSERT_EQ(0, destroy_one_ec_pool_pp(pool_name, s_cluster));
   }
   static std::string cache_pool_name;
 
   virtual void SetUp() {
+    cache_pool_name = get_temp_pool_name();
+    ASSERT_EQ(0, s_cluster.pool_create(cache_pool_name.c_str()));
     RadosTestECPP::SetUp();
     ASSERT_EQ(0, cluster.ioctx_create(cache_pool_name.c_str(), cache_ioctx));
     cache_ioctx.set_namespace(ns);
@@ -2200,6 +2301,7 @@ protected:
     cleanup_default_namespace(cache_ioctx);
 
     cache_ioctx.close();
+    ASSERT_EQ(0, s_cluster.pool_delete(cache_pool_name.c_str()));
   }
 
   librados::IoCtx cache_ioctx;
@@ -2640,7 +2742,23 @@ TEST_F(LibRadosTwoPoolsECPP, Whiteout) {
     ASSERT_TRUE(it == cache_ioctx.objects_end());
   }
 
+  // delete a whiteout and verify it goes away
   ASSERT_EQ(-ENOENT, ioctx.remove("foo"));
+  {
+    ObjectWriteOperation op;
+    op.remove();
+    librados::AioCompletion *completion = cluster.aio_create_completion();
+    ASSERT_EQ(0, ioctx.aio_operate("bar", completion, &op,
+				   librados::OPERATION_IGNORE_CACHE));
+    completion->wait_for_safe();
+    ASSERT_EQ(0, completion->get_return_value());
+    completion->release();
+
+    ObjectIterator it = cache_ioctx.objects_begin();
+    ASSERT_TRUE(it != cache_ioctx.objects_end());
+    ++it;
+    ASSERT_TRUE(it == cache_ioctx.objects_end());
+  }
 
   // recreate an object and verify we can read it
   {
@@ -4019,6 +4137,91 @@ TEST_F(LibRadosTwoPoolsECPP, HitSetTrim) {
   delete[] buf;
 }
 
+TEST_F(LibRadosTwoPoolsECPP, PromoteOn2ndRead) {
+  // create object
+  {
+    bufferlist bl;
+    bl.append("hi there");
+    ObjectWriteOperation op;
+    op.write_full(bl);
+    ASSERT_EQ(0, ioctx.operate("foo", &op));
+  }
+
+  // configure cache
+  bufferlist inbl;
+  ASSERT_EQ(0, cluster.mon_command(
+    "{\"prefix\": \"osd tier add\", \"pool\": \"" + pool_name +
+    "\", \"tierpool\": \"" + cache_pool_name +
+    "\", \"force_nonempty\": \"--force-nonempty\" }",
+    inbl, NULL, NULL));
+  ASSERT_EQ(0, cluster.mon_command(
+    "{\"prefix\": \"osd tier set-overlay\", \"pool\": \"" + pool_name +
+    "\", \"overlaypool\": \"" + cache_pool_name + "\"}",
+    inbl, NULL, NULL));
+  ASSERT_EQ(0, cluster.mon_command(
+    "{\"prefix\": \"osd tier cache-mode\", \"pool\": \"" + cache_pool_name +
+    "\", \"mode\": \"writeback\"}",
+    inbl, NULL, NULL));
+
+  // enable hitset tracking for this pool
+  ASSERT_EQ(0, cluster.mon_command(
+    set_pool_str(cache_pool_name, "hit_set_count", 2),
+    inbl, NULL, NULL));
+  ASSERT_EQ(0, cluster.mon_command(
+    set_pool_str(cache_pool_name, "hit_set_period", 600),
+    inbl, NULL, NULL));
+  ASSERT_EQ(0, cluster.mon_command(
+    set_pool_str(cache_pool_name, "hit_set_type", "bloom"),
+    inbl, NULL, NULL));
+  ASSERT_EQ(0, cluster.mon_command(
+    set_pool_str(cache_pool_name, "min_read_recency_for_promote", 1),
+    inbl, NULL, NULL));
+
+  // wait for maps to settle
+  cluster.wait_for_latest_osdmap();
+
+  // 1st read, don't trigger a promote
+  {
+    bufferlist bl;
+    ASSERT_EQ(1, ioctx.read("foo", bl, 1, 0));
+  }
+
+  // verify the object is NOT present in the cache tier
+  {
+    ObjectIterator it = cache_ioctx.objects_begin();
+    ASSERT_TRUE(it == cache_ioctx.objects_end());
+  }
+
+  // Read until the object is present in the cache tier
+  while (true) {
+    bufferlist bl;
+    ASSERT_EQ(1, ioctx.read("foo", bl, 1, 0));
+
+    ObjectIterator it = cache_ioctx.objects_begin();
+    if (it != cache_ioctx.objects_end()) {
+      ASSERT_TRUE(it->first == string("foo"));
+      ++it;
+      ASSERT_TRUE(it == cache_ioctx.objects_end());
+      break;
+    }
+
+    sleep(1);
+  }
+
+  // tear down tiers
+  ASSERT_EQ(0, cluster.mon_command(
+    "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
+    "\"}",
+    inbl, NULL, NULL));
+  ASSERT_EQ(0, cluster.mon_command(
+    "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
+    "\", \"tierpool\": \"" + cache_pool_name + "\"}",
+    inbl, NULL, NULL));
+
+  // wait for maps to settle before next test
+  cluster.wait_for_latest_osdmap();
+}
+
 int main(int argc, char **argv)
 {
   ::testing::InitGoogleTest(&argc, argv);
diff --git a/src/test/librbd/test_librbd.cc b/src/test/librbd/test_librbd.cc
index c37d884..867e197 100644
--- a/src/test/librbd/test_librbd.cc
+++ b/src/test/librbd/test_librbd.cc
@@ -750,8 +750,16 @@ TEST(LibRBD, TestIO)
   ASSERT_EQ(10, rbd_write(image, info.size - 10, 100, test_data));
 
   rbd_aio_create_completion(NULL, (rbd_callback_t) simple_read_cb, &comp);
-  ASSERT_EQ(-EINVAL, rbd_aio_write(image, info.size, 1, test_data, comp));
-  ASSERT_EQ(-EINVAL, rbd_aio_read(image, info.size, 1, test_data, comp));
+  ASSERT_EQ(0, rbd_aio_write(image, info.size, 1, test_data, comp));
+  ASSERT_EQ(0, rbd_aio_wait_for_complete(comp));
+  ASSERT_EQ(-EINVAL, rbd_aio_get_return_value(comp));
+  rbd_aio_release(comp);
+
+  rbd_aio_create_completion(NULL, (rbd_callback_t) simple_read_cb, &comp);
+  ASSERT_EQ(0, rbd_aio_read(image, info.size, 1, test_data, comp));
+  ASSERT_EQ(0, rbd_aio_wait_for_complete(comp));
+  ASSERT_EQ(-EINVAL, rbd_aio_get_return_value(comp));
+  rbd_aio_release(comp);
 
   ASSERT_EQ(0, rbd_close(image));
 
@@ -1965,6 +1973,65 @@ TEST(LibRBD, TestPendingAio)
   ASSERT_EQ(0, destroy_one_pool(pool_name, &cluster));
 }
 
+TEST(LibRBD, BlockingAIO)
+{
+  librados::Rados rados;
+  librados::IoCtx ioctx;
+  string pool_name = get_temp_pool_name();
+
+  ASSERT_EQ("", create_one_pool_pp(pool_name, rados));
+  ASSERT_EQ(0, rados.ioctx_create(pool_name.c_str(), ioctx));
+
+  librbd::RBD rbd;
+  std::string name = "testimg";
+  uint64_t size = 1 << 20;
+  int order = 18;
+  ASSERT_EQ(0, create_image_pp(rbd, ioctx, name.c_str(), size, &order));
+
+  CephContext *cct = reinterpret_cast<CephContext*>(ioctx.cct());
+  cct->_conf->set_val_or_die("rbd_non_blocking_aio", "0");
+
+  librbd::Image image;
+  ASSERT_EQ(0, rbd.open(ioctx, image, name.c_str(), NULL));
+
+  bufferlist bl;
+  bl.append(std::string(256, '1'));
+
+  librbd::RBD::AioCompletion *write_comp =
+    new librbd::RBD::AioCompletion(NULL, NULL);
+  ASSERT_EQ(0, image.aio_write(0, bl.length(), bl, write_comp));
+
+  librbd::RBD::AioCompletion *flush_comp =
+    new librbd::RBD::AioCompletion(NULL, NULL);
+  ASSERT_EQ(0, image.aio_flush(flush_comp));
+  ASSERT_EQ(0, flush_comp->wait_for_complete());
+  ASSERT_EQ(0, flush_comp->get_return_value());
+  flush_comp->release();
+
+  ASSERT_EQ(1, write_comp->is_complete());
+  ASSERT_EQ(0, write_comp->get_return_value());
+  write_comp->release();
+
+  librbd::RBD::AioCompletion *discard_comp =
+    new librbd::RBD::AioCompletion(NULL, NULL);
+  ASSERT_EQ(0, image.aio_discard(128, 128, discard_comp));
+  ASSERT_EQ(0, discard_comp->wait_for_complete());
+  discard_comp->release();
+
+  librbd::RBD::AioCompletion *read_comp =
+    new librbd::RBD::AioCompletion(NULL, NULL);
+  bufferlist read_bl;
+  image.aio_read(0, bl.length(), read_bl, read_comp);
+  ASSERT_EQ(0, read_comp->wait_for_complete());
+  ASSERT_EQ(bl.length(), read_comp->get_return_value());
+  read_comp->release();
+
+  bufferlist expected_bl;
+  expected_bl.append(std::string(128, '1'));
+  expected_bl.append(std::string(128, '\0'));
+  ASSERT_TRUE(expected_bl.contents_equal(read_bl));
+}
+
 int main(int argc, char **argv)
 {
   ::testing::InitGoogleTest(&argc, argv);
diff --git a/src/test/mon/test_mon_workloadgen.cc b/src/test/mon/test_mon_workloadgen.cc
index 3c6ff56..d1659d6 100644
--- a/src/test/mon/test_mon_workloadgen.cc
+++ b/src/test/mon/test_mon_workloadgen.cc
@@ -247,8 +247,7 @@ class ClientStub : public TestStub
       return err;
     }
 
-    messenger.reset(Messenger::create(cct, entity_name_t::CLIENT(-1),
-				      "stubclient", getpid()));
+    messenger.reset(Messenger::create_client_messenger(cct, "stubclient"));
     assert(messenger.get() != NULL);
 
     messenger->set_default_policy(
diff --git a/src/test/objectstore/chain_xattr.cc b/src/test/objectstore/chain_xattr.cc
index 8346c02..7c08962 100644
--- a/src/test/objectstore/chain_xattr.cc
+++ b/src/test/objectstore/chain_xattr.cc
@@ -29,6 +29,7 @@
 #include <gtest/gtest.h>
 
 #define LARGE_BLOCK_LEN CHAIN_XATTR_MAX_BLOCK_LEN + 1024
+#define FILENAME "bufferlist"
 
 TEST(chain_xattr, get_and_set) {
   const char* file = "testfile";
@@ -147,6 +148,44 @@ TEST(chain_xattr, get_and_set) {
   ::unlink(file);
 }
 
+TEST(chain_xattr, chunk_aligned) {
+  const char* file = FILENAME;
+  ::unlink(file);
+  int fd = ::open(file, O_CREAT|O_WRONLY|O_TRUNC, 0700);
+  const string user("user.");
+
+  // set N* chunk size
+  const string name = "user.foo";
+  const string name2 = "user.bar";
+
+  for (int len = CHAIN_XATTR_MAX_BLOCK_LEN - 10;
+       len < CHAIN_XATTR_MAX_BLOCK_LEN + 10;
+       ++len) {
+    cout << len << std::endl;
+    const string x(len, 'x');
+    char buf[len*2];
+    ASSERT_EQ(len, chain_setxattr(file, name.c_str(), x.c_str(), len));
+    char attrbuf[4096];
+    int l = ceph_os_listxattr(file, attrbuf, sizeof(attrbuf));
+    for (char *p = attrbuf; p - attrbuf < l; p += strlen(p) + 1) {
+      cout << "  attr " << p << std::endl;
+    }
+    ASSERT_EQ(len, chain_getxattr(file, name.c_str(), buf, len*2));
+    ASSERT_EQ(0, chain_removexattr(file, name.c_str()));
+
+    ASSERT_EQ(len, chain_fsetxattr(fd, name2.c_str(), x.c_str(), len));
+    l = ceph_os_flistxattr(fd, attrbuf, sizeof(attrbuf));
+    for (char *p = attrbuf; p - attrbuf < l; p += strlen(p) + 1) {
+      cout << "  attr " << p << std::endl;
+    }
+    ASSERT_EQ(len, chain_fgetxattr(fd, name2.c_str(), buf, len*2));
+    ASSERT_EQ(0, chain_fremovexattr(fd, name2.c_str()));
+  }
+
+  ::close(fd);
+  ::unlink(file);
+}
+
 TEST(chain_xattr, listxattr) {
   const char* file = "testfile";
   ::unlink(file);
diff --git a/src/test/osd/TestPGLog.cc b/src/test/osd/TestPGLog.cc
index c2063b8..6dc6bec 100644
--- a/src/test/osd/TestPGLog.cc
+++ b/src/test/osd/TestPGLog.cc
@@ -138,6 +138,14 @@ public:
       fullauth.index();
       fulldiv.index();
     }
+    void set_div_bounds(eversion_t head, eversion_t tail) {
+      fulldiv.tail = divinfo.log_tail = tail;
+      fulldiv.head = divinfo.last_update = head;
+    }
+    void set_auth_bounds(eversion_t head, eversion_t tail) {
+      fullauth.tail = authinfo.log_tail = tail;
+      fullauth.head = authinfo.last_update = head;
+    }
     const IndexedLog &get_fullauth() const { return fullauth; }
     const IndexedLog &get_fulldiv() const { return fulldiv; }
     const pg_info_t &get_authinfo() const { return authinfo; }
@@ -235,6 +243,8 @@ public:
     proc_replica_log(
       t, oinfo, olog, omissing, pg_shard_t(1, 0));
 
+    assert(oinfo.last_update >= log.tail);
+
     if (!tcase.base.empty()) {
       ASSERT_EQ(tcase.base.rbegin()->version, oinfo.last_update);
     }
@@ -1270,8 +1280,8 @@ TEST_F(PGLogTest, proc_replica_log) {
     pg_shard_t from;
 
     eversion_t last_update(1, 1);
-    oinfo.last_update = last_update;
-    eversion_t last_complete(2, 1);
+    log.head = olog.head = oinfo.last_update = last_update;
+    eversion_t last_complete(1, 1);
     oinfo.last_complete = last_complete;
 
     EXPECT_TRUE(t.empty());
@@ -1470,12 +1480,12 @@ TEST_F(PGLogTest, proc_replica_log) {
   }
 
   /*        +--------------------------+
-            |  log              olog   |
+            |  olog              log   |
             +--------+-------+---------+
             |        |object |         |
             |version | hash  | version |
             |        |       |         |
-       tail > (1,1)  |  x5   |  (1,1)  < tail
+       tail > (1,1)  |  x9   |  (1,1)  < tail
             |        |       |         |
             |        |       |         |
             | (1,2)  |  x3   |  (1,2)  |
@@ -1503,34 +1513,38 @@ TEST_F(PGLogTest, proc_replica_log) {
     pg_shard_t from;
 
     eversion_t last_update(1, 2);
+    hobject_t divergent_object;
+    divergent_object.hash = 0x9;
 
     {
       pg_log_entry_t e;
       e.mod_desc.mark_unrollbackable();
 
       e.version = eversion_t(1, 1);
-      e.soid.hash = 0x5;
+      e.soid = divergent_object;
       log.tail = e.version;
       log.log.push_back(e);
       e.version = last_update;
       e.soid.hash = 0x3;
       log.log.push_back(e);
-      e.version = eversion_t(1,3);
-      e.soid.hash = 0x9;
+      e.version = eversion_t(2, 3);
+      e.prior_version = eversion_t(1, 1);
+      e.soid = divergent_object;
       e.op = pg_log_entry_t::DELETE;
       log.log.push_back(e);
       log.head = e.version;
       log.index();
 
       e.version = eversion_t(1, 1);
-      e.soid.hash = 0x5;
+      e.soid = divergent_object;
       olog.tail = e.version;
       olog.log.push_back(e);
       e.version = last_update;
       e.soid.hash = 0x3;
       olog.log.push_back(e);
-      e.version = eversion_t(2, 3);
-      e.soid.hash = 0x9;
+      e.version = eversion_t(1, 3);
+      e.prior_version = eversion_t(1, 1);
+      e.soid = divergent_object;
       e.op = pg_log_entry_t::DELETE;
       olog.log.push_back(e);
       olog.head = e.version;
@@ -1547,28 +1561,30 @@ TEST_F(PGLogTest, proc_replica_log) {
     proc_replica_log(t, oinfo, olog, omissing, from);
 
     EXPECT_TRUE(t.empty());
-    EXPECT_FALSE(omissing.have_missing());
+    EXPECT_TRUE(omissing.have_missing());
+    EXPECT_TRUE(omissing.is_missing(divergent_object));
+    EXPECT_EQ(omissing.missing[divergent_object].have, eversion_t(0, 0));
+    EXPECT_EQ(omissing.missing[divergent_object].need, eversion_t(1, 1));
     EXPECT_EQ(last_update, oinfo.last_update);
-    EXPECT_EQ(last_update, oinfo.last_complete);
   }
 
   /*        +--------------------------+
-            |  log              olog   |
+            |  olog              log   |
             +--------+-------+---------+
             |        |object |         |
             |version | hash  | version |
             |        |       |         |
-       tail > (1,1)  |  x5   |  (1,1)  < tail
+       tail > (1,1)  |  x9   |  (1,1)  < tail
             |        |       |         |
             |        |       |         |
             | (1,2)  |  x3   |  (1,2)  |
             |        |       |         |
             |        |       |         |
        head > (1,3)  |  x9   |         |
-            | DELETE |       |         |
+            | MODIFY |       |         |
             |        |       |         |
             |        |  x9   |  (2,3)  < head
-            |        |       |  MODIFY |
+            |        |       |  DELETE |
             |        |       |         |
             +--------+-------+---------+
 
@@ -1593,28 +1609,30 @@ TEST_F(PGLogTest, proc_replica_log) {
       e.mod_desc.mark_unrollbackable();
 
       e.version = eversion_t(1, 1);
-      e.soid.hash = 0x5;
+      e.soid = divergent_object;
       log.tail = e.version;
       log.log.push_back(e);
       e.version = last_update;
       e.soid.hash = 0x3;
       log.log.push_back(e);
-      e.version = eversion_t(1, 3);
-      e.soid.hash = 0x9;
+      e.version = eversion_t(2, 3);
+      e.prior_version = eversion_t(1, 1);
+      e.soid = divergent_object;
       e.op = pg_log_entry_t::DELETE;
       log.log.push_back(e);
       log.head = e.version;
       log.index();
 
       e.version = eversion_t(1, 1);
-      e.soid.hash = 0x5;
+      e.soid = divergent_object;
       olog.tail = e.version;
       olog.log.push_back(e);
       e.version = last_update;
       e.soid.hash = 0x3;
       olog.log.push_back(e);
-      e.version = eversion_t(2, 3);
-      e.soid.hash = 0x9;
+      e.version = eversion_t(1, 3);
+      e.prior_version = eversion_t(1, 1);
+      e.soid = divergent_object;
       divergent_object = e.soid;
       omissing.add(divergent_object, e.version, eversion_t());
       e.op = pg_log_entry_t::MODIFY;
@@ -1628,16 +1646,18 @@ TEST_F(PGLogTest, proc_replica_log) {
     EXPECT_TRUE(t.empty());
     EXPECT_TRUE(omissing.have_missing());
     EXPECT_TRUE(omissing.is_missing(divergent_object));
-    EXPECT_EQ(eversion_t(2, 3), omissing.missing[divergent_object].need);
+    EXPECT_EQ(eversion_t(1, 3), omissing.missing[divergent_object].need);
     EXPECT_EQ(olog.head, oinfo.last_update);
     EXPECT_EQ(olog.head, oinfo.last_complete);
 
     proc_replica_log(t, oinfo, olog, omissing, from);
 
     EXPECT_TRUE(t.empty());
-    EXPECT_FALSE(omissing.have_missing());
+    EXPECT_TRUE(omissing.have_missing());
+    EXPECT_TRUE(omissing.is_missing(divergent_object));
+    EXPECT_EQ(omissing.missing[divergent_object].have, eversion_t(0, 0));
+    EXPECT_EQ(omissing.missing[divergent_object].need, eversion_t(1, 1));
     EXPECT_EQ(last_update, oinfo.last_update);
-    EXPECT_EQ(last_update, oinfo.last_complete);
   }
 
   /*        +--------------------------+
@@ -1862,6 +1882,20 @@ TEST_F(PGLogTest, merge_log_prior_version_have) {
   run_test_case(t);
 }
 
+TEST_F(PGLogTest, merge_log_split_missing_entries_at_head) {
+  TestCase t;
+  t.auth.push_back(mk_ple_mod_rb(mk_obj(1), mk_evt(10, 100), mk_evt(8, 70)));
+  t.auth.push_back(mk_ple_mod_rb(mk_obj(1), mk_evt(15, 150), mk_evt(10, 100)));
+
+  t.div.push_back(mk_ple_mod(mk_obj(1), mk_evt(8, 70), mk_evt(8, 65)));
+
+  t.setup();
+  t.set_div_bounds(mk_evt(9, 79), mk_evt(8, 69));
+  t.set_auth_bounds(mk_evt(10, 160), mk_evt(9, 77));
+  t.final.add(mk_obj(1), mk_evt(15, 150), mk_evt(8, 70));
+  run_test_case(t);
+}
+
 int main(int argc, char **argv) {
   vector<const char*> args;
   argv_to_vec(argc, (const char **)argv, args);
diff --git a/src/tools/ceph_authtool.cc b/src/tools/ceph_authtool.cc
index f66a3c6..1b6b90a 100644
--- a/src/tools/ceph_authtool.cc
+++ b/src/tools/ceph_authtool.cc
@@ -84,7 +84,7 @@ int main(int argc, const char **argv)
       gen_print_key = true;
     } else if (ceph_argparse_witharg(args, i, &val, "-a", "--add-key", (char*)NULL)) {
       add_key = val;
-    } else if (ceph_argparse_flag(args, i, &val, "-l", "--list", (char*)NULL)) {
+    } else if (ceph_argparse_flag(args, i, "-l", "--list", (char*)NULL)) {
       list = true;
     } else if (ceph_argparse_witharg(args, i, &val, "--caps", (char*)NULL)) {
       caps_fn = val;
diff --git a/src/upstart/ceph-mds.conf b/src/upstart/ceph-mds.conf
index 77841cd..4063d91 100644
--- a/src/upstart/ceph-mds.conf
+++ b/src/upstart/ceph-mds.conf
@@ -4,7 +4,7 @@ start on ceph-mds
 stop on runlevel [!2345] or stopping ceph-mds-all
 
 respawn
-respawn limit 5 30
+respawn limit 3 1800
 
 limit nofile 16384 16384
 
diff --git a/src/upstart/ceph-mon.conf b/src/upstart/ceph-mon.conf
index 0279f15..83c9858 100644
--- a/src/upstart/ceph-mon.conf
+++ b/src/upstart/ceph-mon.conf
@@ -4,7 +4,7 @@ start on ceph-mon
 stop on runlevel [!2345] or stopping ceph-mon-all
 
 respawn
-respawn limit 5 30
+respawn limit 3 1800
 
 limit nofile 16384 16384
 
diff --git a/src/upstart/ceph-osd.conf b/src/upstart/ceph-osd.conf
index 7175c2d..6fa97ed 100644
--- a/src/upstart/ceph-osd.conf
+++ b/src/upstart/ceph-osd.conf
@@ -4,7 +4,7 @@ start on ceph-osd
 stop on runlevel [!2345] or stopping ceph-osd-all
 
 respawn
-respawn limit 5 30
+respawn limit 3 1800
 
 limit nofile 32768 32768
 

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-ceph/ceph.git



More information about the Pkg-ceph-commits mailing list