[Pkg-ceph-commits] [ceph] 01/01: Imported Upstream version 0.80~rc1

Dmitry Smirnov onlyjob at moszumanska.debian.org
Wed Apr 23 17:55:46 UTC 2014


This is an automated email from the git hooks/post-receive script.

onlyjob pushed a commit to branch upstream
in repository ceph.

commit 3a87546 (upstream)
Author: Dmitry Smirnov <onlyjob at member.fsf.org>
Date:   Wed Apr 23 17:06:46 2014

    Imported Upstream version 0.80~rc1
---
 ceph.spec                                          |   15 +-
 ceph.spec.in                                       |   15 +-
 configure                                          |  285 +++-
 configure.ac                                       |    7 +-
 man/crushtool.8                                    |    2 +-
 src/.git_version                                   |    4 +-
 src/acconfig.h.in                                  |   10 +
 src/auth/AuthClientHandler.h                       |   32 +-
 src/auth/cephx/CephxClientHandler.cc               |   33 +-
 src/auth/cephx/CephxClientHandler.h                |   24 +-
 src/auth/cephx/CephxProtocol.cc                    |   14 +-
 src/auth/cephx/CephxProtocol.h                     |    8 +-
 src/auth/none/AuthNoneClientHandler.h              |   20 +-
 src/auth/unknown/AuthUnknownClientHandler.h        |   20 +-
 src/brag/README.md                                 |    3 +-
 src/brag/client/ceph-brag                          |   43 +-
 src/brag/server/ceph_brag/json.py                  |    3 +-
 src/brag/server/ceph_brag/model/db.py              |    6 +-
 src/ceph_mon.cc                                    |    8 +-
 src/cephfs.cc                                      |   20 +-
 src/civetweb/civetweb.h                            |  144 +-
 src/civetweb/include/civetweb.h                    |  144 +-
 src/civetweb/src/civetweb.c                        | 1112 +++++++++-----
 src/client/Client.cc                               |  313 ++--
 src/client/Client.h                                |   11 +-
 src/client/Inode.cc                                |    8 +-
 src/client/Inode.h                                 |    2 +-
 src/client/SyntheticClient.cc                      |    7 +-
 src/client/fuse_ll.cc                              |    4 +-
 src/cls/lock/cls_lock.cc                           |    7 +-
 src/cls/rbd/cls_rbd.cc                             |   27 +-
 src/common/Preforker.h                             |    3 +-
 src/common/RWLock.h                                |   14 +-
 src/common/buffer.cc                               |   18 +-
 src/common/common_init.cc                          |    1 -
 src/common/config.cc                               |    3 +-
 src/common/config_opts.h                           |    2 +
 src/common/errno.cc                                |   15 +-
 src/common/obj_bencher.cc                          |   59 +-
 src/common/obj_bencher.h                           |    6 +-
 src/crush/CrushCompiler.cc                         |    5 +-
 src/crush/CrushWrapper.cc                          |    4 +-
 src/erasure-code/ErasureCodePlugin.cc              |    3 +-
 .../jerasure/ErasureCodePluginJerasure.cc          |   19 +
 src/include/encoding.h                             |    4 +-
 src/include/memory.h                               |    2 +
 src/include/rados.h                                |    4 +
 src/include/rados/librados.h                       |   24 +-
 src/include/rados/librados.hpp                     |   11 +
 src/include/rados/memory.h                         |    2 +
 src/include/utime.h                                |    6 +-
 src/librados/AioCompletionImpl.h                   |    4 +-
 src/librados/IoCtxImpl.cc                          |   19 +-
 src/librados/librados.cc                           |   26 +-
 src/librbd/internal.cc                             |   19 +-
 src/mds/CDentry.h                                  |    4 +-
 src/mds/CDir.cc                                    |   21 +-
 src/mds/CDir.h                                     |    9 +-
 src/mds/CInode.cc                                  |   16 +-
 src/mds/CInode.h                                   |    4 +-
 src/mds/Capability.h                               |   21 +-
 src/mds/Locker.cc                                  |  152 +-
 src/mds/Locker.h                                   |   65 +-
 src/mds/LogSegment.h                               |    2 +-
 src/mds/MDCache.cc                                 |  374 ++---
 src/mds/MDCache.h                                  |   80 +-
 src/mds/MDLog.cc                                   |   19 +-
 src/mds/MDLog.h                                    |    9 +-
 src/mds/MDS.cc                                     |   37 +-
 src/mds/MDS.h                                      |    4 +-
 src/mds/Migrator.cc                                |   48 +-
 src/mds/Migrator.h                                 |   12 +-
 src/mds/Mutation.cc                                |   73 +-
 src/mds/Mutation.h                                 |   43 +-
 src/mds/Server.cc                                  |  392 +++--
 src/mds/Server.h                                   |  179 +--
 src/mds/SessionMap.h                               |    4 +-
 src/mds/SimpleLock.h                               |   23 +-
 src/mds/journal.cc                                 |    4 +-
 src/messages/MAuthReply.h                          |    4 +-
 src/messages/MClientReply.h                        |    4 +-
 src/messages/MMonProbe.h                           |   15 +-
 src/messages/MOSDOpReply.h                         |    4 +-
 src/messages/MOSDSubOp.h                           |   11 +-
 src/mon/ConfigKeyService.cc                        |    1 +
 src/mon/DataHealthService.cc                       |    1 +
 src/mon/Elector.cc                                 |   11 +-
 src/mon/Elector.h                                  |    3 -
 src/mon/MonClient.cc                               |    5 +-
 src/mon/MonCommands.h                              |   15 +-
 src/mon/Monitor.cc                                 |  105 +-
 src/mon/Monitor.h                                  |    8 +-
 src/mon/MonitorStore.cc                            |   13 +-
 src/mon/MonmapMonitor.cc                           |    1 +
 src/mon/OSDMonitor.cc                              |   43 +-
 src/mon/Paxos.h                                    |    4 +-
 src/msg/Accepter.cc                                |   13 +-
 src/msg/Pipe.cc                                    |   85 +-
 src/msg/Pipe.h                                     |    7 +-
 src/os/JournalingObjectStore.cc                    |    5 +-
 src/os/JournalingObjectStore.h                     |    7 +
 src/osd/ECBackend.cc                               |    4 +
 src/osd/ECBackend.h                                |    2 +
 src/osd/ECMsgTypes.cc                              |   22 +-
 src/osd/ECMsgTypes.h                               |    7 +-
 src/osd/OSD.cc                                     |   59 +-
 src/osd/OSD.h                                      |    2 +-
 src/osd/OSDMap.cc                                  |    5 +-
 src/osd/PG.cc                                      |   94 +-
 src/osd/PG.h                                       |   12 +-
 src/osd/PGBackend.h                                |    3 +
 src/osd/PGLog.cc                                   |    1 +
 src/osd/ReplicatedBackend.cc                       |    4 +-
 src/osd/ReplicatedBackend.h                        |    2 +
 src/osd/ReplicatedPG.cc                            |  843 ++++++++---
 src/osd/ReplicatedPG.h                             |   93 +-
 src/osd/osd_types.cc                               |  108 +-
 src/osd/osd_types.h                                |   95 +-
 src/osdc/ObjectCacher.cc                           |   14 +
 src/osdc/ObjectCacher.h                            |    1 +
 src/osdc/Objecter.cc                               |  227 +--
 src/osdc/Objecter.h                                |   70 +-
 src/pybind/rados.py                                |   49 +-
 src/rgw/rgw_admin.cc                               |    2 +-
 src/rgw/rgw_user.cc                                |   60 +-
 src/rgw/rgw_user.h                                 |    1 -
 src/test/admin_socket.cc                           |    1 +
 src/test/cli/monmaptool/print-nonexistent.t        |    2 +-
 src/test/cli/radosgw-admin/help.t                  |    2 +-
 src/test/cls_rbd/test_cls_rbd.cc                   |    2 +-
 src/test/librados/TestCase.cc                      |   81 +
 src/test/librados/TestCase.h                       |   19 +
 src/test/librados/aio.cc                           | 1592 +++++++++++++++++++-
 src/test/librados/c_read_operations.cc             |    2 +-
 src/test/librados/io.cc                            |  126 +-
 src/test/librados/list.cc                          |   96 +-
 src/test/librados/misc.cc                          |    9 +-
 src/test/librados/snapshots.cc                     |  120 +-
 src/test/librados/stat.cc                          |   40 +-
 src/test/librados/tier.cc                          |   31 +-
 src/test/librados/watch_notify.cc                  |   23 +-
 src/test/librbd/test_librbd.cc                     |   82 +
 src/test/osd/RadosModel.h                          |  112 +-
 src/test/osd/TestRados.cc                          |   15 +-
 src/test/osd/types.cc                              |   68 +
 src/test/system/rados_list_parallel.cc             |    2 +-
 src/test/system/st_rados_create_pool.cc            |    2 +-
 src/test/test_stress_watch.cc                      |   10 +-
 src/tools/ceph_monstore_tool.cc                    |    1 +
 src/tools/ceph_osdomap_tool.cc                     |    1 +
 src/tools/crushtool.cc                             |    5 +-
 src/tools/monmaptool.cc                            |    6 +-
 src/tools/rados/rados.cc                           |  104 +-
 src/tools/rest_bench.cc                            |   16 +-
 src/tools/scratchtoolpp.cc                         |   17 +-
 src/vstart.sh                                      |    2 +-
 156 files changed, 6398 insertions(+), 2555 deletions(-)

diff --git a/ceph.spec b/ceph.spec
index b4f21d0..207c94a 100644
--- a/ceph.spec
+++ b/ceph.spec
@@ -9,13 +9,13 @@
 # common
 #################################################################################
 Name:		ceph
-Version:	0.79
-Release:	0%{?dist}
+Version:        0.80
+Release:        rc1%{?dist}
 Summary:	User space components of the Ceph file system
 License:	GPL-2.0
 Group:		System Environment/Base
 URL:		http://ceph.com/
-Source0:	http://ceph.com/download/%{name}-%{version}.tar.bz2
+Source0:        http://ceph.com/download/%{name}-%{version}-rc1.tar.bz2
 Requires:	librbd1 = %{version}-%{release}
 Requires:	librados2 = %{version}-%{release}
 Requires:	libcephfs1 = %{version}-%{release}
@@ -28,6 +28,7 @@ Requires:	cryptsetup
 Requires:	parted
 Requires:	util-linux
 Requires:	hdparm
+Requires:       redhat-lsb-core
 Requires(post):	binutils
 BuildRoot:      %{_tmppath}/%{name}-%{version}-build
 BuildRequires:	make
@@ -87,8 +88,10 @@ BuildRequires:  gperftools-devel
 %endif
 
 %description
-Ceph is a distributed network file system designed to provide excellent
-performance, reliability, and scalability.
+Ceph is a massively scalable, open-source, distributed
+storage system that runs on commodity hardware and delivers object,
+block and file system storage.
+
 
 #################################################################################
 # packages
@@ -256,7 +259,7 @@ This package contains the Java libraries for the Ceph File System.
 # common
 #################################################################################
 %prep
-%setup -q
+%setup -q -n %{name}-%{version}-rc1
 
 %build
 # Find jni.h
diff --git a/ceph.spec.in b/ceph.spec.in
index ee9dbd9..207c94a 100644
--- a/ceph.spec.in
+++ b/ceph.spec.in
@@ -9,13 +9,13 @@
 # common
 #################################################################################
 Name:		ceph
-Version:	@VERSION@
-Release:	@RPM_RELEASE@%{?dist}
+Version:        0.80
+Release:        rc1%{?dist}
 Summary:	User space components of the Ceph file system
 License:	GPL-2.0
 Group:		System Environment/Base
 URL:		http://ceph.com/
-Source0:	http://ceph.com/download/%{name}-%{version}.tar.bz2
+Source0:        http://ceph.com/download/%{name}-%{version}-rc1.tar.bz2
 Requires:	librbd1 = %{version}-%{release}
 Requires:	librados2 = %{version}-%{release}
 Requires:	libcephfs1 = %{version}-%{release}
@@ -28,6 +28,7 @@ Requires:	cryptsetup
 Requires:	parted
 Requires:	util-linux
 Requires:	hdparm
+Requires:       redhat-lsb-core
 Requires(post):	binutils
 BuildRoot:      %{_tmppath}/%{name}-%{version}-build
 BuildRequires:	make
@@ -87,8 +88,10 @@ BuildRequires:  gperftools-devel
 %endif
 
 %description
-Ceph is a distributed network file system designed to provide excellent
-performance, reliability, and scalability.
+Ceph is a massively scalable, open-source, distributed
+storage system that runs on commodity hardware and delivers object,
+block and file system storage.
+
 
 #################################################################################
 # packages
@@ -256,7 +259,7 @@ This package contains the Java libraries for the Ceph File System.
 # common
 #################################################################################
 %prep
-%setup -q
+%setup -q -n %{name}-%{version}-rc1
 
 %build
 # Find jni.h
diff --git a/configure b/configure
index 35653bd..2f1b571 100755
--- a/configure
+++ b/configure
@@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.68 for ceph 0.79.
+# Generated by GNU Autoconf 2.68 for ceph 0.80-rc1.
 #
 # Report bugs to <ceph-devel at vger.kernel.org>.
 #
@@ -570,8 +570,8 @@ MAKEFLAGS=
 # Identity of this package.
 PACKAGE_NAME='ceph'
 PACKAGE_TARNAME='ceph'
-PACKAGE_VERSION='0.79'
-PACKAGE_STRING='ceph 0.79'
+PACKAGE_VERSION='0.80-rc1'
+PACKAGE_STRING='ceph 0.80-rc1'
 PACKAGE_BUGREPORT='ceph-devel at vger.kernel.org'
 PACKAGE_URL=''
 
@@ -1441,7 +1441,7 @@ if test "$ac_init_help" = "long"; then
   # Omit some internal or obsolete options to make the list less imposing.
   # This message is too long to be a string in the A/UX 3.1 sh.
   cat <<_ACEOF
-\`configure' configures ceph 0.79 to adapt to many kinds of systems.
+\`configure' configures ceph 0.80-rc1 to adapt to many kinds of systems.
 
 Usage: $0 [OPTION]... [VAR=VALUE]...
 
@@ -1512,7 +1512,7 @@ fi
 
 if test -n "$ac_init_help"; then
   case $ac_init_help in
-     short | recursive ) echo "Configuration of ceph 0.79:";;
+     short | recursive ) echo "Configuration of ceph 0.80-rc1:";;
    esac
   cat <<\_ACEOF
 
@@ -1657,7 +1657,7 @@ fi
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
   cat <<\_ACEOF
-ceph configure 0.79
+ceph configure 0.80-rc1
 generated by GNU Autoconf 2.68
 
 Copyright (C) 2010 Free Software Foundation, Inc.
@@ -2345,11 +2345,166 @@ $as_echo "$ac_res" >&6; }
   eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno
 
 } # ac_fn_c_check_type
+
+# ac_fn_cxx_check_decl LINENO SYMBOL VAR INCLUDES
+# -----------------------------------------------
+# Tests whether SYMBOL is declared in INCLUDES, setting cache variable VAR
+# accordingly.
+ac_fn_cxx_check_decl ()
+{
+  as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack
+  as_decl_name=`echo $2|sed 's/ *(.*//'`
+  as_decl_use=`echo $2|sed -e 's/(/((/' -e 's/)/) 0&/' -e 's/,/) 0& (/g'`
+  { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether $as_decl_name is declared" >&5
+$as_echo_n "checking whether $as_decl_name is declared... " >&6; }
+if eval \${$3+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+$4
+int
+main ()
+{
+#ifndef $as_decl_name
+#ifdef __cplusplus
+  (void) $as_decl_use;
+#else
+  (void) $as_decl_name;
+#endif
+#endif
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_cxx_try_compile "$LINENO"; then :
+  eval "$3=yes"
+else
+  eval "$3=no"
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+fi
+eval ac_res=\$$3
+	       { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5
+$as_echo "$ac_res" >&6; }
+  eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno
+
+} # ac_fn_cxx_check_decl
+
+# ac_fn_cxx_check_func LINENO FUNC VAR
+# ------------------------------------
+# Tests whether FUNC exists, setting the cache variable VAR accordingly
+ac_fn_cxx_check_func ()
+{
+  as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack
+  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $2" >&5
+$as_echo_n "checking for $2... " >&6; }
+if eval \${$3+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+/* Define $2 to an innocuous variant, in case <limits.h> declares $2.
+   For example, HP-UX 11i <limits.h> declares gettimeofday.  */
+#define $2 innocuous_$2
+
+/* System header to define __stub macros and hopefully few prototypes,
+    which can conflict with char $2 (); below.
+    Prefer <limits.h> to <assert.h> if __STDC__ is defined, since
+    <limits.h> exists even on freestanding compilers.  */
+
+#ifdef __STDC__
+# include <limits.h>
+#else
+# include <assert.h>
+#endif
+
+#undef $2
+
+/* Override any GCC internal prototype to avoid an error.
+   Use char because int might match the return type of a GCC
+   builtin and then its argument prototype would still apply.  */
+#ifdef __cplusplus
+extern "C"
+#endif
+char $2 ();
+/* The GNU C library defines this for functions which it implements
+    to always fail with ENOSYS.  Some functions are actually named
+    something starting with __ and the normal name is an alias.  */
+#if defined __stub_$2 || defined __stub___$2
+choke me
+#endif
+
+int
+main ()
+{
+return $2 ();
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_cxx_try_link "$LINENO"; then :
+  eval "$3=yes"
+else
+  eval "$3=no"
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+fi
+eval ac_res=\$$3
+	       { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5
+$as_echo "$ac_res" >&6; }
+  eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno
+
+} # ac_fn_cxx_check_func
+
+# ac_fn_cxx_try_run LINENO
+# ------------------------
+# Try to link conftest.$ac_ext, and return whether this succeeded. Assumes
+# that executables *can* be run.
+ac_fn_cxx_try_run ()
+{
+  as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack
+  if { { ac_try="$ac_link"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
+$as_echo "$ac_try_echo"; } >&5
+  (eval "$ac_link") 2>&5
+  ac_status=$?
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; } && { ac_try='./conftest$ac_exeext'
+  { { case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
+$as_echo "$ac_try_echo"; } >&5
+  (eval "$ac_try") 2>&5
+  ac_status=$?
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }; }; then :
+  ac_retval=0
+else
+  $as_echo "$as_me: program exited with status $ac_status" >&5
+       $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+       ac_retval=$ac_status
+fi
+  rm -rf conftest.dSYM conftest_ipa8_conftest.oo
+  eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno
+  as_fn_set_status $ac_retval
+
+} # ac_fn_cxx_try_run
 cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.
 
-It was created by ceph $as_me 0.79, which was
+It was created by ceph $as_me 0.80-rc1, which was
 generated by GNU Autoconf 2.68.  Invocation command line was
 
   $ $0 $@
@@ -4349,7 +4504,7 @@ fi
 
 # Define the identity of the package.
  PACKAGE='ceph'
- VERSION='0.79'
+ VERSION='0.80-rc1'
 
 
 cat >>confdefs.h <<_ACEOF
@@ -12327,7 +12482,7 @@ fi
 
 # Define the identity of the package.
  PACKAGE='ceph'
- VERSION='0.79'
+ VERSION='0.80-rc1'
 
 
 cat >>confdefs.h <<_ACEOF
@@ -18751,7 +18906,7 @@ else
 JAVA_TEST=Test.java
 CLASS_TEST=Test.class
 cat << \EOF > $JAVA_TEST
-/* #line 18754 "configure" */
+/* #line 18909 "configure" */
 public class Test {
 }
 EOF
@@ -21104,6 +21259,112 @@ fi
 #AC_FUNC_UTIME_NULL
 #AC_CHECK_FUNCS([bzero fchdir fdatasync floor ftruncate getcwd gethostbyname gethostname gettimeofday inet_ntoa localtime_r memmove memset mkdir munmap pow rmdir select socket sqrt strcasecmp strchr strerror strstr utime])
 
+# check for return type (and presence) if strerror_r in C++ mode
+ac_ext=cpp
+ac_cpp='$CXXCPP $CPPFLAGS'
+ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_cxx_compiler_gnu
+
+ac_fn_cxx_check_decl "$LINENO" "strerror_r" "ac_cv_have_decl_strerror_r" "$ac_includes_default"
+if test "x$ac_cv_have_decl_strerror_r" = xyes; then :
+  ac_have_decl=1
+else
+  ac_have_decl=0
+fi
+
+cat >>confdefs.h <<_ACEOF
+#define HAVE_DECL_STRERROR_R $ac_have_decl
+_ACEOF
+
+for ac_func in strerror_r
+do :
+  ac_fn_cxx_check_func "$LINENO" "strerror_r" "ac_cv_func_strerror_r"
+if test "x$ac_cv_func_strerror_r" = xyes; then :
+  cat >>confdefs.h <<_ACEOF
+#define HAVE_STRERROR_R 1
+_ACEOF
+
+fi
+done
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether strerror_r returns char *" >&5
+$as_echo_n "checking whether strerror_r returns char *... " >&6; }
+if ${ac_cv_func_strerror_r_char_p+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+
+    ac_cv_func_strerror_r_char_p=no
+    if test $ac_cv_have_decl_strerror_r = yes; then
+      cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+$ac_includes_default
+int
+main ()
+{
+
+	  char buf[100];
+	  char x = *strerror_r (0, buf, sizeof buf);
+	  char *p = strerror_r (0, buf, sizeof buf);
+	  return !p || x;
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_cxx_try_compile "$LINENO"; then :
+  ac_cv_func_strerror_r_char_p=yes
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+    else
+      # strerror_r is not declared.  Choose between
+      # systems that have relatively inaccessible declarations for the
+      # function.  BeOS and DEC UNIX 4.0 fall in this category, but the
+      # former has a strerror_r that returns char*, while the latter
+      # has a strerror_r that returns `int'.
+      # This test should segfault on the DEC system.
+      if test "$cross_compiling" = yes; then :
+  :
+else
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+$ac_includes_default
+	extern char *strerror_r ();
+int
+main ()
+{
+char buf[100];
+	  char x = *strerror_r (0, buf, sizeof buf);
+	  return ! isalpha (x);
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_cxx_try_run "$LINENO"; then :
+  ac_cv_func_strerror_r_char_p=yes
+fi
+rm -f core *.core core.conftest.* gmon.out bb.out conftest$ac_exeext \
+  conftest.$ac_objext conftest.beam conftest.$ac_ext
+fi
+
+    fi
+
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_func_strerror_r_char_p" >&5
+$as_echo "$ac_cv_func_strerror_r_char_p" >&6; }
+if test $ac_cv_func_strerror_r_char_p = yes; then
+
+$as_echo "#define STRERROR_R_CHAR_P 1" >>confdefs.h
+
+fi
+
+ac_ext=c
+ac_cpp='$CPP $CPPFLAGS'
+ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_c_compiler_gnu
+
+
  if test "$WITH_BUILD_TESTS" = "1"; then
   WITH_BUILD_TESTS_TRUE=
   WITH_BUILD_TESTS_FALSE='#'
@@ -21997,7 +22258,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by ceph $as_me 0.79, which was
+This file was extended by ceph $as_me 0.80-rc1, which was
 generated by GNU Autoconf 2.68.  Invocation command line was
 
   CONFIG_FILES    = $CONFIG_FILES
@@ -22063,7 +22324,7 @@ _ACEOF
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
 ac_cs_version="\\
-ceph config.status 0.79
+ceph config.status 0.80-rc1
 configured by $0, generated by GNU Autoconf 2.68,
   with options \\"\$ac_cs_config\\"
 
diff --git a/configure.ac b/configure.ac
index 035f525..e5b380f 100644
--- a/configure.ac
+++ b/configure.ac
@@ -8,7 +8,7 @@ AC_PREREQ(2.59)
 # VERSION define is not used by the code.  It gets a version string
 # from 'git describe'; see src/ceph_ver.[ch]
 
-AC_INIT([ceph], [0.79], [ceph-devel at vger.kernel.org])
+AC_INIT([ceph], [0.80-rc1], [ceph-devel at vger.kernel.org])
 
 # Create release string.  Used with VERSION for RPMs.
 RPM_RELEASE=0
@@ -768,6 +768,11 @@ AC_CHECK_TYPES([__u8, __s8, __u16, __s16, __u32, __s32, __u64, __s64, __le16,
 #AC_FUNC_UTIME_NULL
 #AC_CHECK_FUNCS([bzero fchdir fdatasync floor ftruncate getcwd gethostbyname gethostname gettimeofday inet_ntoa localtime_r memmove memset mkdir munmap pow rmdir select socket sqrt strcasecmp strchr strerror strstr utime])
 
+# check for return type (and presence) if strerror_r in C++ mode
+AC_LANG_PUSH([C++])
+AC_FUNC_STRERROR_R
+AC_LANG_POP([C++])
+
 AM_CONDITIONAL(WITH_BUILD_TESTS, test "$WITH_BUILD_TESTS" = "1")
 
 AM_PATH_PYTHON([2.4],
diff --git a/man/crushtool.8 b/man/crushtool.8
index 38a379b..bb518f7 100644
--- a/man/crushtool.8
+++ b/man/crushtool.8
@@ -125,7 +125,7 @@ CEPH_ARGS="\-\-debug\-crush 0" crushtool ...
 The test mode will use the input crush map ( as specified with \fB\-i
 map\fP ) and perform a dry run of CRUSH mapping or random placement (
 if \fB\-\-simulate\fP is set ). On completion, two kinds of reports can be
-created. The \fB\-\-show\-...\fP options output human readable informations
+created. The \fB\-\-show\-...\fP options output human readable information
 on stderr. The \fB\-\-output\-csv\fP option creates CSV files that are
 documented by the \fB\-\-help\-output\fP option.
 .INDENT 0.0
diff --git a/src/.git_version b/src/.git_version
index 28b5254..e0522fa 100644
--- a/src/.git_version
+++ b/src/.git_version
@@ -1,2 +1,2 @@
-4c2d73a5095f527c3a2168deb5fa54b3c8991a6e
-v0.79
+6769f4dc88425396921f94e1a37a1c90758aa3ea
+v0.80-rc1
diff --git a/src/acconfig.h.in b/src/acconfig.h.in
index 21baa07..24791f8 100644
--- a/src/acconfig.h.in
+++ b/src/acconfig.h.in
@@ -33,6 +33,10 @@
 /* Define if have curl_multi_wait() */
 #undef HAVE_CURL_MULTI_WAIT
 
+/* Define to 1 if you have the declaration of `strerror_r', and to 0 if you
+   don't. */
+#undef HAVE_DECL_STRERROR_R
+
 /* Define to 1 if you have the <dirent.h> header file, and it defines `DIR'.
    */
 #undef HAVE_DIRENT_H
@@ -197,6 +201,9 @@
 /* Define to 1 if you have the <stdlib.h> header file. */
 #undef HAVE_STDLIB_H
 
+/* Define to 1 if you have the `strerror_r' function. */
+#undef HAVE_STRERROR_R
+
 /* Define to 1 if you have the <strings.h> header file. */
 #undef HAVE_STRINGS_H
 
@@ -363,6 +370,9 @@
 /* Define to 1 if you have the ANSI C header files. */
 #undef STDC_HEADERS
 
+/* Define to 1 if strerror_r returns char *. */
+#undef STRERROR_R_CHAR_P
+
 /* Define if using CryptoPP. */
 #undef USE_CRYPTOPP
 
diff --git a/src/auth/AuthClientHandler.h b/src/auth/AuthClientHandler.h
index 5d129ad..e12cbb9 100644
--- a/src/auth/AuthClientHandler.h
+++ b/src/auth/AuthClientHandler.h
@@ -20,6 +20,7 @@
 
 #include "common/Mutex.h"
 #include "common/Cond.h"
+#include "common/RWLock.h"
 
 #include "common/Timer.h"
 
@@ -36,49 +37,42 @@ protected:
   uint32_t want;
   uint32_t have;
   uint32_t need;
+  RWLock lock;
 
 public:
   AuthClientHandler(CephContext *cct_) 
-    : cct(cct_), global_id(0), want(CEPH_ENTITY_TYPE_AUTH), have(0), need(0) {}
+    : cct(cct_), global_id(0), want(CEPH_ENTITY_TYPE_AUTH), have(0), need(0),
+      lock("AuthClientHandler::lock") {}
   virtual ~AuthClientHandler() {}
 
   void init(EntityName& n) { name = n; }
   
   void set_want_keys(__u32 keys) {
+    RWLock::WLocker l(lock);
     want = keys | CEPH_ENTITY_TYPE_AUTH;
     validate_tickets();
   }
   void add_want_keys(__u32 keys) {
+    RWLock::WLocker l(lock);
     want |= keys;
     validate_tickets();
   }   
 
-  bool have_keys(__u32 k) {
-    validate_tickets();
-    return (k & have) == have;
-  }
-  bool have_keys() {
-    validate_tickets();
-    return (want & have) == have;
-  }
-
-
-  virtual int get_protocol() = 0;
+  virtual int get_protocol() const = 0;
 
   virtual void reset() = 0;
-  virtual int build_request(bufferlist& bl) = 0;
+  virtual void prepare_build_request() = 0;
+  virtual int build_request(bufferlist& bl) const = 0;
   virtual int handle_response(int ret, bufferlist::iterator& iter) = 0;
-  virtual bool build_rotating_request(bufferlist& bl) = 0;
-
-  virtual void tick() = 0;
+  virtual bool build_rotating_request(bufferlist& bl) const = 0;
 
-  virtual AuthAuthorizer *build_authorizer(uint32_t service_id) = 0;
+  virtual AuthAuthorizer *build_authorizer(uint32_t service_id) const = 0;
 
-  virtual void validate_tickets() = 0;
   virtual bool need_tickets() = 0;
 
   virtual void set_global_id(uint64_t id) = 0;
-  uint64_t get_global_id() { return global_id; }
+protected:
+  virtual void validate_tickets() = 0;
 };
 
 
diff --git a/src/auth/cephx/CephxClientHandler.cc b/src/auth/cephx/CephxClientHandler.cc
index 2818b7a..b6d3501 100644
--- a/src/auth/cephx/CephxClientHandler.cc
+++ b/src/auth/cephx/CephxClientHandler.cc
@@ -27,16 +27,11 @@
 #define dout_prefix *_dout << "cephx client: "
 
 
-int CephxClientHandler::build_request(bufferlist& bl)
+int CephxClientHandler::build_request(bufferlist& bl) const
 {
   ldout(cct, 10) << "build_request" << dendl;
 
-  ldout(cct, 10) << "validate_tickets: want=" << want << " need=" << need << " have=" << have << dendl;
-  validate_tickets();
-
-  ldout(cct, 10) << "want=" << want << " need=" << need << " have=" << have << dendl;
-
-  CephXTicketHandler& ticket_handler = tickets.get_handler(CEPH_ENTITY_TYPE_AUTH);
+  RWLock::RLocker l(lock);
 
   if (need & CEPH_ENTITY_TYPE_AUTH) {
     /* authenticate */
@@ -57,7 +52,7 @@ int CephxClientHandler::build_request(bufferlist& bl)
       return -EIO;
     }
 
-    req.old_ticket = ticket_handler.ticket;
+    req.old_ticket = ticket_handler->ticket;
 
     if (req.old_ticket.blob.length()) {
       ldout(cct, 20) << "old ticket len=" << req.old_ticket.blob.length() << dendl;
@@ -77,7 +72,7 @@ int CephxClientHandler::build_request(bufferlist& bl)
     header.request_type = CEPHX_GET_PRINCIPAL_SESSION_KEY;
     ::encode(header, bl);
 
-    CephXAuthorizer *authorizer = ticket_handler.build_authorizer(global_id);
+    CephXAuthorizer *authorizer = ticket_handler->build_authorizer(global_id);
     if (!authorizer)
       return -EINVAL;
     bl.claim_append(authorizer->bl);
@@ -94,6 +89,7 @@ int CephxClientHandler::build_request(bufferlist& bl)
 int CephxClientHandler::handle_response(int ret, bufferlist::iterator& indata)
 {
   ldout(cct, 10) << "handle_response ret = " << ret << dendl;
+  RWLock::WLocker l(lock);
   
   if (ret < 0)
     return ret; // hrm!
@@ -176,14 +172,15 @@ int CephxClientHandler::handle_response(int ret, bufferlist::iterator& indata)
 
 
 
-AuthAuthorizer *CephxClientHandler::build_authorizer(uint32_t service_id)
+AuthAuthorizer *CephxClientHandler::build_authorizer(uint32_t service_id) const
 {
+  RWLock::RLocker l(lock);
   ldout(cct, 10) << "build_authorizer for service " << ceph_entity_type_name(service_id) << dendl;
   return tickets.build_authorizer(service_id);
 }
 
 
-bool CephxClientHandler::build_rotating_request(bufferlist& bl)
+bool CephxClientHandler::build_rotating_request(bufferlist& bl) const
 {
   ldout(cct, 10) << "build_rotating_request" << dendl;
   CephXRequestHeader header;
@@ -192,13 +189,27 @@ bool CephxClientHandler::build_rotating_request(bufferlist& bl)
   return true;
 }
 
+void CephxClientHandler::prepare_build_request()
+{
+  RWLock::WLocker l(lock);
+  ldout(cct, 10) << "validate_tickets: want=" << want << " need=" << need
+		 << " have=" << have << dendl;
+  validate_tickets();
+  ldout(cct, 10) << "want=" << want << " need=" << need << " have=" << have
+		 << dendl;
+
+  ticket_handler = &(tickets.get_handler(CEPH_ENTITY_TYPE_AUTH));
+}
+
 void CephxClientHandler::validate_tickets()
 {
+  // lock should be held for write
   tickets.validate_tickets(want, have, need);
 }
 
 bool CephxClientHandler::need_tickets()
 {
+  RWLock::WLocker l(lock);
   validate_tickets();
 
   ldout(cct, 20) << "need_tickets: want=" << want << " need=" << need << " have=" << have << dendl;
diff --git a/src/auth/cephx/CephxClientHandler.h b/src/auth/cephx/CephxClientHandler.h
index bf8108a..d200ac9 100644
--- a/src/auth/cephx/CephxClientHandler.h
+++ b/src/auth/cephx/CephxClientHandler.h
@@ -22,12 +22,13 @@ class CephContext;
 
 class CephxClientHandler : public AuthClientHandler {
   bool starting;
-  
+
   /* envelope protocol parameters */
   uint64_t server_challenge;
-  
+
   CephXTicketManager tickets;
-  
+  CephXTicketHandler* ticket_handler;
+
   RotatingKeyRing *rotating_secrets;
   KeyRing *keyring;
 
@@ -37,6 +38,7 @@ public:
       starting(false),
       server_challenge(0),
       tickets(cct_),
+      ticket_handler(NULL),
       rotating_secrets(rsecrets),
       keyring(rsecrets->get_keyring())
   {
@@ -44,26 +46,28 @@ public:
   }
 
   void reset() {
+    RWLock::WLocker l(lock);
     starting = true;
     server_challenge = 0;
   }
-  int build_request(bufferlist& bl);
+  void prepare_build_request();
+  int build_request(bufferlist& bl) const;
   int handle_response(int ret, bufferlist::iterator& iter);
-  bool build_rotating_request(bufferlist& bl);
+  bool build_rotating_request(bufferlist& bl) const;
 
-  int get_protocol() { return CEPH_AUTH_CEPHX; }
-  
-  void tick() {}
+  int get_protocol() const { return CEPH_AUTH_CEPHX; }
 
-  AuthAuthorizer *build_authorizer(uint32_t service_id);
+  AuthAuthorizer *build_authorizer(uint32_t service_id) const;
 
-  void validate_tickets();
   bool need_tickets();
 
   void set_global_id(uint64_t id) {
+    RWLock::WLocker l(lock);
     global_id = id;
     tickets.global_id = id;
   }
+private:
+  void validate_tickets();
 };
 
 #endif
diff --git a/src/auth/cephx/CephxProtocol.cc b/src/auth/cephx/CephxProtocol.cc
index 6956f45..f57f063 100644
--- a/src/auth/cephx/CephxProtocol.cc
+++ b/src/auth/cephx/CephxProtocol.cc
@@ -197,7 +197,7 @@ bool CephXTicketHandler::have_key()
   return have_key_flag;
 }
 
-bool CephXTicketHandler::need_key()
+bool CephXTicketHandler::need_key() const
 {
   if (have_key_flag) {
     return (!expires.is_zero()) && (ceph_clock_now(cct) >= renew_after);
@@ -214,9 +214,9 @@ bool CephXTicketManager::have_key(uint32_t service_id)
   return iter->second.have_key();
 }
 
-bool CephXTicketManager::need_key(uint32_t service_id)
+bool CephXTicketManager::need_key(uint32_t service_id) const
 {
-  map<uint32_t, CephXTicketHandler>::iterator iter = tickets_map.find(service_id);
+  map<uint32_t, CephXTicketHandler>::const_iterator iter = tickets_map.find(service_id);
   if (iter == tickets_map.end())
     return true;
   return iter->second.need_key();
@@ -290,7 +290,7 @@ bool CephXTicketManager::verify_service_ticket_reply(CryptoKey& secret,
  *
  * ticket, {timestamp}^session_key
  */
-CephXAuthorizer *CephXTicketHandler::build_authorizer(uint64_t global_id)
+CephXAuthorizer *CephXTicketHandler::build_authorizer(uint64_t global_id) const
 {
   CephXAuthorizer *a = new CephXAuthorizer(cct);
   a->session_key = session_key;
@@ -320,16 +320,16 @@ CephXAuthorizer *CephXTicketHandler::build_authorizer(uint64_t global_id)
  *
  * ticket, {timestamp}^session_key
  */
-CephXAuthorizer *CephXTicketManager::build_authorizer(uint32_t service_id)
+CephXAuthorizer *CephXTicketManager::build_authorizer(uint32_t service_id) const
 {
-  map<uint32_t, CephXTicketHandler>::iterator iter = tickets_map.find(service_id);
+  map<uint32_t, CephXTicketHandler>::const_iterator iter = tickets_map.find(service_id);
   if (iter == tickets_map.end()) {
     ldout(cct, 0) << "no TicketHandler for service "
 		  << ceph_entity_type_name(service_id) << dendl;
     return NULL;
   }
 
-  CephXTicketHandler& handler = iter->second;
+  const CephXTicketHandler& handler = iter->second;
   return handler.build_authorizer(global_id);
 }
 
diff --git a/src/auth/cephx/CephxProtocol.h b/src/auth/cephx/CephxProtocol.h
index 19f4f23..8a3e094 100644
--- a/src/auth/cephx/CephxProtocol.h
+++ b/src/auth/cephx/CephxProtocol.h
@@ -303,10 +303,10 @@ struct CephXTicketHandler {
   bool verify_service_ticket_reply(CryptoKey& principal_secret,
 				 bufferlist::iterator& indata);
   // to access the service
-  CephXAuthorizer *build_authorizer(uint64_t global_id);
+  CephXAuthorizer *build_authorizer(uint64_t global_id) const;
 
   bool have_key();
-  bool need_key();
+  bool need_key() const;
 
   void invalidate_ticket() {
     have_key_flag = 0;
@@ -335,9 +335,9 @@ struct CephXTicketManager {
     assert(res.second);
     return res.first->second;
   }
-  CephXAuthorizer *build_authorizer(uint32_t service_id);
+  CephXAuthorizer *build_authorizer(uint32_t service_id) const;
   bool have_key(uint32_t service_id);
-  bool need_key(uint32_t service_id);
+  bool need_key(uint32_t service_id) const;
   void set_have_need_key(uint32_t service_id, uint32_t& have, uint32_t& need);
   void validate_tickets(uint32_t mask, uint32_t& have, uint32_t& need);
   void invalidate_ticket(uint32_t service_id);
diff --git a/src/auth/none/AuthNoneClientHandler.h b/src/auth/none/AuthNoneClientHandler.h
index 61c1db9..203687e 100644
--- a/src/auth/none/AuthNoneClientHandler.h
+++ b/src/auth/none/AuthNoneClientHandler.h
@@ -27,15 +27,15 @@ public:
 
   void reset() { }
 
-  int build_request(bufferlist& bl) { return 0; }
+  void prepare_build_request() {}
+  int build_request(bufferlist& bl) const { return 0; }
   int handle_response(int ret, bufferlist::iterator& iter) { return 0; }
-  bool build_rotating_request(bufferlist& bl) { return false; }
+  bool build_rotating_request(bufferlist& bl) const { return false; }
 
-  int get_protocol() { return CEPH_AUTH_NONE; }
+  int get_protocol() const { return CEPH_AUTH_NONE; }
   
-  void tick() {}
-
-  AuthAuthorizer *build_authorizer(uint32_t service_id) {
+  AuthAuthorizer *build_authorizer(uint32_t service_id) const {
+    RWLock::RLocker l(lock);
     AuthNoneAuthorizer *auth = new AuthNoneAuthorizer();
     if (auth) {
       auth->build_authorizer(cct->_conf->name, global_id);
@@ -43,10 +43,14 @@ public:
     return auth;
   }
 
-  void validate_tickets() { }
   bool need_tickets() { return false; }
 
-  void set_global_id(uint64_t id) { global_id = id; }
+  void set_global_id(uint64_t id) {
+    RWLock::WLocker l(lock);
+    global_id = id;
+  }
+private:
+  void validate_tickets() {}
 };
 
 #endif
diff --git a/src/auth/unknown/AuthUnknownClientHandler.h b/src/auth/unknown/AuthUnknownClientHandler.h
index b26b382..088b816 100644
--- a/src/auth/unknown/AuthUnknownClientHandler.h
+++ b/src/auth/unknown/AuthUnknownClientHandler.h
@@ -27,15 +27,15 @@ public:
 
   void reset() { }
 
-  int build_request(bufferlist& bl) { return 0; }
+  void prepare_build_request() {}
+  int build_request(bufferlist& bl) const { return 0; }
   int handle_response(int ret, bufferlist::iterator& iter) { return 0; }
-  bool build_rotating_request(bufferlist& bl) { return false; }
+  bool build_rotating_request(bufferlist& bl) const { return false; }
 
-  int get_protocol() { return CEPH_AUTH_UNKNOWN; }
+  int get_protocol() const { return CEPH_AUTH_UNKNOWN; }
   
-  void tick() {}
-
-  AuthAuthorizer *build_authorizer(uint32_t service_id) {
+  AuthAuthorizer *build_authorizer(uint32_t service_id) const {
+    RWLock::RLocker l(lock);
     AuthUnknownAuthorizer *auth = new AuthUnknownAuthorizer();
     if (auth) {
       auth->build_authorizer(cct->_conf->name, global_id);
@@ -43,10 +43,14 @@ public:
     return auth;
   }
 
-  void validate_tickets() { }
   bool need_tickets() { return false; }
 
-  void set_global_id(uint64_t id) { global_id = id; }
+  void set_global_id(uint64_t id) {
+    RWLock::WLocker l(lock);
+    global_id = id;
+  }
+private:
+  void validate_tickets() { }
 };
 
 #endif
diff --git a/src/brag/README.md b/src/brag/README.md
index 55af44f..574d7fd 100644
--- a/src/brag/README.md
+++ b/src/brag/README.md
@@ -24,7 +24,8 @@ Run 'ceph-brag -h' to get the usage information of this tool.
       "cluster_creation_date": "2014-01-16 13:38:41.928551",
       "uuid": "20679d0e-04b1-4004-8ee9-45ac271510e9",
       "components_count": {
-        "num_bytes": 0,
+        "num_data_bytes": 0,
+        "num_bytes_total": 1209312904,
         "num_osds": 1,
         "num_objects": 0,
         "num_pgs": 192,
diff --git a/src/brag/client/ceph-brag b/src/brag/client/ceph-brag
index e07ad01..91981e5 100755
--- a/src/brag/client/ceph-brag
+++ b/src/brag/client/ceph-brag
@@ -30,14 +30,6 @@ def get_uuid():
 
   return uid
 
-def get_cluster_creation_date():
-  (rc, o, e) = run_command(['ceph', 'mon', 'dump', '-f', 'json'])
-  if rc is not 0:
-    raise RuntimeError("\'ceph mon dump\' failed - " + e)
-
-  oj = json.loads(o)
-  return oj['created']
-
 def bytes_pretty_to_raw(byte_count, byte_scale):
   if byte_scale == 'kB':
     return byte_count >> 10
@@ -66,7 +58,8 @@ def get_nums():
 
   pgmap = oj['pgmap']
   num_pgs = pgmap['num_pgs']
-  num_bytes = pgmap['data_bytes']
+  num_data_bytes = pgmap['data_bytes']
+  num_bytes_total = pgmap['bytes_total']
 
   (rc, o, e) = run_command(['ceph', 'pg', 'dump', 'pools', '-f', 'json-pretty'])
   if rc is not 0:
@@ -82,7 +75,8 @@ def get_nums():
           'num_osds':num_osds,
           'num_mdss':num_mdss,
           'num_pgs':num_pgs,
-          'num_bytes':num_bytes,
+          'num_data_bytes':num_data_bytes,
+          'num_bytes_total':num_bytes_total,
           'num_pools':num_pools,
           'num_objects':num_objs}
   return nums
@@ -100,31 +94,23 @@ def get_crush_types():
   for t in crush_dump['types']:
     crush_types[t['type_id']] = t['name']
 
-  buckets = {}
-  items_list = []
+  types_list = []
   for bucket in crush_dump['buckets']:
-    buckets[bucket['id']] = bucket['type_id']
-    for item in bucket['items']:
-      items_list.append(item['id'])
+    types_list.append(bucket['type_id'])
 
   crush_map = []
-  counter = Counter(items_list)
+  types_counter = Counter(types_list)
   append = lambda t,c: crush_map.append({'type':t, 'count':c})
-  for id,count in counter.items():
-    if id in buckets:
-      append(crush_types[buckets[id]],
+  for id,count in types_counter.items():
+      append(crush_types[id],
              count)
-      del buckets[id]
-    else:
-      append(crush_types[id], count)
 
-  #the root item
-  for id,type_id in buckets.items():
-    append(crush_types[type_id], 1)
+  if 'devices' in crush_dump:
+    append('devices', len(crush_dump['devices']))
 
   return crush_map
 
-def get_pool_metadata():
+def get_osd_dump_info():
   (rc, o, e) = run_command(['ceph', 'osd', 'dump', '-f', 'json'])
   if rc is not 0:
     raise RuntimeError("\'ceph osd dump\' failed - " + e)
@@ -135,7 +121,7 @@ def get_pool_metadata():
   for p in oj['pools']:
     pool_meta.append(proc(p))
 
-  return pool_meta
+  return oj['created'], pool_meta
 
 def get_sysinfo(max_osds):
   count = 0
@@ -218,12 +204,11 @@ def output_json():
   url = None
   
   out['uuid'] = get_uuid()
-  out['cluster_creation_date'] = get_cluster_creation_date()
   nums = get_nums()
   num_osds = int(nums['num_osds'])
   out['components_count'] = nums
   out['crush_types'] = get_crush_types()
-  out['pool_metadata'] = get_pool_metadata()
+  out['cluster_creation_date'], out['pool_metadata'] = get_osd_dump_info()
   out['sysinfo'] = get_sysinfo(num_osds)
 
   owner = get_ownership_info()
diff --git a/src/brag/server/ceph_brag/json.py b/src/brag/server/ceph_brag/json.py
index bc46702..7f36eb6 100644
--- a/src/brag/server/ceph_brag/json.py
+++ b/src/brag/server/ceph_brag/json.py
@@ -22,7 +22,8 @@ def jsonify_cluster_info(ci):
 @jsonify.register(db.components_info)
 def jsonify_components_info(comps):
     return dict(
-            num_bytes=comps.num_bytes,
+            num_data_bytes=comps.num_data_bytes,
+            num_bytes_total=comps.num_bytes_total,
             num_osds=comps.num_osds,
             num_objects=comps.num_objects,
             num_pgs=comps.num_pgs,
diff --git a/src/brag/server/ceph_brag/model/db.py b/src/brag/server/ceph_brag/model/db.py
index 94d98ff..5dfc745 100644
--- a/src/brag/server/ceph_brag/model/db.py
+++ b/src/brag/server/ceph_brag/model/db.py
@@ -35,7 +35,8 @@ class components_info(Base):
 
   index = Column(Integer, primary_key=True)
   vid = Column(ForeignKey('version_info.index'))
-  num_bytes = Column(BigInteger)
+  num_data_bytes = Column(BigInteger)
+  num_bytes_total = Column(BigInteger)
   num_osds = Column(Integer)
   num_objects = Column(Integer)
   num_pgs = Column(Integer)
@@ -168,7 +169,8 @@ def put_new_version(data):
   def add_components_info(vi):
     comps_count= info['components_count']
     comps_info = components_info(vid=vi.index,
-                         num_bytes=comps_count['num_bytes'],
+                         num_data_bytes=comps_count['num_data_bytes'],
+                         num_bytes_total=comps_count['num_bytes_total'],
                          num_osds=comps_count['num_osds'],
                          num_objects=comps_count['num_objects'],
                          num_pgs=comps_count['num_pgs'],
diff --git a/src/ceph_mon.cc b/src/ceph_mon.cc
index 0aa6b20..4e84b4d 100644
--- a/src/ceph_mon.cc
+++ b/src/ceph_mon.cc
@@ -108,7 +108,7 @@ int mon_data_exists(bool *r)
     if (errno == ENOENT) {
       *r = false;
     } else {
-      cerr << "stat(" << mon_data << ") " << strerror(errno) << std::endl;
+      cerr << "stat(" << mon_data << ") " << cpp_strerror(errno) << std::endl;
       return -errno;
     }
   } else {
@@ -123,7 +123,7 @@ int mon_data_empty(bool *r)
 
   DIR *dir = ::opendir(mon_data.c_str());
   if (!dir) {
-    cerr << "opendir(" << mon_data << ") " << strerror(errno) << std::endl;
+    cerr << "opendir(" << mon_data << ") " << cpp_strerror(errno) << std::endl;
     return -errno;
   }
   char buf[offsetof(struct dirent, d_name) + PATH_MAX + 1];
@@ -135,7 +135,7 @@ int mon_data_empty(bool *r)
   while (!::readdir_r(dir, reinterpret_cast<struct dirent*>(buf), &de)) {
     if (!de) {
       if (errno) {
-	cerr << "readdir(" << mon_data << ") " << strerror(errno) << std::endl;
+	cerr << "readdir(" << mon_data << ") " << cpp_strerror(errno) << std::endl;
 	code = -errno;
       }
       break;
@@ -285,7 +285,7 @@ int main(int argc, const char **argv)
     if (!exists) {
       if (::mkdir(g_conf->mon_data.c_str(), 0755)) {
 	cerr << "mkdir(" << g_conf->mon_data << ") : "
-	     << strerror(errno) << std::endl;
+	     << cpp_strerror(errno) << std::endl;
 	exit(1);
       }
     }
diff --git a/src/cephfs.cc b/src/cephfs.cc
index f25d02a..90aee32 100644
--- a/src/cephfs.cc
+++ b/src/cephfs.cc
@@ -65,8 +65,7 @@ int main (int argc, char **argv) {
     memset(&layout, 0, sizeof(layout));
     err = ioctl(fd, CEPH_IOC_GET_LAYOUT, (unsigned long)&layout);
     if (err) {
-      cerr << "Error getting layout: "
-	   << (err == -1 ? strerror(errno) : strerror(-err)) << endl;
+      cerr << "Error getting layout: " << cpp_strerror(errno) << endl;
       return 1;
     }
     if (layout.stripe_unit == 0) {
@@ -82,8 +81,7 @@ int main (int argc, char **argv) {
     location.file_offset = file_offset;
     err = ioctl(fd, CEPH_IOC_GET_DATALOC, (unsigned long)&location);
     if (err) {
-      cerr << "Error getting location: "
-	   << (err == -1 ? strerror(errno) : strerror(-err)) << endl;
+      cerr << "Error getting location: " << cpp_strerror(err) << endl;
       return 1;
     }
     cout << "location.file_offset:  " << location.file_offset << endl;
@@ -106,16 +104,14 @@ int main (int argc, char **argv) {
     layout.unused = -1;   /* used to be preferred_osd */
     err = ioctl(fd, ioctl_num, (unsigned long)&layout);
     if (err) {
-      cerr << "Error setting layout: " 
-	   << (err == -1 ? strerror(errno) : strerror(-err)) << endl;
+      cerr << "Error setting layout: " << cpp_strerror(errno) << endl;
       return 1;
     }
   } else if (CMD_MAP == cmd) {
     struct stat st;
     err = ::fstat(fd, &st);
     if (err < 0) {
-      cerr << "error statting file: "
-	   << (err == -1 ? strerror(errno) : strerror(-err)) << endl;
+      cerr << "error statting file: " << cpp_strerror(errno) << endl;
       return 1;
     }
 
@@ -123,8 +119,7 @@ int main (int argc, char **argv) {
     memset(&layout, 0, sizeof(layout));
     err = ioctl(fd, CEPH_IOC_GET_LAYOUT, (unsigned long)&layout);
     if (err) {
-      cerr << "Error getting layout: "
-	   << (err == -1 ? strerror(errno) : strerror(-err)) << endl;
+      cerr << "Error getting layout: " << cpp_strerror(errno) << endl;
       return 1;
     }
 
@@ -136,8 +131,7 @@ int main (int argc, char **argv) {
       location.file_offset = off;
       err = ioctl(fd, CEPH_IOC_GET_DATALOC, (unsigned long)&location);
       if (err) {
-	cerr << "Error getting location: "
-	     << (err == -1 ? strerror(errno) : strerror(-err)) << endl;
+	cerr << "Error getting location: " << cpp_strerror(errno) << endl;
 	return 1;
       }
       printf("%15lld  %24s  %12lld  %12lld  %d\n",
@@ -193,7 +187,7 @@ int init_options(int argc, char **argv, int *fd, char **path, int *cmd,
 
   *fd = open(argv[1], O_RDONLY);
   if (*fd < 0) {
-    cerr << "error opening path: " << strerror(*fd) << endl;
+    cerr << "error opening path: " << cpp_strerror(*fd) << endl;
     return 1;
   }
 
diff --git a/src/civetweb/civetweb.h b/src/civetweb/civetweb.h
index b8be9c4..a6ca3e7 100644
--- a/src/civetweb/civetweb.h
+++ b/src/civetweb/civetweb.h
@@ -1,4 +1,5 @@
-/* Copyright (c) 2004-2013 Sergey Lyubka
+/* Copyright (c) 2013-2014 the Civetweb developers
+ * Copyright (c) 2004-2013 Sergey Lyubka
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
@@ -26,6 +27,20 @@
 #define CIVETWEB_VERSION "1.6"
 #endif
 
+#ifndef CIVETWEB_API
+    #if defined(_WIN32)
+        #if defined(CIVETWEB_DLL_EXPORTS)
+            #define CIVETWEB_API __declspec(dllexport)
+        #elif defined(CIVETWEB_DLL_IMPORTS)
+            #define CIVETWEB_API __declspec(dllimport)
+        #else
+            #define CIVETWEB_API
+        #endif
+    #else
+        #define CIVETWEB_API
+    #endif
+#endif
+
 #include <stdio.h>
 #include <stddef.h>
 
@@ -140,6 +155,7 @@ struct mg_callbacks {
     int  (*http_error)(struct mg_connection *, int status);
 };
 
+
 /* Start web server.
 
    Parameters:
@@ -165,7 +181,7 @@ struct mg_callbacks {
 
    Return:
      web server context, or NULL on error. */
-struct mg_context *mg_start(const struct mg_callbacks *callbacks,
+CIVETWEB_API struct mg_context *mg_start(const struct mg_callbacks *callbacks,
                             void *user_data,
                             const char **configuration_options);
 
@@ -175,7 +191,8 @@ struct mg_context *mg_start(const struct mg_callbacks *callbacks,
    Must be called last, when an application wants to stop the web server and
    release all associated resources. This function blocks until all Civetweb
    threads are stopped. Context pointer becomes invalid. */
-void mg_stop(struct mg_context *);
+CIVETWEB_API void mg_stop(struct mg_context *);
+
 
 /* mg_request_handler
 
@@ -190,6 +207,7 @@ void mg_stop(struct mg_context *);
       1: the handler processed the request. */
 typedef int (* mg_request_handler)(struct mg_connection *conn, void *cbdata);
 
+
 /* mg_set_request_handler
 
    Sets or removes a URI mapping for a request handler.
@@ -206,7 +224,7 @@ typedef int (* mg_request_handler)(struct mg_connection *conn, void *cbdata);
       handler: the callback handler to use when the URI is requested.
                If NULL, the URI will be removed.
       cbdata: the callback data to give to the handler when it s requested. */
-void mg_set_request_handler(struct mg_context *ctx, const char *uri, mg_request_handler handler, void *cbdata);
+CIVETWEB_API void mg_set_request_handler(struct mg_context *ctx, const char *uri, mg_request_handler handler, void *cbdata);
 
 
 /* Get the value of particular configuration parameter.
@@ -215,14 +233,41 @@ void mg_set_request_handler(struct mg_context *ctx, const char *uri, mg_request_
    If given parameter name is not valid, NULL is returned. For valid
    names, return value is guaranteed to be non-NULL. If parameter is not
    set, zero-length string is returned. */
-const char *mg_get_option(const struct mg_context *ctx, const char *name);
+CIVETWEB_API const char *mg_get_option(const struct mg_context *ctx, const char *name);
 
 
+#if defined(MG_LEGACY_INTERFACE)
 /* Return array of strings that represent valid configuration options.
    For each option, option name and default value is returned, i.e. the
    number of entries in the array equals to number_of_options x 2.
    Array is NULL terminated. */
-const char **mg_get_valid_option_names(void);
+/* Deprecated: Use mg_get_valid_options instead. */
+CIVETWEB_API const char **mg_get_valid_option_names(void);
+#endif
+
+
+struct mg_option {
+    const char * name;
+    int type;
+    const char * default_value;
+};
+
+enum {
+    CONFIG_TYPE_UNKNOWN = 0x0,
+    CONFIG_TYPE_NUMBER = 0x1,
+    CONFIG_TYPE_STRING = 0x2,
+    CONFIG_TYPE_FILE = 0x3,
+    CONFIG_TYPE_DIRECTORY = 0x4,
+    CONFIG_TYPE_BOOLEAN = 0x5,
+    CONFIG_TYPE_EXT_PATTERN = 0x6
+};
+
+
+/* Return array of struct mg_option, representing all valid configuration
+   options of civetweb.c.
+   The array is terminated by a NULL name option. */
+CIVETWEB_API const struct mg_option *mg_get_valid_options(void);
+
 
 /* Get the list of ports that civetweb is listening on.
    size is the size of the ports int array and ssl int array to fill.
@@ -231,7 +276,8 @@ const char **mg_get_valid_option_names(void);
    Return value is the number of ports and ssl information filled in.
    The value returned is read-only. Civetweb does not allow changing
    configuration at run time. */
-size_t mg_get_ports(const struct mg_context *ctx, size_t size, int* ports, int* ssl);
+CIVETWEB_API size_t mg_get_ports(const struct mg_context *ctx, size_t size, int* ports, int* ssl);
+
 
 /* Add, edit or delete the entry in the passwords file.
 
@@ -245,14 +291,14 @@ size_t mg_get_ports(const struct mg_context *ctx, size_t size, int* ports, int*
 
    Return:
      1 on success, 0 on error. */
-int mg_modify_passwords_file(const char *passwords_file_name,
-                             const char *domain,
-                             const char *user,
-                             const char *password);
+CIVETWEB_API int mg_modify_passwords_file(const char *passwords_file_name,
+                                          const char *domain,
+                                          const char *user,
+                                          const char *password);
 
 
 /* Return information associated with the request. */
-struct mg_request_info *mg_get_request_info(struct mg_connection *);
+CIVETWEB_API struct mg_request_info *mg_get_request_info(struct mg_connection *);
 
 
 /* Send data to the client.
@@ -260,7 +306,7 @@ struct mg_request_info *mg_get_request_info(struct mg_connection *);
     0   when the connection has been closed
     -1  on error
     >0  number of bytes written on success */
-int mg_write(struct mg_connection *, const void *buf, size_t len);
+CIVETWEB_API int mg_write(struct mg_connection *, const void *buf, size_t len);
 
 
 /* Send data to a websocket client wrapped in a websocket frame.  Uses mg_lock
@@ -275,16 +321,18 @@ int mg_write(struct mg_connection *, const void *buf, size_t len);
     0   when the connection has been closed
     -1  on error
     >0  number of bytes written on success */
-int mg_websocket_write(struct mg_connection* conn, int opcode,
-                       const char *data, size_t data_len);
+CIVETWEB_API int mg_websocket_write(struct mg_connection* conn, int opcode,
+                                    const char *data, size_t data_len);
+
 
 /* Blocks until unique access is obtained to this connection. Intended for use
    with websockets only.
    Invoke this before mg_write or mg_printf when communicating with a
    websocket if your code has server-initiated communication as well as
    communication in direct response to a message. */
-void mg_lock(struct mg_connection* conn);
-void mg_unlock(struct mg_connection* conn);
+CIVETWEB_API void mg_lock(struct mg_connection* conn);
+CIVETWEB_API void mg_unlock(struct mg_connection* conn);
+
 
 /* Opcodes, from http://tools.ietf.org/html/rfc6455 */
 enum {
@@ -317,14 +365,13 @@ enum {
 #endif
 
 /* Send data to the client using printf() semantics.
-
    Works exactly like mg_write(), but allows to do message formatting. */
-int mg_printf(struct mg_connection *,
-              PRINTF_FORMAT_STRING(const char *fmt), ...) PRINTF_ARGS(2, 3);
+CIVETWEB_API int mg_printf(struct mg_connection *,
+                           PRINTF_FORMAT_STRING(const char *fmt), ...) PRINTF_ARGS(2, 3);
 
 
 /* Send contents of the entire file together with HTTP headers. */
-void mg_send_file(struct mg_connection *conn, const char *path);
+CIVETWEB_API void mg_send_file(struct mg_connection *conn, const char *path);
 
 
 /* Read data from the remote end, return number of bytes read.
@@ -332,7 +379,7 @@ void mg_send_file(struct mg_connection *conn, const char *path);
      0     connection has been closed by peer. No more data could be read.
      < 0   read error. No more data could be read from the connection.
      > 0   number of bytes read into the buffer. */
-int mg_read(struct mg_connection *, void *buf, size_t len);
+CIVETWEB_API int mg_read(struct mg_connection *, void *buf, size_t len);
 
 
 /* Get the value of particular HTTP header.
@@ -340,7 +387,7 @@ int mg_read(struct mg_connection *, void *buf, size_t len);
    This is a helper function. It traverses request_info->http_headers array,
    and if the header is present in the array, returns its value. If it is
    not present, NULL is returned. */
-const char *mg_get_header(const struct mg_connection *, const char *name);
+CIVETWEB_API const char *mg_get_header(const struct mg_connection *, const char *name);
 
 
 /* Get a value of particular form variable.
@@ -362,8 +409,9 @@ const char *mg_get_header(const struct mg_connection *, const char *name);
 
    Destination buffer is guaranteed to be '\0' - terminated if it is not
    NULL or zero length. */
-int mg_get_var(const char *data, size_t data_len,
-               const char *var_name, char *dst, size_t dst_len);
+CIVETWEB_API int mg_get_var(const char *data, size_t data_len,
+                            const char *var_name, char *dst, size_t dst_len);
+
 
 /* Get a value of particular form variable.
 
@@ -388,8 +436,9 @@ int mg_get_var(const char *data, size_t data_len,
 
    Destination buffer is guaranteed to be '\0' - terminated if it is not
    NULL or zero length. */
-int mg_get_var2(const char *data, size_t data_len,
-                const char *var_name, char *dst, size_t dst_len, size_t occurrence);
+CIVETWEB_API int mg_get_var2(const char *data, size_t data_len,
+                             const char *var_name, char *dst, size_t dst_len, size_t occurrence);
+
 
 /* Fetch value of certain cookie variable into the destination buffer.
 
@@ -404,8 +453,8 @@ int mg_get_var2(const char *data, size_t data_len,
             parameter is not found).
         -2 (destination buffer is NULL, zero length or too small to hold the
             value). */
-int mg_get_cookie(const char *cookie, const char *var_name,
-                  char *buf, size_t buf_len);
+CIVETWEB_API int mg_get_cookie(const char *cookie, const char *var_name,
+                               char *buf, size_t buf_len);
 
 
 /* Download data from the remote web server.
@@ -423,35 +472,36 @@ int mg_get_cookie(const char *cookie, const char *var_name,
      conn = mg_download("google.com", 80, 0, ebuf, sizeof(ebuf),
                         "%s", "GET / HTTP/1.0\r\nHost: google.com\r\n\r\n");
  */
-struct mg_connection *mg_download(const char *host, int port, int use_ssl,
-                                  char *error_buffer, size_t error_buffer_size,
-                                  PRINTF_FORMAT_STRING(const char *request_fmt),
-                                  ...) PRINTF_ARGS(6, 7);
+CIVETWEB_API struct mg_connection *mg_download(const char *host, int port, int use_ssl,
+                                               char *error_buffer, size_t error_buffer_size,
+                                               PRINTF_FORMAT_STRING(const char *request_fmt),
+                                               ...) PRINTF_ARGS(6, 7);
 
 
 /* Close the connection opened by mg_download(). */
-void mg_close_connection(struct mg_connection *conn);
+CIVETWEB_API void mg_close_connection(struct mg_connection *conn);
 
 
 /* File upload functionality. Each uploaded file gets saved into a temporary
    file and MG_UPLOAD event is sent.
    Return number of uploaded files. */
-int mg_upload(struct mg_connection *conn, const char *destination_dir);
+CIVETWEB_API int mg_upload(struct mg_connection *conn, const char *destination_dir);
 
 
 /* Convenience function -- create detached thread.
    Return: 0 on success, non-0 on error. */
 typedef void * (*mg_thread_func_t)(void *);
-int mg_start_thread(mg_thread_func_t f, void *p);
+CIVETWEB_API int mg_start_thread(mg_thread_func_t f, void *p);
 
 
 /* Return builtin mime type for the given file name.
    For unrecognized extensions, "text/plain" is returned. */
-const char *mg_get_builtin_mime_type(const char *file_name);
+CIVETWEB_API const char *mg_get_builtin_mime_type(const char *file_name);
 
 
 /* Return Civetweb version. */
-const char *mg_version(void);
+CIVETWEB_API const char *mg_version(void);
+
 
 /* URL-decode input buffer into destination buffer.
    0-terminate the destination buffer.
@@ -459,13 +509,15 @@ const char *mg_version(void);
    uses '+' as character for space, see RFC 1866 section 8.2.1
    http://ftp.ics.uci.edu/pub/ietf/html/rfc1866.txt
    Return: length of the decoded data, or -1 if dst buffer is too small. */
-int mg_url_decode(const char *src, int src_len, char *dst,
-                  int dst_len, int is_form_url_encoded);
+CIVETWEB_API int mg_url_decode(const char *src, int src_len, char *dst,
+                               int dst_len, int is_form_url_encoded);
+
 
 /* URL-encode input buffer into destination buffer.
    returns the length of the resulting buffer or -1
    is the buffer is too small. */
-int mg_url_encode(const char *src, char *dst, size_t dst_len);
+CIVETWEB_API int mg_url_encode(const char *src, char *dst, size_t dst_len);
+
 
 /* MD5 hash given strings.
    Buffer 'buf' must be 33 bytes long. Varargs is a NULL terminated list of
@@ -473,7 +525,7 @@ int mg_url_encode(const char *src, char *dst, size_t dst_len);
    MD5 hash. Example:
      char buf[33];
      mg_md5(buf, "aa", "bb", NULL); */
-char *mg_md5(char buf[33], ...);
+CIVETWEB_API char *mg_md5(char buf[33], ...);
 
 
 /* Print error message to the opened error log stream.
@@ -483,11 +535,13 @@ char *mg_md5(char buf[33], ...);
      ...: variable argument list
    Example:
      mg_cry(conn,"i like %s", "logging"); */
-void mg_cry(struct mg_connection *conn,
-            PRINTF_FORMAT_STRING(const char *fmt), ...) PRINTF_ARGS(2, 3);
+CIVETWEB_API void mg_cry(struct mg_connection *conn,
+                         PRINTF_FORMAT_STRING(const char *fmt), ...) PRINTF_ARGS(2, 3);
+
 
 /* utility method to compare two buffers, case incensitive. */
-int mg_strncasecmp(const char *s1, const char *s2, size_t len);
+CIVETWEB_API int mg_strncasecmp(const char *s1, const char *s2, size_t len);
+
 
 #ifdef __cplusplus
 }
diff --git a/src/civetweb/include/civetweb.h b/src/civetweb/include/civetweb.h
index b8be9c4..a6ca3e7 100644
--- a/src/civetweb/include/civetweb.h
+++ b/src/civetweb/include/civetweb.h
@@ -1,4 +1,5 @@
-/* Copyright (c) 2004-2013 Sergey Lyubka
+/* Copyright (c) 2013-2014 the Civetweb developers
+ * Copyright (c) 2004-2013 Sergey Lyubka
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
@@ -26,6 +27,20 @@
 #define CIVETWEB_VERSION "1.6"
 #endif
 
+#ifndef CIVETWEB_API
+    #if defined(_WIN32)
+        #if defined(CIVETWEB_DLL_EXPORTS)
+            #define CIVETWEB_API __declspec(dllexport)
+        #elif defined(CIVETWEB_DLL_IMPORTS)
+            #define CIVETWEB_API __declspec(dllimport)
+        #else
+            #define CIVETWEB_API
+        #endif
+    #else
+        #define CIVETWEB_API
+    #endif
+#endif
+
 #include <stdio.h>
 #include <stddef.h>
 
@@ -140,6 +155,7 @@ struct mg_callbacks {
     int  (*http_error)(struct mg_connection *, int status);
 };
 
+
 /* Start web server.
 
    Parameters:
@@ -165,7 +181,7 @@ struct mg_callbacks {
 
    Return:
      web server context, or NULL on error. */
-struct mg_context *mg_start(const struct mg_callbacks *callbacks,
+CIVETWEB_API struct mg_context *mg_start(const struct mg_callbacks *callbacks,
                             void *user_data,
                             const char **configuration_options);
 
@@ -175,7 +191,8 @@ struct mg_context *mg_start(const struct mg_callbacks *callbacks,
    Must be called last, when an application wants to stop the web server and
    release all associated resources. This function blocks until all Civetweb
    threads are stopped. Context pointer becomes invalid. */
-void mg_stop(struct mg_context *);
+CIVETWEB_API void mg_stop(struct mg_context *);
+
 
 /* mg_request_handler
 
@@ -190,6 +207,7 @@ void mg_stop(struct mg_context *);
       1: the handler processed the request. */
 typedef int (* mg_request_handler)(struct mg_connection *conn, void *cbdata);
 
+
 /* mg_set_request_handler
 
    Sets or removes a URI mapping for a request handler.
@@ -206,7 +224,7 @@ typedef int (* mg_request_handler)(struct mg_connection *conn, void *cbdata);
       handler: the callback handler to use when the URI is requested.
                If NULL, the URI will be removed.
       cbdata: the callback data to give to the handler when it s requested. */
-void mg_set_request_handler(struct mg_context *ctx, const char *uri, mg_request_handler handler, void *cbdata);
+CIVETWEB_API void mg_set_request_handler(struct mg_context *ctx, const char *uri, mg_request_handler handler, void *cbdata);
 
 
 /* Get the value of particular configuration parameter.
@@ -215,14 +233,41 @@ void mg_set_request_handler(struct mg_context *ctx, const char *uri, mg_request_
    If given parameter name is not valid, NULL is returned. For valid
    names, return value is guaranteed to be non-NULL. If parameter is not
    set, zero-length string is returned. */
-const char *mg_get_option(const struct mg_context *ctx, const char *name);
+CIVETWEB_API const char *mg_get_option(const struct mg_context *ctx, const char *name);
 
 
+#if defined(MG_LEGACY_INTERFACE)
 /* Return array of strings that represent valid configuration options.
    For each option, option name and default value is returned, i.e. the
    number of entries in the array equals to number_of_options x 2.
    Array is NULL terminated. */
-const char **mg_get_valid_option_names(void);
+/* Deprecated: Use mg_get_valid_options instead. */
+CIVETWEB_API const char **mg_get_valid_option_names(void);
+#endif
+
+
+struct mg_option {
+    const char * name;
+    int type;
+    const char * default_value;
+};
+
+enum {
+    CONFIG_TYPE_UNKNOWN = 0x0,
+    CONFIG_TYPE_NUMBER = 0x1,
+    CONFIG_TYPE_STRING = 0x2,
+    CONFIG_TYPE_FILE = 0x3,
+    CONFIG_TYPE_DIRECTORY = 0x4,
+    CONFIG_TYPE_BOOLEAN = 0x5,
+    CONFIG_TYPE_EXT_PATTERN = 0x6
+};
+
+
+/* Return array of struct mg_option, representing all valid configuration
+   options of civetweb.c.
+   The array is terminated by a NULL name option. */
+CIVETWEB_API const struct mg_option *mg_get_valid_options(void);
+
 
 /* Get the list of ports that civetweb is listening on.
    size is the size of the ports int array and ssl int array to fill.
@@ -231,7 +276,8 @@ const char **mg_get_valid_option_names(void);
    Return value is the number of ports and ssl information filled in.
    The value returned is read-only. Civetweb does not allow changing
    configuration at run time. */
-size_t mg_get_ports(const struct mg_context *ctx, size_t size, int* ports, int* ssl);
+CIVETWEB_API size_t mg_get_ports(const struct mg_context *ctx, size_t size, int* ports, int* ssl);
+
 
 /* Add, edit or delete the entry in the passwords file.
 
@@ -245,14 +291,14 @@ size_t mg_get_ports(const struct mg_context *ctx, size_t size, int* ports, int*
 
    Return:
      1 on success, 0 on error. */
-int mg_modify_passwords_file(const char *passwords_file_name,
-                             const char *domain,
-                             const char *user,
-                             const char *password);
+CIVETWEB_API int mg_modify_passwords_file(const char *passwords_file_name,
+                                          const char *domain,
+                                          const char *user,
+                                          const char *password);
 
 
 /* Return information associated with the request. */
-struct mg_request_info *mg_get_request_info(struct mg_connection *);
+CIVETWEB_API struct mg_request_info *mg_get_request_info(struct mg_connection *);
 
 
 /* Send data to the client.
@@ -260,7 +306,7 @@ struct mg_request_info *mg_get_request_info(struct mg_connection *);
     0   when the connection has been closed
     -1  on error
     >0  number of bytes written on success */
-int mg_write(struct mg_connection *, const void *buf, size_t len);
+CIVETWEB_API int mg_write(struct mg_connection *, const void *buf, size_t len);
 
 
 /* Send data to a websocket client wrapped in a websocket frame.  Uses mg_lock
@@ -275,16 +321,18 @@ int mg_write(struct mg_connection *, const void *buf, size_t len);
     0   when the connection has been closed
     -1  on error
     >0  number of bytes written on success */
-int mg_websocket_write(struct mg_connection* conn, int opcode,
-                       const char *data, size_t data_len);
+CIVETWEB_API int mg_websocket_write(struct mg_connection* conn, int opcode,
+                                    const char *data, size_t data_len);
+
 
 /* Blocks until unique access is obtained to this connection. Intended for use
    with websockets only.
    Invoke this before mg_write or mg_printf when communicating with a
    websocket if your code has server-initiated communication as well as
    communication in direct response to a message. */
-void mg_lock(struct mg_connection* conn);
-void mg_unlock(struct mg_connection* conn);
+CIVETWEB_API void mg_lock(struct mg_connection* conn);
+CIVETWEB_API void mg_unlock(struct mg_connection* conn);
+
 
 /* Opcodes, from http://tools.ietf.org/html/rfc6455 */
 enum {
@@ -317,14 +365,13 @@ enum {
 #endif
 
 /* Send data to the client using printf() semantics.
-
    Works exactly like mg_write(), but allows to do message formatting. */
-int mg_printf(struct mg_connection *,
-              PRINTF_FORMAT_STRING(const char *fmt), ...) PRINTF_ARGS(2, 3);
+CIVETWEB_API int mg_printf(struct mg_connection *,
+                           PRINTF_FORMAT_STRING(const char *fmt), ...) PRINTF_ARGS(2, 3);
 
 
 /* Send contents of the entire file together with HTTP headers. */
-void mg_send_file(struct mg_connection *conn, const char *path);
+CIVETWEB_API void mg_send_file(struct mg_connection *conn, const char *path);
 
 
 /* Read data from the remote end, return number of bytes read.
@@ -332,7 +379,7 @@ void mg_send_file(struct mg_connection *conn, const char *path);
      0     connection has been closed by peer. No more data could be read.
      < 0   read error. No more data could be read from the connection.
      > 0   number of bytes read into the buffer. */
-int mg_read(struct mg_connection *, void *buf, size_t len);
+CIVETWEB_API int mg_read(struct mg_connection *, void *buf, size_t len);
 
 
 /* Get the value of particular HTTP header.
@@ -340,7 +387,7 @@ int mg_read(struct mg_connection *, void *buf, size_t len);
    This is a helper function. It traverses request_info->http_headers array,
    and if the header is present in the array, returns its value. If it is
    not present, NULL is returned. */
-const char *mg_get_header(const struct mg_connection *, const char *name);
+CIVETWEB_API const char *mg_get_header(const struct mg_connection *, const char *name);
 
 
 /* Get a value of particular form variable.
@@ -362,8 +409,9 @@ const char *mg_get_header(const struct mg_connection *, const char *name);
 
    Destination buffer is guaranteed to be '\0' - terminated if it is not
    NULL or zero length. */
-int mg_get_var(const char *data, size_t data_len,
-               const char *var_name, char *dst, size_t dst_len);
+CIVETWEB_API int mg_get_var(const char *data, size_t data_len,
+                            const char *var_name, char *dst, size_t dst_len);
+
 
 /* Get a value of particular form variable.
 
@@ -388,8 +436,9 @@ int mg_get_var(const char *data, size_t data_len,
 
    Destination buffer is guaranteed to be '\0' - terminated if it is not
    NULL or zero length. */
-int mg_get_var2(const char *data, size_t data_len,
-                const char *var_name, char *dst, size_t dst_len, size_t occurrence);
+CIVETWEB_API int mg_get_var2(const char *data, size_t data_len,
+                             const char *var_name, char *dst, size_t dst_len, size_t occurrence);
+
 
 /* Fetch value of certain cookie variable into the destination buffer.
 
@@ -404,8 +453,8 @@ int mg_get_var2(const char *data, size_t data_len,
             parameter is not found).
         -2 (destination buffer is NULL, zero length or too small to hold the
             value). */
-int mg_get_cookie(const char *cookie, const char *var_name,
-                  char *buf, size_t buf_len);
+CIVETWEB_API int mg_get_cookie(const char *cookie, const char *var_name,
+                               char *buf, size_t buf_len);
 
 
 /* Download data from the remote web server.
@@ -423,35 +472,36 @@ int mg_get_cookie(const char *cookie, const char *var_name,
      conn = mg_download("google.com", 80, 0, ebuf, sizeof(ebuf),
                         "%s", "GET / HTTP/1.0\r\nHost: google.com\r\n\r\n");
  */
-struct mg_connection *mg_download(const char *host, int port, int use_ssl,
-                                  char *error_buffer, size_t error_buffer_size,
-                                  PRINTF_FORMAT_STRING(const char *request_fmt),
-                                  ...) PRINTF_ARGS(6, 7);
+CIVETWEB_API struct mg_connection *mg_download(const char *host, int port, int use_ssl,
+                                               char *error_buffer, size_t error_buffer_size,
+                                               PRINTF_FORMAT_STRING(const char *request_fmt),
+                                               ...) PRINTF_ARGS(6, 7);
 
 
 /* Close the connection opened by mg_download(). */
-void mg_close_connection(struct mg_connection *conn);
+CIVETWEB_API void mg_close_connection(struct mg_connection *conn);
 
 
 /* File upload functionality. Each uploaded file gets saved into a temporary
    file and MG_UPLOAD event is sent.
    Return number of uploaded files. */
-int mg_upload(struct mg_connection *conn, const char *destination_dir);
+CIVETWEB_API int mg_upload(struct mg_connection *conn, const char *destination_dir);
 
 
 /* Convenience function -- create detached thread.
    Return: 0 on success, non-0 on error. */
 typedef void * (*mg_thread_func_t)(void *);
-int mg_start_thread(mg_thread_func_t f, void *p);
+CIVETWEB_API int mg_start_thread(mg_thread_func_t f, void *p);
 
 
 /* Return builtin mime type for the given file name.
    For unrecognized extensions, "text/plain" is returned. */
-const char *mg_get_builtin_mime_type(const char *file_name);
+CIVETWEB_API const char *mg_get_builtin_mime_type(const char *file_name);
 
 
 /* Return Civetweb version. */
-const char *mg_version(void);
+CIVETWEB_API const char *mg_version(void);
+
 
 /* URL-decode input buffer into destination buffer.
    0-terminate the destination buffer.
@@ -459,13 +509,15 @@ const char *mg_version(void);
    uses '+' as character for space, see RFC 1866 section 8.2.1
    http://ftp.ics.uci.edu/pub/ietf/html/rfc1866.txt
    Return: length of the decoded data, or -1 if dst buffer is too small. */
-int mg_url_decode(const char *src, int src_len, char *dst,
-                  int dst_len, int is_form_url_encoded);
+CIVETWEB_API int mg_url_decode(const char *src, int src_len, char *dst,
+                               int dst_len, int is_form_url_encoded);
+
 
 /* URL-encode input buffer into destination buffer.
    returns the length of the resulting buffer or -1
    is the buffer is too small. */
-int mg_url_encode(const char *src, char *dst, size_t dst_len);
+CIVETWEB_API int mg_url_encode(const char *src, char *dst, size_t dst_len);
+
 
 /* MD5 hash given strings.
    Buffer 'buf' must be 33 bytes long. Varargs is a NULL terminated list of
@@ -473,7 +525,7 @@ int mg_url_encode(const char *src, char *dst, size_t dst_len);
    MD5 hash. Example:
      char buf[33];
      mg_md5(buf, "aa", "bb", NULL); */
-char *mg_md5(char buf[33], ...);
+CIVETWEB_API char *mg_md5(char buf[33], ...);
 
 
 /* Print error message to the opened error log stream.
@@ -483,11 +535,13 @@ char *mg_md5(char buf[33], ...);
      ...: variable argument list
    Example:
      mg_cry(conn,"i like %s", "logging"); */
-void mg_cry(struct mg_connection *conn,
-            PRINTF_FORMAT_STRING(const char *fmt), ...) PRINTF_ARGS(2, 3);
+CIVETWEB_API void mg_cry(struct mg_connection *conn,
+                         PRINTF_FORMAT_STRING(const char *fmt), ...) PRINTF_ARGS(2, 3);
+
 
 /* utility method to compare two buffers, case incensitive. */
-int mg_strncasecmp(const char *s1, const char *s2, size_t len);
+CIVETWEB_API int mg_strncasecmp(const char *s1, const char *s2, size_t len);
+
 
 #ifdef __cplusplus
 }
diff --git a/src/civetweb/src/civetweb.c b/src/civetweb/src/civetweb.c
index b639f1f..4aa8a02 100644
--- a/src/civetweb/src/civetweb.c
+++ b/src/civetweb/src/civetweb.c
@@ -1,4 +1,5 @@
-/* Copyright (c) 2004-2013 Sergey Lyubka
+/* Copyright (c) 2013-2014 the Civetweb developers
+ * Copyright (c) 2004-2013 Sergey Lyubka
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
@@ -27,12 +28,19 @@
 #ifdef __linux__
 #define _XOPEN_SOURCE 600     /* For flockfile() on Linux */
 #endif
-#define _LARGEFILE_SOURCE     /* Enable 64-bit file offsets */
+#ifndef _LARGEFILE_SOURCE
+#define _LARGEFILE_SOURCE     /* For fseeko(), ftello() */
+#endif
+#ifndef _FILE_OFFSET_BITS
+#define _FILE_OFFSET_BITS 64  /* Use 64-bit file offsets by default */
+#endif
 #ifndef __STDC_FORMAT_MACROS
-# define __STDC_FORMAT_MACROS  /* <inttypes.h> wants this for C++ */
+#define __STDC_FORMAT_MACROS  /* <inttypes.h> wants this for C++ */
 #endif
+#ifndef __STDC_LIMIT_MACROS
 #define __STDC_LIMIT_MACROS   /* C++ wants that for INT64_MAX */
 #endif
+#endif
 
 #if defined (_MSC_VER)
 /* 'type cast' : conversion from 'int' to 'HANDLE' of greater size */
@@ -45,7 +53,7 @@
 
 /* Disable WIN32_LEAN_AND_MEAN.
    This makes windows.h always include winsock2.h */
-#if defined(WIN32_LEAN_AND_MEAN) && (_MSC_VER <= 1400)
+#if defined(WIN32_LEAN_AND_MEAN)
 #undef WIN32_LEAN_AND_MEAN
 #endif
 
@@ -81,7 +89,9 @@
 #include <stddef.h>
 #include <stdio.h>
 
+#ifndef MAX_WORKER_THREADS
 #define MAX_WORKER_THREADS 1024
+#endif
 
 #if defined(_WIN32) && !defined(__SYMBIAN32__) /* Windows specific */
 #if defined(_MSC_VER) && _MSC_VER <= 1400
@@ -188,14 +198,22 @@ typedef struct {
     pthread_t *waitingthreadhdls;  /* The thread handles. */
 } pthread_cond_t;
 
+#ifndef __clockid_t_defined
 typedef DWORD clockid_t;
+#endif
+#ifndef CLOCK_MONOTONIC
 #define CLOCK_MONOTONIC (1)
+#endif
+#ifndef CLOCK_REALTIME
 #define CLOCK_REALTIME  (2)
+#endif
 
+#ifndef _TIMESPEC_DEFINED
 struct timespec {
     time_t   tv_sec;        /* seconds */
     long     tv_nsec;       /* nanoseconds */
 };
+#endif
 
 #define pid_t HANDLE /* MINGW typedefs pid_t to int. Using #define here. */
 
@@ -249,10 +267,15 @@ struct pollfd {
 #include <netinet/in.h>
 #include <arpa/inet.h>
 #include <sys/time.h>
+#include <sys/utsname.h>
 #include <stdint.h>
 #include <inttypes.h>
 #include <netdb.h>
 
+#if defined(ANDROID)
+typedef unsigned short int in_port_t;
+#endif
+
 #include <pwd.h>
 #include <unistd.h>
 #include <dirent.h>
@@ -297,6 +320,145 @@ typedef int SOCKET;
 #endif
 #define ARRAY_SIZE(array) (sizeof(array) / sizeof(array[0]))
 
+#ifdef DEBUG_TRACE
+#undef DEBUG_TRACE
+#define DEBUG_TRACE(x)
+#else
+#if defined(DEBUG)
+#define DEBUG_TRACE(x) do { \
+  flockfile(stdout); \
+  printf("*** %lu.%p.%s.%d: ", \
+         (unsigned long) time(NULL), (void *) pthread_self(), \
+         __func__, __LINE__); \
+  printf x; \
+  putchar('\n'); \
+  fflush(stdout); \
+  funlockfile(stdout); \
+} while (0)
+#else
+#define DEBUG_TRACE(x)
+#endif /* DEBUG */
+#endif /* DEBUG_TRACE */
+
+#if defined(MEMORY_DEBUGGING)
+static unsigned long blockCount = 0;
+static unsigned long totalMemUsed = 0;
+
+static void * mg_malloc_ex(size_t size, const char * file, unsigned line) {
+    /* Debug malloc: keeps the size in a size_t header in front of the returned block and logs it. */
+    void * data = malloc(size + sizeof(size_t));
+    void * memory = 0;
+    char mallocStr[256];
+
+    if (data) {
+        *(size_t*)data = size;
+        totalMemUsed += size;
+        blockCount++;
+        memory = (void *)(((char*)data)+sizeof(size_t));
+    }
+
+    sprintf(mallocStr, "MEM: %p %5lu alloc   %7lu %4lu --- %s:%u\n", memory, (unsigned long)size, totalMemUsed, blockCount, file, line);
+#if defined(_WIN32)
+    OutputDebugStringA(mallocStr);
+#else
+    DEBUG_TRACE(("%s", mallocStr));
+#endif
+
+    return memory;
+}
+
+static void * mg_calloc_ex(size_t count, size_t size, const char * file, unsigned line) {
+    /* Debug calloc: zero the whole count*size block; yield NULL if count*size would overflow. */
+    void * data = (size != 0 && count > ((size_t)-1) / size) ? 0 : mg_malloc_ex(size*count, file, line);
+    if (data) memset(data, 0, size*count);
+
+    return data;
+}
+
+static void mg_free_ex(void * memory, const char * file, unsigned line) {
+    /* Debug free: reads the size header stored in front of the block, logs it, frees the raw block. */
+    char mallocStr[256];
+    void * data;
+    size_t size;
+
+    if (memory) {
+        data = (void *)(((char*)memory)-sizeof(size_t)); /* step back to the header; only done for non-NULL */
+        size = *(size_t*)data;
+        totalMemUsed -= size;
+        blockCount--;
+        sprintf(mallocStr, "MEM: %p %5lu free    %7lu %4lu --- %s:%u\n", memory, (unsigned long)size, totalMemUsed, blockCount, file, line);
+#if defined(_WIN32)
+        OutputDebugStringA(mallocStr);
+#else
+        DEBUG_TRACE(("%s", mallocStr));
+#endif
+        free(data);
+    }
+}
+
+static void * mg_realloc_ex(void * memory, size_t newsize, const char * file, unsigned line) {
+    /* Debug realloc: newsize == 0 frees, memory == NULL allocates, else resizes and re-logs the block. */
+    char mallocStr[256];
+    void * data;
+    size_t oldsize;
+
+    if (newsize) {
+        if (memory) {
+            data = (void *)(((char*)memory)-sizeof(size_t));
+            oldsize = *(size_t*)data;
+            data = realloc(data, newsize+sizeof(size_t));
+            if (data) {
+                totalMemUsed -= oldsize;
+                sprintf(mallocStr, "MEM: %p %5lu r-free  %7lu %4lu --- %s:%u\n", memory, (unsigned long)oldsize, totalMemUsed, blockCount, file, line);
+#if defined(_WIN32)
+                OutputDebugStringA(mallocStr);
+#else
+                DEBUG_TRACE(("%s", mallocStr));
+#endif
+                *(size_t*)data = newsize;
+                data = (void *)(((char*)data)+sizeof(size_t)); /* user pointer of the resized block */
+                totalMemUsed += newsize;
+                sprintf(mallocStr, "MEM: %p %5lu r-alloc %7lu %4lu --- %s:%u\n", data, (unsigned long)newsize, totalMemUsed, blockCount, file, line);
+#if defined(_WIN32)
+                OutputDebugStringA(mallocStr);
+#else
+                DEBUG_TRACE(("%s", mallocStr));
+#endif
+            } else {
+#if defined(_WIN32)
+                OutputDebugStringA("MEM: realloc failed\n");
+#else
+                DEBUG_TRACE(("MEM: realloc failed\n"));
+#endif
+            }
+        } else {
+            data = mg_malloc_ex(newsize, file, line);
+        }
+    } else {
+        data = 0;
+        mg_free_ex(memory, file, line);
+    }
+
+    return data;
+}
+
+#define mg_malloc(a)      mg_malloc_ex(a, __FILE__, __LINE__)
+#define mg_calloc(a,b)    mg_calloc_ex(a, b, __FILE__, __LINE__)
+#define mg_realloc(a, b)  mg_realloc_ex(a, b, __FILE__, __LINE__)
+#define mg_free(a)        mg_free_ex(a, __FILE__, __LINE__)
+
+#else
+static __inline void * mg_malloc(size_t a)             {return malloc(a);}
+static __inline void * mg_calloc(size_t a, size_t b)   {return calloc(a, b);}
+static __inline void * mg_realloc(void * a, size_t b)  {return realloc(a, b);}
+static __inline void   mg_free(void * a)               {free(a);}
+#endif
+
+#define malloc  DO_NOT_USE_THIS_FUNCTION__USE_mg_malloc
+#define calloc  DO_NOT_USE_THIS_FUNCTION__USE_mg_calloc
+#define realloc DO_NOT_USE_THIS_FUNCTION__USE_mg_realloc
+#define free    DO_NOT_USE_THIS_FUNCTION__USE_mg_free
+
 #ifdef _WIN32
 static CRITICAL_SECTION global_log_file_lock;
 static DWORD pthread_self(void)
@@ -333,26 +495,6 @@ void *pthread_getspecific(pthread_key_t key)
 #define MD5_STATIC static
 #include "md5.h"
 
-#ifdef DEBUG_TRACE
-#undef DEBUG_TRACE
-#define DEBUG_TRACE(x)
-#else
-#if defined(DEBUG)
-#define DEBUG_TRACE(x) do { \
-  flockfile(stdout); \
-  printf("*** %lu.%p.%s.%d: ", \
-         (unsigned long) time(NULL), (void *) pthread_self(), \
-         __func__, __LINE__); \
-  printf x; \
-  putchar('\n'); \
-  fflush(stdout); \
-  funlockfile(stdout); \
-} while (0)
-#else
-#define DEBUG_TRACE(x)
-#endif /* DEBUG */
-#endif /* DEBUG_TRACE */
-
 /* Darwin prior to 7.0 and Win32 do not have socklen_t */
 #ifdef NO_SOCKLEN_T
 typedef int socklen_t;
@@ -528,55 +670,66 @@ enum {
     NUM_THREADS, RUN_AS_USER, REWRITE, HIDE_FILES, REQUEST_TIMEOUT,
 
 #if defined(USE_LUA)
-    LUA_SCRIPT_EXTENSIONS, LUA_SERVER_PAGE_EXTENSIONS,
+    LUA_PRELOAD_FILE, LUA_SCRIPT_EXTENSIONS, LUA_SERVER_PAGE_EXTENSIONS,
 #endif
 #if defined(USE_WEBSOCKET)
     WEBSOCKET_ROOT,
 #endif
+#if defined(USE_LUA) && defined(USE_WEBSOCKET)
+    LUA_WEBSOCKET_EXTENSIONS,
+#endif
+    ACCESS_CONTROL_ALLOW_ORIGIN, ERROR_PAGES,
 
     NUM_OPTIONS
 };
 
-static const char *config_options[] = {
-    "cgi_pattern", "**.cgi$|**.pl$|**.php$",
-    "cgi_environment", NULL,
-    "put_delete_auth_file", NULL,
-    "cgi_interpreter", NULL,
-    "protect_uri", NULL,
-    "authentication_domain", "mydomain.com",
-    "ssi_pattern", "**.shtml$|**.shtm$",
-    "throttle", NULL,
-    "access_log_file", NULL,
-    "enable_directory_listing", "yes",
-    "error_log_file", NULL,
-    "global_auth_file", NULL,
-    "index_files",
+/* TODO: replace 12345 by proper config types */
+static struct mg_option config_options[] = {
+    {"cgi_pattern",                 CONFIG_TYPE_EXT_PATTERN,   "**.cgi$|**.pl$|**.php$"},
+    {"cgi_environment",             CONFIG_TYPE_STRING,        NULL},
+    {"put_delete_auth_file",        CONFIG_TYPE_FILE,          NULL},
+    {"cgi_interpreter",             CONFIG_TYPE_FILE,          NULL},
+    {"protect_uri",                 12345,                     NULL},
+    {"authentication_domain",       CONFIG_TYPE_STRING,        "mydomain.com"},
+    {"ssi_pattern",                 CONFIG_TYPE_EXT_PATTERN,   "**.shtml$|**.shtm$"},
+    {"throttle",                    12345,                     NULL},
+    {"access_log_file",             CONFIG_TYPE_FILE,          NULL},
+    {"enable_directory_listing",    CONFIG_TYPE_BOOLEAN,       "yes"},
+    {"error_log_file",              CONFIG_TYPE_FILE,          NULL},
+    {"global_auth_file",            CONFIG_TYPE_FILE,          NULL},
+    {"index_files",                 12345,
 #ifdef USE_LUA
-    "index.html,index.htm,index.lp,index.lsp,index.lua,index.cgi,index.shtml,index.php",
+    "index.xhtml,index.html,index.htm,index.lp,index.lsp,index.lua,index.cgi,index.shtml,index.php"},
 #else
-    "index.html,index.htm,index.cgi,index.shtml,index.php",
+    "index.xhtml,index.html,index.htm,index.cgi,index.shtml,index.php"},
 #endif
-    "enable_keep_alive", "no",
-    "access_control_list", NULL,
-    "extra_mime_types", NULL,
-    "listening_ports", "8080",
-    "document_root",  NULL,
-    "ssl_certificate", NULL,
-    "num_threads", "50",
-    "run_as_user", NULL,
-    "url_rewrite_patterns", NULL,
-    "hide_files_patterns", NULL,
-    "request_timeout_ms", "30000",
+    {"enable_keep_alive",           CONFIG_TYPE_BOOLEAN,       "no"},
+    {"access_control_list",         12345,                     NULL},
+    {"extra_mime_types",            12345,                     NULL},
+    {"listening_ports",             12345,                     "8080"},
+    {"document_root",               CONFIG_TYPE_DIRECTORY,     NULL},
+    {"ssl_certificate",             CONFIG_TYPE_FILE,          NULL},
+    {"num_threads",                 CONFIG_TYPE_NUMBER,        "50"},
+    {"run_as_user",                 CONFIG_TYPE_STRING,        NULL},
+    {"url_rewrite_patterns",        12345,                     NULL},
+    {"hide_files_patterns",         12345,                     NULL},
+    {"request_timeout_ms",          CONFIG_TYPE_NUMBER,        "30000"},
 
 #if defined(USE_LUA)
-    "lua_script_pattern", "**.lua$",
-    "lua_server_page_pattern", "**.lp$|**.lsp$",
+    {"lua_preload_file",            CONFIG_TYPE_FILE,          NULL},
+    {"lua_script_pattern",          CONFIG_TYPE_EXT_PATTERN,   "**.lua$"},
+    {"lua_server_page_pattern",     CONFIG_TYPE_EXT_PATTERN,   "**.lp$|**.lsp$"},
 #endif
 #if defined(USE_WEBSOCKET)
-    "websocket_root", NULL,
+    {"websocket_root",              CONFIG_TYPE_DIRECTORY,     NULL},
 #endif
+#if defined(USE_LUA) && defined(USE_WEBSOCKET)
+    {"lua_websocket_pattern",       CONFIG_TYPE_EXT_PATTERN,   "**.lua$"},
+#endif
+    {"access_control_allow_origin", CONFIG_TYPE_STRING,        "*"},
+    {"error_pages",                 CONFIG_TYPE_DIRECTORY,     NULL},
 
-    NULL
+    {NULL, CONFIG_TYPE_UNKNOWN, NULL}
 };
 
 struct mg_request_handler_info {
@@ -613,8 +766,18 @@ struct mg_context {
     int workerthreadcount;     /* The amount of worker threads. */
     pthread_t *workerthreadids;/* The worker thread IDs. */
 
+    unsigned long start_time;  /* Server start time, used for authentication */
+    unsigned long nonce_count; /* Used nonces, used for authentication */
+
+    char *systemName;          /* What operating system is running */
+
     /* linked list of uri handlers */
     struct mg_request_handler_info *request_handlers;
+
+#if defined(USE_LUA) && defined(USE_WEBSOCKET)
+    /* linked list of shared lua websockets */
+    struct mg_shared_lua_websocket *shared_lua_websockets;
+#endif
 };
 
 struct mg_connection {
@@ -630,6 +793,7 @@ struct mg_connection {
     char *buf;                  /* Buffer for received data */
     char *path_info;            /* PATH_INFO part of the URL */
     int must_close;             /* 1 if connection must be closed */
+    int in_error_handler;       /* 1 if in handler for user defined error pages */
     int buf_size;               /* Buffer size */
     int request_len;            /* Size of the request + headers in a buffer */
     int data_len;               /* Total size of data in a buffer */
@@ -666,11 +830,27 @@ struct de {
 static int is_websocket_request(const struct mg_connection *conn);
 #endif
 
+#if defined(MG_LEGACY_INTERFACE)
 const char **mg_get_valid_option_names(void)
 {
+    static const char * data[2 * sizeof(config_options) / sizeof(config_options[0])] = {0};
+    int i;
+
+    for (i=0; config_options[i].name != NULL; i++) {
+        data[i * 2] = config_options[i].name;
+        data[i * 2 + 1] = config_options[i].default_value;
+    }
+
+    return data;
+}
+#endif
+
+const struct mg_option *mg_get_valid_options(void)
+{
     return config_options;
 }
 
+
 static int is_file_in_memory(struct mg_connection *conn, const char *path,
                              struct file *filep)
 {
@@ -713,12 +893,121 @@ static void mg_fclose(struct file *filep)
     }
 }
 
+static void mg_strlcpy(register char *dst, register const char *src, size_t n)
+{
+    /* Copy at most n-1 chars of src and always 0-terminate; n == 0 means dst has no room at all. */
+    if (n == 0) return;
+    for (; *src != '\0' && n > 1; n--) *dst++ = *src++;
+    *dst = '\0';
+}
+
+static int lowercase(const char *s)
+{
+    return tolower((unsigned char) s[0]); /* first byte only, passed to tolower() as unsigned char */
+}
+
+int mg_strncasecmp(const char *s1, const char *s2, size_t len)
+{
+    /* Case-insensitive compare of at most len bytes; 0 when equal, else the lower-cased byte difference. */
+    int delta = 0;
+    size_t k;
+    for (k = 0; k < len && delta == 0; k++) {
+        delta = lowercase(s1 + k) - lowercase(s2 + k);
+        if (s1[k] == '\0') break; /* stop at the end of s1, as strncmp() would */
+    }
+    return delta;
+}
+
+static int mg_strcasecmp(const char *s1, const char *s2)
+{
+    /* Case-insensitive strcmp(): walks both strings until a byte differs or s1 ends. */
+    int delta;
+
+    for (;; s1++, s2++) {
+        delta = lowercase(s1) - lowercase(s2);
+        if (delta != 0 || *s1 == '\0') return delta;
+    }
+}
+
+static char * mg_strndup(const char *ptr, size_t len)
+{
+    /* Heap copy of at most len chars of ptr, 0-terminated; NULL if mg_malloc() fails. */
+    char *copy = (char *) mg_malloc(len + 1);
+    if (copy == NULL) {
+        return NULL;
+    }
+    mg_strlcpy(copy, ptr, len + 1);
+    return copy;
+}
+
+static char * mg_strdup(const char *str)
+{
+    return mg_strndup(str, strlen(str)); /* copy of the whole string; whatever mg_strndup() returns */
+}
+
+static const char *mg_strcasestr(const char *big_str, const char *small_str)
+{
+    /* Case-insensitive strstr(): first match of small_str inside big_str, or NULL. */
+    const int needle_len = (int)strlen(small_str);
+    const int last_start = (int)strlen(big_str) - needle_len;
+    int pos = 0;
+
+    while (pos <= last_start && mg_strncasecmp(big_str + pos, small_str, needle_len) != 0) {
+        pos++;
+    }
+    return (pos <= last_start) ? big_str + pos : NULL;
+}
+
+/* Like vsnprintf(), but never returns a negative value or a value
+   that does not fit in the supplied buffer (at most buflen - 1).
+   Thanks to Adam Zeldis for pointing out the snprintf()-caused
+   vulnerability in his audit report. */
+static int mg_vsnprintf(struct mg_connection *conn, char *buf, size_t buflen,
+                        const char *fmt, va_list ap)
+{
+    int n;
+
+    if (buflen == 0)
+        return 0;
+
+    n = vsnprintf(buf, buflen, fmt, ap);
+
+    if (n < 0) {
+        mg_cry(conn, "vsnprintf error");
+        n = 0;
+    } else if (n >= (int) buflen) {
+        mg_cry(conn, "truncating vsnprintf buffer: [%.*s]",
+               n > 200 ? 200 : n, buf);
+        n = (int) buflen - 1; /* report only the length that fits in buf */
+    }
+    buf[n] = '\0';
+
+    return n;
+}
+
+static int mg_snprintf(struct mg_connection *conn, char *buf, size_t buflen,
+                       PRINTF_FORMAT_STRING(const char *fmt), ...)
+PRINTF_ARGS(4, 5);
+/* printf-style front end: forwards its variable arguments to mg_vsnprintf() and returns its result. */
+static int mg_snprintf(struct mg_connection *conn, char *buf, size_t buflen,
+                       const char *fmt, ...)
+{
+    va_list ap;
+    int n;
+
+    va_start(ap, fmt);
+    n = mg_vsnprintf(conn, buf, buflen, fmt, ap);
+    va_end(ap);
+
+    return n;
+}
+
 static int get_option_index(const char *name)
 {
     int i;
 
-    for (i = 0; config_options[i * 2] != NULL; i++) {
-        if (strcmp(config_options[i * 2], name) == 0) {
+    for (i = 0; config_options[i].name != NULL; i++) {
+        if (strcmp(config_options[i].name, name) == 0) {
             return i;
         }
     }
@@ -758,7 +1047,7 @@ static void sockaddr_to_string(char *buf, size_t len,
               (void *) &usa->sin6.sin6_addr, buf, len);
 #elif defined(_WIN32)
     /* Only Windows Vista (and newer) have inet_ntop() */
-    strncpy(buf, inet_ntoa(usa->sin.sin_addr), len);
+    mg_strlcpy(buf, inet_ntoa(usa->sin.sin_addr), len);
 #else
     inet_ntop(usa->sa.sa_family, (void *) &usa->sin.sin_addr, buf, len);
 #endif
@@ -773,7 +1062,7 @@ static void gmt_time_string(char *buf, size_t buf_len, time_t *t)
     if (tm != NULL) {
         strftime(buf, buf_len, "%a, %d %b %Y %H:%M:%S GMT", tm);
     } else {
-        strncpy(buf, "Thu, 01 Jan 1970 00:00:00 GMT", buf_len);
+        mg_strlcpy(buf, "Thu, 01 Jan 1970 00:00:00 GMT", buf_len);
         buf[buf_len - 1] = '\0';
     }
 }
@@ -838,115 +1127,6 @@ struct mg_request_info *mg_get_request_info(struct mg_connection *conn)
     return &conn->request_info;
 }
 
-static void mg_strlcpy(register char *dst, register const char *src, size_t n)
-{
-    for (; *src != '\0' && n > 1; n--) {
-        *dst++ = *src++;
-    }
-    *dst = '\0';
-}
-
-static int lowercase(const char *s)
-{
-    return tolower(* (const unsigned char *) s);
-}
-
-int mg_strncasecmp(const char *s1, const char *s2, size_t len)
-{
-    int diff = 0;
-
-    if (len > 0)
-        do {
-            diff = lowercase(s1++) - lowercase(s2++);
-        } while (diff == 0 && s1[-1] != '\0' && --len > 0);
-
-    return diff;
-}
-
-static int mg_strcasecmp(const char *s1, const char *s2)
-{
-    int diff;
-
-    do {
-        diff = lowercase(s1++) - lowercase(s2++);
-    } while (diff == 0 && s1[-1] != '\0');
-
-    return diff;
-}
-
-static char * mg_strndup(const char *ptr, size_t len)
-{
-    char *p;
-
-    if ((p = (char *) malloc(len + 1)) != NULL) {
-        mg_strlcpy(p, ptr, len + 1);
-    }
-
-    return p;
-}
-
-static char * mg_strdup(const char *str)
-{
-    return mg_strndup(str, strlen(str));
-}
-
-static const char *mg_strcasestr(const char *big_str, const char *small_str)
-{
-    int i, big_len = (int)strlen(big_str), small_len = (int)strlen(small_str);
-
-    for (i = 0; i <= big_len - small_len; i++) {
-        if (mg_strncasecmp(big_str + i, small_str, small_len) == 0) {
-            return big_str + i;
-        }
-    }
-
-    return NULL;
-}
-
-/* Like snprintf(), but never returns negative value, or a value
-   that is larger than a supplied buffer.
-   Thanks to Adam Zeldis to pointing snprintf()-caused vulnerability
-   in his audit report. */
-static int mg_vsnprintf(struct mg_connection *conn, char *buf, size_t buflen,
-                        const char *fmt, va_list ap)
-{
-    int n;
-
-    if (buflen == 0)
-        return 0;
-
-    n = vsnprintf(buf, buflen, fmt, ap);
-
-    if (n < 0) {
-        mg_cry(conn, "vsnprintf error");
-        n = 0;
-    } else if (n >= (int) buflen) {
-        mg_cry(conn, "truncating vsnprintf buffer: [%.*s]",
-               n > 200 ? 200 : n, buf);
-        n = (int) buflen - 1;
-    }
-    buf[n] = '\0';
-
-    return n;
-}
-
-static int mg_snprintf(struct mg_connection *conn, char *buf, size_t buflen,
-                       PRINTF_FORMAT_STRING(const char *fmt), ...)
-PRINTF_ARGS(4, 5);
-
-static int mg_snprintf(struct mg_connection *conn, char *buf, size_t buflen,
-                       const char *fmt, ...)
-{
-    va_list ap;
-    int n;
-
-    va_start(ap, fmt);
-    n = mg_vsnprintf(conn, buf, buflen, fmt, ap);
-    va_end(ap);
-
-    return n;
-}
-
 /* Skip the characters until one of the delimiters characters found.
    0-terminate resulting word. Skip the delimiter and following whitespaces.
    Advance pointer to buffer to the next word. Return found 0-terminated word.
@@ -1112,7 +1292,7 @@ static int should_keep_alive(const struct mg_connection *conn)
         conn->status_code == 401 ||
         mg_strcasecmp(conn->ctx->config[ENABLE_KEEP_ALIVE], "yes") != 0 ||
         (header != NULL && mg_strcasecmp(header, "keep-alive") != 0) ||
-        (header == NULL && http_version && strcmp(http_version, "1.1"))) {
+        (header == NULL && http_version && 0!=strcmp(http_version, "1.1"))) {
         return 0;
     }
     return 1;
@@ -1123,6 +1303,9 @@ static const char *suggest_connection_header(const struct mg_connection *conn)
     return should_keep_alive(conn) ? "keep-alive" : "close";
 }
 
+static void handle_file_based_request(struct mg_connection *conn, const char *path, struct file *filep);
+static int mg_stat(struct mg_connection *conn, const char *path, struct file *filep);
+
 static void send_http_error(struct mg_connection *, int, const char *,
                             PRINTF_FORMAT_STRING(const char *fmt), ...)
 PRINTF_ARGS(4, 5);
@@ -1133,21 +1316,66 @@ static void send_http_error(struct mg_connection *conn, int status,
 {
     char buf[MG_BUF_LEN];
     va_list ap;
-    int len = 0;
+    int len = 0, i, page_handler_found, scope;
     char date[64];
     time_t curtime = time(NULL);
+    const char *error_handler = NULL;
+    struct file error_page_file = STRUCT_FILE_INITIALIZER;
+    const char *error_page_file_ext, *tstr;
 
     conn->status_code = status;
-    if (conn->ctx->callbacks.http_error == NULL ||
+    if (conn->in_error_handler ||
+        conn->ctx->callbacks.http_error == NULL ||
         conn->ctx->callbacks.http_error(conn, status)) {
-        buf[0] = '\0';
 
+        if (!conn->in_error_handler) {
+            /* Send user defined error pages, if defined */
+            error_handler = conn->ctx->config[ERROR_PAGES];
+            error_page_file_ext = conn->ctx->config[INDEX_FILES];
+            page_handler_found = 0;
+            if (error_handler != NULL) {
+                for (scope=1; (scope<=3) && !page_handler_found; scope++) {
+                    switch (scope) {
+                    case 1:
+                        len = mg_snprintf(conn, buf, sizeof(buf)-32, "%serror%03u.", error_handler, status);
+                        break;
+                    case 2:
+                        len = mg_snprintf(conn, buf, sizeof(buf)-32, "%serror%01uxx.", error_handler, status/100);
+                        break;
+                    default:
+                        len = mg_snprintf(conn, buf, sizeof(buf)-32, "%serror.", error_handler);
+                        break;
+                    }
+                    tstr = strchr(error_page_file_ext, '.');
+                    while (tstr) {
+                        for (i=1; i<32 && tstr[i]!=0 && tstr[i]!=','; i++) buf[len+i-1]=tstr[i];
+                        buf[len+i-1]=0;
+                        if (mg_stat(conn, buf, &error_page_file)) {
+                            page_handler_found = 1;
+                            break;
+                        }
+                        tstr = strchr(tstr+i, '.');
+                    }
+                }
+            }
+
+            if (page_handler_found) {
+                conn->in_error_handler = 1;
+                handle_file_based_request(conn, buf, &error_page_file);
+                conn->in_error_handler = 0;
+                return;
+            }
+        }
+
+        buf[0] = '\0';
         gmt_time_string(date, sizeof(date), &curtime);
 
         /* Errors 1xx, 204 and 304 MUST NOT send a body */
         if (status > 199 && status != 204 && status != 304) {
-            len = mg_snprintf(conn, buf, sizeof(buf), "Error %d: %s", status, reason);
-            buf[len++] = '\n';
+            len = mg_snprintf(conn, buf, sizeof(buf)-1, "Error %d: %s", status, reason);
+            buf[len] = '\n';
+            len++;
+            buf[len] = 0;
 
             va_start(ap, fmt);
             len += mg_vsnprintf(conn, buf + len, sizeof(buf) - len, fmt, ap);
@@ -1188,6 +1416,7 @@ static int pthread_mutex_unlock(pthread_mutex_t *mutex)
     return ReleaseMutex(*mutex) == 0 ? -1 : 0;
 }
 
+#ifndef WIN_PTHREADS_TIME_H
 static int clock_gettime(clockid_t clk_id, struct timespec *tp)
 {
     FILETIME ft;
@@ -1206,11 +1435,11 @@ static int clock_gettime(clockid_t clk_id, struct timespec *tp)
             tp->tv_nsec = (long)(li.QuadPart % 10000000) * 100;
             ok = TRUE;
         } else if (clk_id == CLOCK_MONOTONIC) {
-            if (perfcnt_per_sec==0) {
+            if (perfcnt_per_sec == 0.0) {
                 QueryPerformanceFrequency((LARGE_INTEGER *) &li);
                 perfcnt_per_sec = 1.0 / li.QuadPart;
             }
-            if (perfcnt_per_sec!=0) {
+            if (perfcnt_per_sec != 0.0) {
                 QueryPerformanceCounter((LARGE_INTEGER *) &li);
                 d = li.QuadPart * perfcnt_per_sec;
                 tp->tv_sec = (time_t)d;
@@ -1223,13 +1452,14 @@ static int clock_gettime(clockid_t clk_id, struct timespec *tp)
 
     return ok ? 0 : -1;
 }
+#endif
 
 static int pthread_cond_init(pthread_cond_t *cv, const void *unused)
 {
     (void) unused;
     InitializeCriticalSection(&cv->threadIdSec);
     cv->waitingthreadcount = 0;
-    cv->waitingthreadhdls = calloc(MAX_WORKER_THREADS, sizeof(pthread_t));
+    cv->waitingthreadhdls = mg_calloc(MAX_WORKER_THREADS, sizeof(pthread_t));
     return (cv->waitingthreadhdls!=NULL) ? 0 : -1;
 }
 
@@ -1308,8 +1538,8 @@ static int pthread_cond_destroy(pthread_cond_t *cv)
 {
     EnterCriticalSection(&cv->threadIdSec);
     assert(cv->waitingthreadcount==0);
+    mg_free(cv->waitingthreadhdls);
     cv->waitingthreadhdls = 0;
-    free(cv->waitingthreadhdls);
     LeaveCriticalSection(&cv->threadIdSec);
     DeleteCriticalSection(&cv->threadIdSec);
 
@@ -1423,8 +1653,7 @@ static int path_cannot_disclose_cgi(const char *path)
     return isalnum(last) || strchr(allowed_last_characters, last) != NULL;
 }
 
-static int mg_stat(struct mg_connection *conn, const char *path,
-                   struct file *filep)
+static int mg_stat(struct mg_connection *conn, const char *path, struct file *filep)
 {
     wchar_t wbuf[PATH_MAX];
     WIN32_FILE_ATTRIBUTE_DATA info;
@@ -1480,7 +1709,7 @@ static DIR * opendir(const char *name)
 
     if (name == NULL) {
         SetLastError(ERROR_BAD_ARGUMENTS);
-    } else if ((dir = (DIR *) malloc(sizeof(*dir))) == NULL) {
+    } else if ((dir = (DIR *) mg_malloc(sizeof(*dir))) == NULL) {
         SetLastError(ERROR_NOT_ENOUGH_MEMORY);
     } else {
         to_unicode(name, wpath, ARRAY_SIZE(wpath));
@@ -1491,7 +1720,7 @@ static DIR * opendir(const char *name)
             dir->handle = FindFirstFileW(wpath, &dir->info);
             dir->result.d_name[0] = '\0';
         } else {
-            free(dir);
+            mg_free(dir);
             dir = NULL;
         }
     }
@@ -1507,7 +1736,7 @@ static int closedir(DIR *dir)
         if (dir->handle != INVALID_HANDLE_VALUE)
             result = FindClose(dir->handle) ? 0 : -1;
 
-        free(dir);
+        mg_free(dir);
     } else {
         result = -1;
         SetLastError(ERROR_BAD_ARGUMENTS);
@@ -1585,9 +1814,9 @@ int mg_start_thread(mg_thread_func_t f, void *p)
 {
 #if defined(USE_STACK_SIZE) && (USE_STACK_SIZE > 1)
     /* Compile-time option to control stack size, e.g. -DUSE_STACK_SIZE=16384 */
-    return (long)_beginthread((void (__cdecl *)(void *)) f, USE_STACK_SIZE, p) == -1L ? -1 : 0;
+    return ((_beginthread((void (__cdecl *)(void *)) f, USE_STACK_SIZE, p) == ((uintptr_t)(-1L))) ? -1 : 0);
 #else
-    return (long)_beginthread((void (__cdecl *)(void *)) f, 0, p) == -1L ? -1 : 0;
+    return ((_beginthread((void (__cdecl *)(void *)) f, 0, p) == ((uintptr_t)(-1L))) ? -1 : 0);
 #endif /* defined(USE_STACK_SIZE) && (USE_STACK_SIZE > 1) */
 }
 
@@ -1598,15 +1827,15 @@ static int mg_start_thread_with_id(unsigned (__stdcall *f)(void *), void *p,
 {
     uintptr_t uip;
     HANDLE threadhandle;
-    int result;
+    int result = -1;
 
     uip = _beginthreadex(NULL, 0, (unsigned (__stdcall *)(void *)) f, p, 0,
                          NULL);
     threadhandle = (HANDLE) uip;
-    if (threadidptr != NULL) {
+    if ((uip != 0) && (threadidptr != NULL)) { /* _beginthreadex() returns 0 on failure; only _beginthread() returns -1L */
         *threadidptr = threadhandle;
+        result = 0;
     }
-    result = (threadhandle == NULL) ? -1 : 0;
 
     return result;
 }
@@ -1980,11 +2209,11 @@ static int pull_all(FILE *fp, struct mg_connection *conn, char *buf, int len)
 
 int mg_read(struct mg_connection *conn, void *buf, size_t len)
 {
-    int n, buffered_len, nread;
+    int64_t n, buffered_len, nread;
     const char *body;
 
-    /* If Content-Length is not set, read until socket is closed */
-    if (conn->consumed_content == 0 && conn->content_len == 0) {
+    /* If Content-Length is not set for a PUT or POST request, read until socket is closed */
+    if (conn->consumed_content == 0 && conn->content_len == -1) {
         conn->content_len = INT64_MAX;
         conn->must_close = 1;
     }
@@ -1999,10 +2228,10 @@ int mg_read(struct mg_connection *conn, void *buf, size_t len)
 
         /* Return buffered data */
         body = conn->buf + conn->request_len + conn->consumed_content;
-        buffered_len = (int)(&conn->buf[conn->data_len] - body);
+        buffered_len = (int64_t)(&conn->buf[conn->data_len] - body);
         if (buffered_len > 0) {
             if (len < (size_t) buffered_len) {
-                buffered_len = (int) len;
+                buffered_len = (int64_t) len;
             }
             memcpy(buf, body, (size_t) buffered_len);
             len -= buffered_len;
@@ -2013,7 +2242,7 @@ int mg_read(struct mg_connection *conn, void *buf, size_t len)
 
         /* We have returned all buffered data. Read new data from the remote
            socket. */
-        n = pull_all(NULL, conn, (char *) buf, (int) len);
+        n = pull_all(NULL, conn, (char *) buf, (int64_t) len);
         nread = n >= 0 ? nread + n : n;
     }
     return nread;
@@ -2067,8 +2296,8 @@ static int alloc_vprintf2(char **buf, const char *fmt, va_list ap)
 
     *buf = NULL;
     while (len == -1) {
-        if (*buf) free(*buf);
-        *buf = (char *)malloc(size *= 4);
+        if (*buf) mg_free(*buf);
+        *buf = (char *)mg_malloc(size *= 4);
         if (!*buf) break;
         va_copy(ap_copy, ap);
         len = vsnprintf(*buf, size, fmt, ap_copy);
@@ -2105,7 +2334,7 @@ static int alloc_vprintf(char **buf, size_t size, const char *fmt, va_list ap)
         va_end(ap_copy);
     } else if (len > (int) size &&
                (size = len + 1) > 0 &&
-               (*buf = (char *) malloc(size)) == NULL) {
+               (*buf = (char *) mg_malloc(size)) == NULL) {
         len = -1;  /* Allocation failed, mark failure */
     } else {
         va_copy(ap_copy, ap);
@@ -2127,7 +2356,7 @@ int mg_vprintf(struct mg_connection *conn, const char *fmt, va_list ap)
         len = mg_write(conn, buf, (size_t) len);
     }
     if (buf != mem && buf != NULL) {
-        free(buf);
+        mg_free(buf);
     }
 
     return len;
@@ -2266,6 +2495,75 @@ int mg_get_cookie(const char *cookie_header, const char *var_name,
     return len;
 }
 
+#if defined(USE_WEBSOCKET) || defined(USE_LUA)
+static void base64_encode(const unsigned char *src, int src_len, char *dst)
+{   /* Writes the '='-padded base64 text of src[0..src_len) plus a NUL to dst; dst must hold 4*ceil(src_len/3)+1 bytes. */
+    static const char *b64 =
+        "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+    int i, j, a, b, c;
+
+    for (i = j = 0; i < src_len; i += 3) {
+        a = src[i];
+        b = i + 1 >= src_len ? 0 : src[i + 1]; /* missing input bytes count as zero bits */
+        c = i + 2 >= src_len ? 0 : src[i + 2];
+
+        dst[j++] = b64[a >> 2];
+        dst[j++] = b64[((a & 3) << 4) | (b >> 4)];
+        if (i + 1 < src_len) {
+            dst[j++] = b64[(b & 15) << 2 | (c >> 6)];
+        }
+        if (i + 2 < src_len) {
+            dst[j++] = b64[c & 63];
+        }
+    }
+    while (j % 4 != 0) { /* pad the final group to four characters */
+        dst[j++] = '=';
+    }
+    dst[j++] = '\0';
+}
+
+static unsigned char b64reverse(char letter) { /* base64 character -> 6-bit value; 255 for '=', 254 for anything else */
+    if (letter>='A' && letter<='Z') return letter-'A';
+    if (letter>='a' && letter<='z') return letter-'a'+26;
+    if (letter>='0' && letter<='9') return letter-'0'+52;
+    if (letter=='+') return 62;
+    if (letter=='/') return 63;
+    if (letter=='=') return 255; /* normal end */
+    return 254; /* error */
+}
+
+static int base64_decode(const unsigned char *src, int src_len, char *dst, size_t *dst_len)
+{   /* Decodes src[0..src_len) into dst; returns -1 on success, else the input index at which decoding stopped. */
+    int i;
+    unsigned char a, b, c, d;
+
+    *dst_len = 0; /* counts the bytes stored in dst (no terminator is appended) */
+
+    for (i = 0; i < src_len; i += 4) {
+        a = b64reverse(src[i]);
+        if (a>=254) return i; /* the first two characters of a group must be real data */
+
+        b = b64reverse(i + 1 >= src_len ? 0 : src[i + 1]);
+        if (b>=254) return i+1;
+
+        c = b64reverse(i + 2 >= src_len ? 0 : src[i + 2]);
+        if (c==254) return i+2; /* '=' (255) is accepted from the third character on */
+
+        d = b64reverse(i + 3 >= src_len ? 0 : src[i + 3]);
+        if (d==254) return i+3; /* validate the fourth character itself, not c a second time */
+
+        dst[(*dst_len)++] = (a << 2) + (b >> 4);
+        if (c!=255) {
+            dst[(*dst_len)++] = (b << 4) + (c >> 2);
+            if (d!=255) {
+                dst[(*dst_len)++] = (c << 6) + d;
+            }
+        }
+    }
+    return -1;
+}
+#endif
+
 static void convert_uri_to_file_name(struct mg_connection *conn, char *buf,
                                      size_t buf_len, struct file *filep,
                                      int * is_script_ressource)
@@ -2400,25 +2698,27 @@ static time_t parse_date_string(const char *datetime)
     static const unsigned short days_before_month[] = {
         0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334
     };
-    char month_str[32];
+    char month_str[32]={0};
     int second, minute, hour, day, month, year, leap_days, days;
     time_t result = (time_t) 0;
 
-    if (((sscanf(datetime, "%d/%3s/%d %d:%d:%d",
-                 &day, month_str, &year, &hour, &minute, &second) == 6) ||
-         (sscanf(datetime, "%d %3s %d %d:%d:%d",
-                 &day, month_str, &year, &hour, &minute, &second) == 6) ||
-         (sscanf(datetime, "%*3s, %d %3s %d %d:%d:%d",
-                 &day, month_str, &year, &hour, &minute, &second) == 6) ||
-         (sscanf(datetime, "%d-%3s-%d %d:%d:%d",
-                 &day, month_str, &year, &hour, &minute, &second) == 6)) &&
-        year > 1970 &&
-        (month = get_month_index(month_str)) != -1) {
-        leap_days = num_leap_years(year) - num_leap_years(1970);
-        year -= 1970;
-        days = year * 365 + days_before_month[month] + (day - 1) + leap_days;
-        result = (time_t) days * 24 * 3600 + (time_t) hour * 3600 +
-                 minute * 60 + second;
+    if ((sscanf(datetime, "%d/%3s/%d %d:%d:%d",
+                &day, month_str, &year, &hour, &minute, &second) == 6) ||
+        (sscanf(datetime, "%d %3s %d %d:%d:%d",
+                &day, month_str, &year, &hour, &minute, &second) == 6) ||
+        (sscanf(datetime, "%*3s, %d %3s %d %d:%d:%d",
+                &day, month_str, &year, &hour, &minute, &second) == 6) ||
+        (sscanf(datetime, "%d-%3s-%d %d:%d:%d",
+                &day, month_str, &year, &hour, &minute, &second) == 6)) {
+
+        month = get_month_index(month_str);
+        if ((month >= 0) && (year > 1970)) {
+            leap_days = num_leap_years(year) - num_leap_years(1970);
+            year -= 1970;
+            days = year * 365 + days_before_month[month] + (day - 1) + leap_days;
+            result = (time_t) days * 24 * 3600 + (time_t) hour * 3600 +
+                     minute * 60 + second;
+        }
     }
 
     return result;
@@ -2700,6 +3000,7 @@ static int parse_auth_header(struct mg_connection *conn, char *buf,
 {
     char *name, *value, *s;
     const char *auth_header;
+    unsigned long nonce;
 
     (void) memset(ah, 0, sizeof(*ah));
     if ((auth_header = mg_get_header(conn, "Authorization")) == NULL ||
@@ -2750,6 +3051,27 @@ static int parse_auth_header(struct mg_connection *conn, char *buf,
         }
     }
 
+#ifndef NO_NONCE_CHECK
+    /* Convert the nonce from the client to a number and check it. */
+    /* Server side nonce check is valuable in all situations but one: if the server restarts frequently,
+       but the client should not see that, so the server should accept nonces from previous starts. */
+    if (ah->nonce == NULL) {
+        return 0;
+    }
+    nonce = strtoul(ah->nonce, &s, 10);
+    if ((s == NULL) || (*s != 0)) {
+        return 0;
+    }
+    nonce ^= (unsigned long)(conn->ctx);
+    if (nonce<conn->ctx->start_time) {
+        /* nonce is from a previous start of the server and no longer valid (replay attack?) */
+        return 0;
+    }
+    if (nonce>=conn->ctx->start_time+conn->ctx->nonce_count) {
+        return 0;
+    }
+#endif
+
     /* CGI needs it as REMOTE_USER */
     if (ah->user != NULL) {
         conn->request_info.remote_user = mg_strdup(ah->user);
@@ -2768,9 +3090,7 @@ static char *mg_fgets(char *buf, size_t size, struct file *filep, char **p)
 
     if (filep->membuf != NULL && *p != NULL) {
         memend = (char *) &filep->membuf[filep->size];
-        eof = (char *) memchr(*p, '\n', memend - *p); /* Search for \n from p
-                                                         till the end of
-                                                         stream */
+        eof = (char *) memchr(*p, '\n', memend - *p); /* Search for \n from p till the end of stream */
         if (eof != NULL) {
             eof += 1; /* Include \n */
         } else {
@@ -2801,9 +3121,12 @@ static int authorize(struct mg_connection *conn, struct file *filep)
     /* Loop over passwords file */
     p = (char *) filep->membuf;
     while (mg_fgets(line, sizeof(line), filep, &p) != NULL) {
-        if (sscanf(line, "%[^:]:%[^:]:%s", f_user, f_domain, ha1) != 3) {
+        if (sscanf(line, "%255[^:]:%255[^:]:%255s", f_user, f_domain, ha1) != 3) {
             continue;
         }
+        f_user[255]=0;
+        f_domain[255]=0;
+        ha1[255]=0;
 
         if (!strcmp(ah.user, f_user) &&
             !strcmp(conn->ctx->config[AUTHENTICATION_DOMAIN], f_domain))
@@ -2851,7 +3174,14 @@ static void send_authorization_request(struct mg_connection *conn)
 {
     char date[64];
     time_t curtime = time(NULL);
+    unsigned long nonce = (unsigned long)(conn->ctx->start_time);
 
+    (void)pthread_mutex_lock(&conn->ctx->mutex);
+    nonce += conn->ctx->nonce_count;
+    ++conn->ctx->nonce_count;
+    (void)pthread_mutex_unlock(&conn->ctx->mutex);
+
+    nonce ^= (unsigned long)(conn->ctx);
     conn->status_code = 401;
     conn->must_close = 1;
 
@@ -2865,7 +3195,7 @@ static void send_authorization_request(struct mg_connection *conn)
               "WWW-Authenticate: Digest qop=\"auth\", realm=\"%s\", nonce=\"%lu\"\r\n\r\n",
               date, suggest_connection_header(conn),
               conn->ctx->config[AUTHENTICATION_DOMAIN],
-              (unsigned long) time(NULL));
+              nonce);
 }
 
 static int is_authorized_for_put(struct mg_connection *conn)
@@ -2915,9 +3245,11 @@ int mg_modify_passwords_file(const char *fname, const char *domain,
 
     /* Copy the stuff to temporary file */
     while (fgets(line, sizeof(line), fp) != NULL) {
-        if (sscanf(line, "%[^:]:%[^:]:%*s", u, d) != 2) {
+        if (sscanf(line, "%255[^:]:%255[^:]:%*s", u, d) != 2) {
             continue;
         }
+        u[255]=0;
+        d[255]=0;
 
         if (!strcmp(u, user) && !strcmp(d, domain)) {
             found++;
@@ -3031,7 +3363,7 @@ static void print_dir_entry(struct de *de)
     if (tm != NULL) {
         strftime(mod, sizeof(mod), "%d-%b-%Y %H:%M", tm);
     } else {
-        strncpy(mod, "01-Jan-1970 00:00", sizeof(mod));
+        mg_strlcpy(mod, "01-Jan-1970 00:00", sizeof(mod));
         mod[sizeof(mod) - 1] = '\0';
     }
     mg_url_encode(de->file_name, href, sizeof(href));
@@ -3181,9 +3513,9 @@ struct dir_scan_data {
 /* Behaves like realloc(), but frees original pointer on failure */
 static void *realloc2(void *ptr, size_t size)
 {
-    void *new_ptr = realloc(ptr, size);
+    void *new_ptr = mg_realloc(ptr, size);
     if (new_ptr == NULL) {
-        free(ptr);
+        mg_free(ptr);
     }
     return new_ptr;
 }
@@ -3257,9 +3589,9 @@ static void handle_directory_request(struct mg_connection *conn,
               sizeof(data.entries[0]), compare_dir_entries);
         for (i = 0; i < data.num_entries; i++) {
             print_dir_entry(&data.entries[i]);
-            free(data.entries[i].file_name);
+            mg_free(data.entries[i].file_name);
         }
-        free(data.entries);
+        mg_free(data.entries);
     }
 
     conn->num_bytes_sent += mg_printf(conn, "%s", "</table></body></html>");
@@ -3336,8 +3668,7 @@ static void fclose_on_exec(struct file *filep, struct mg_connection *conn)
     }
 }
 
-static void handle_file_request(struct mg_connection *conn, const char *path,
-                                struct file *filep)
+static void handle_static_file_request(struct mg_connection *conn, const char *path, struct file *filep)
 {
     char date[64], lm[64], etag[64], range[64];
     const char *msg = "OK", *hdr;
@@ -3346,7 +3677,8 @@ static void handle_file_request(struct mg_connection *conn, const char *path,
     struct vec mime_vec;
     int n;
     char gz_path[PATH_MAX];
-    char const* encoding = "";
+    const char *encoding = "";
+    const char *cors1, *cors2, *cors3;
 
     get_mime_type(conn->ctx, path, &mime_vec);
     cl = filep->size;
@@ -3383,7 +3715,7 @@ static void handle_file_request(struct mg_connection *conn, const char *path,
             return;
         }
         conn->status_code = 206;
-cl = n == 2 ? (r2 > cl ? cl : r2) - r1 + 1: cl - r1;
+        cl = n == 2 ? (r2 > cl ? cl : r2) - r1 + 1: cl - r1;
         mg_snprintf(conn, range, sizeof(range),
                     "Content-Range: bytes "
                     "%" INT64_FMT "-%"
@@ -3392,6 +3724,17 @@ cl = n == 2 ? (r2 > cl ? cl : r2) - r1 + 1: cl - r1;
         msg = "Partial Content";
     }
 
+    hdr = mg_get_header(conn, "Origin");
+    if (hdr) {
+        /* Cross-origin resource sharing (CORS), see http://www.html5rocks.com/en/tutorials/cors/,
+           http://www.html5rocks.com/static/images/cors_server_flowchart.png - preflight is not supported for files. */
+        cors1 = "Access-Control-Allow-Origin: ";
+        cors2 = conn->ctx->config[ACCESS_CONTROL_ALLOW_ORIGIN];
+        cors3 = "\r\n";
+    } else {
+        cors1 = cors2 = cors3 = "";
+    }
+
     /* Prepare Etag, Date, Last-Modified headers. Must be in UTC, according to
        http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.3 */
     gmt_time_string(date, sizeof(date), &curtime);
@@ -3400,6 +3743,7 @@ cl = n == 2 ? (r2 > cl ? cl : r2) - r1 + 1: cl - r1;
 
     (void) mg_printf(conn,
                      "HTTP/1.1 %d %s\r\n"
+                     "%s%s%s"
                      "Date: %s\r\n"
                      "Last-Modified: %s\r\n"
                      "Etag: %s\r\n"
@@ -3408,7 +3752,9 @@ cl = n == 2 ? (r2 > cl ? cl : r2) - r1 + 1: cl - r1;
                      "Connection: %s\r\n"
                      "Accept-Ranges: bytes\r\n"
                      "%s%s\r\n",
-                     conn->status_code, msg, date, lm, etag, (int) mime_vec.len,
+                     conn->status_code, msg,
+                     cors1, cors2, cors3,
+                     date, lm, etag, (int) mime_vec.len,
                      mime_vec.ptr, cl, suggest_connection_header(conn), range, encoding);
 
     if (strcmp(conn->request_info.request_method, "HEAD") != 0) {
@@ -3421,7 +3767,7 @@ void mg_send_file(struct mg_connection *conn, const char *path)
 {
     struct file file = STRUCT_FILE_INITIALIZER;
     if (mg_stat(conn, path, &file)) {
-        handle_file_request(conn, path, &file);
+        handle_static_file_request(conn, path, &file);
     } else {
         send_http_error(conn, 404, "Not Found", "%s", "File not found");
     }
@@ -3744,6 +4090,11 @@ static void prepare_cgi_environment(struct mg_connection *conn,
         addenv(blk, "PATH_INFO=%s", conn->path_info);
     }
 
+    if (conn->status_code > 0) {
+        /* CGI error handler should show the status code */
+        addenv(blk, "STATUS=%d", conn->status_code);
+    }
+
 #if defined(_WIN32)
     if ((s = getenv("COMSPEC")) != NULL) {
         addenv(blk, "COMSPEC=%s", s);
@@ -3884,7 +4235,7 @@ static void handle_cgi_request(struct mg_connection *conn, const char *prog)
        Do not send anything back to client, until we buffer in all
        HTTP headers. */
     data_len = 0;
-    buf = malloc(buflen);
+    buf = mg_malloc(buflen);
     if (buf == NULL) {
         send_http_error(conn, 500, http_500_error,
                         "Not enough memory for buffer (%u bytes)",
@@ -3967,7 +4318,7 @@ done:
         close(fdout[0]);
     }
     if (buf != NULL) {
-        free(buf);
+        mg_free(buf);
     }
 }
 #endif /* !NO_CGI */
@@ -4102,29 +4453,33 @@ static void send_ssi_file(struct mg_connection *, const char *,
 static void do_ssi_include(struct mg_connection *conn, const char *ssi,
                            char *tag, int include_level)
 {
-    char file_name[MG_BUF_LEN], path[PATH_MAX], *p;
+    char file_name[MG_BUF_LEN], path[512], *p;
     struct file file = STRUCT_FILE_INITIALIZER;
+    size_t len;
 
     /* sscanf() is safe here, since send_ssi_file() also uses buffer
        of size MG_BUF_LEN to get the tag. So strlen(tag) is
        always < MG_BUF_LEN. */
-    if (sscanf(tag, " virtual=\"%[^\"]\"", file_name) == 1) {
+    if (sscanf(tag, " virtual=\"%511[^\"]\"", file_name) == 1) {
         /* File name is relative to the webserver root */
+        file_name[511]=0;
         (void) mg_snprintf(conn, path, sizeof(path), "%s%c%s",
                            conn->ctx->config[DOCUMENT_ROOT], '/', file_name);
-    } else if (sscanf(tag, " abspath=\"%[^\"]\"", file_name) == 1) {
+    } else if (sscanf(tag, " abspath=\"%511[^\"]\"", file_name) == 1) {
         /* File name is relative to the webserver working directory
            or it is absolute system path */
+        file_name[511]=0;
         (void) mg_snprintf(conn, path, sizeof(path), "%s", file_name);
-    } else if (sscanf(tag, " file=\"%[^\"]\"", file_name) == 1 ||
-               sscanf(tag, " \"%[^\"]\"", file_name) == 1) {
+    } else if (sscanf(tag, " file=\"%511[^\"]\"", file_name) == 1 ||
+               sscanf(tag, " \"%511[^\"]\"", file_name) == 1) {
         /* File name is relative to the currect document */
+        file_name[511]=0;
         (void) mg_snprintf(conn, path, sizeof(path), "%s", ssi);
         if ((p = strrchr(path, '/')) != NULL) {
             p[1] = '\0';
         }
-        (void) mg_snprintf(conn, path + strlen(path),
-                           sizeof(path) - strlen(path), "%s", file_name);
+        len = strlen(path);
+        (void) mg_snprintf(conn, path + len, sizeof(path) - len, "%s", file_name);
     } else {
         mg_cry(conn, "Bad SSI #include: [%s]", tag);
         return;
@@ -4148,16 +4503,19 @@ static void do_ssi_include(struct mg_connection *conn, const char *ssi,
 #if !defined(NO_POPEN)
 static void do_ssi_exec(struct mg_connection *conn, char *tag)
 {
-    char cmd[MG_BUF_LEN] = "";
+    char cmd[1024] = "";
     struct file file = STRUCT_FILE_INITIALIZER;
 
-    if (sscanf(tag, " \"%[^\"]\"", cmd) != 1) {
+    if (sscanf(tag, " \"%1023[^\"]\"", cmd) != 1) {
         mg_cry(conn, "Bad SSI #exec: [%s]", tag);
-    } else if ((file.fp = popen(cmd, "r")) == NULL) {
-        mg_cry(conn, "Cannot SSI #exec: [%s]: %s", cmd, strerror(ERRNO));
     } else {
-        send_file_data(conn, &file, 0, INT64_MAX);
-        pclose(file.fp);
+        cmd[1023]=0;
+        if ((file.fp = popen(cmd, "r")) == NULL) {
+            mg_cry(conn, "Cannot SSI #exec: [%s]: %s", cmd, strerror(ERRNO));
+        } else {
+            send_file_data(conn, &file, 0, INT64_MAX);
+            pclose(file.fp);
+        }
     }
 }
 #endif /* !NO_POPEN */
@@ -4243,6 +4601,16 @@ static void handle_ssi_file_request(struct mg_connection *conn,
     struct file file = STRUCT_FILE_INITIALIZER;
     char date[64];
     time_t curtime = time(NULL);
+    const char *cors1, *cors2, *cors3;
+
+    if (mg_get_header(conn, "Origin")) {
+        /* Cross-origin resource sharing (CORS). */
+        cors1 = "Access-Control-Allow-Origin: ";
+        cors2 = conn->ctx->config[ACCESS_CONTROL_ALLOW_ORIGIN];
+        cors3 = "\r\n";
+    } else {
+        cors1 = cors2 = cors3 = "";
+    }
 
     if (!mg_fopen(conn, path, "rb", &file)) {
         send_http_error(conn, 500, http_500_error, "fopen(%s): %s", path,
@@ -4252,10 +4620,12 @@ static void handle_ssi_file_request(struct mg_connection *conn,
         gmt_time_string(date, sizeof(date), &curtime);
         fclose_on_exec(&file, conn);
         mg_printf(conn, "HTTP/1.1 200 OK\r\n"
+                        "%s%s%s"
                         "Date: %s\r\n"
                         "Content-Type: text/html\r\n"
                         "Connection: %s\r\n\r\n",
-                  date, suggest_connection_header(conn));
+                        cors1, cors2, cors3,
+                        date, suggest_connection_header(conn));
         send_ssi_file(conn, path, &file, 0);
         mg_fclose(&file);
     }
@@ -4563,32 +4933,6 @@ static void SHA1Final(unsigned char digest[20], SHA1_CTX* context)
 }
 /* END OF SHA1 CODE */
 
-static void base64_encode(const unsigned char *src, int src_len, char *dst)
-{
-    static const char *b64 =
-        "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
-    int i, j, a, b, c;
-
-    for (i = j = 0; i < src_len; i += 3) {
-        a = src[i];
-        b = i + 1 >= src_len ? 0 : src[i + 1];
-        c = i + 2 >= src_len ? 0 : src[i + 2];
-
-        dst[j++] = b64[a >> 2];
-        dst[j++] = b64[((a & 3) << 4) | (b >> 4)];
-        if (i + 1 < src_len) {
-            dst[j++] = b64[(b & 15) << 2 | (c >> 6)];
-        }
-        if (i + 2 < src_len) {
-            dst[j++] = b64[c & 63];
-        }
-    }
-    while (j % 4 != 0) {
-        dst[j++] = '=';
-    }
-    dst[j++] = '\0';
-}
-
 static void send_websocket_handshake(struct mg_connection *conn)
 {
     static const char *magic = "258EAFA5-E914-47DA-95CA-C5AB0DC85B11";
@@ -4656,15 +5000,15 @@ static void read_websocket(struct mg_connection *conn)
             }
         }
 
-        if (header_len > 0) {
+        if (header_len > 0 && body_len >= header_len) {
             /* Allocate space to hold websocket payload */
             data = mem;
             if (data_len > sizeof(mem)) {
-                data = (char *)malloc(data_len);
+                data = (char *)mg_malloc(data_len);
                 if (data == NULL) {
                     /* Allocation failed, exit the loop and then close the
                        connection */
-                    mg_cry(conn, "websocket malloc() failed; closing connection");
+                    mg_cry(conn, "websocket out of memory; closing connection");
                     break;
                 }
             }
@@ -4687,7 +5031,7 @@ static void read_websocket(struct mg_connection *conn)
                 error = 0;
                 while (len < data_len) {
                     int n = pull(NULL, conn, data + len, (int)(data_len - len));
-                    if (n < 0) {
+                    if (n <= 0) {
                         error = 1;
                         break;
                     }
@@ -4738,7 +5082,7 @@ static void read_websocket(struct mg_connection *conn)
             }
 
             if (data != mem) {
-                free(data);
+                mg_free(data);
             }
             /* Not breaking the loop, process next websocket frame. */
         } else {
@@ -4797,6 +5141,12 @@ int mg_websocket_write(struct mg_connection* conn, int opcode, const char* data,
 static void handle_websocket_request(struct mg_connection *conn, const char *path, int is_script_resource)
 {
     const char *version = mg_get_header(conn, "Sec-WebSocket-Version");
+#ifdef USE_LUA
+    int lua_websock, shared_lua_websock = 0;
+    /* TODO: A websocket script may be shared between several clients, allowing them to communicate
+             directly instead of writing to a data base and polling the data base. */
+#endif
+
     if (version == NULL || strcmp(version, "13") != 0) {
         send_http_error(conn, 426, "Upgrade Required", "%s", "Upgrade Required");
     } else if (conn->ctx->callbacks.websocket_connect != NULL &&
@@ -4805,10 +5155,14 @@ static void handle_websocket_request(struct mg_connection *conn, const char *pat
         /* The C callback is called before Lua and may prevent Lua from handling the websocket. */
     } else {
 #ifdef USE_LUA
-        if (match_prefix(conn->ctx->config[LUA_SCRIPT_EXTENSIONS],
-                         (int)strlen(conn->ctx->config[LUA_SCRIPT_EXTENSIONS]),
-                         path)) {
-            conn->lua_websocket_state = new_lua_websocket(path, conn);
+        lua_websock = conn->ctx->config[LUA_WEBSOCKET_EXTENSIONS] ?
+                          match_prefix(conn->ctx->config[LUA_WEBSOCKET_EXTENSIONS],
+                                       (int)strlen(conn->ctx->config[LUA_WEBSOCKET_EXTENSIONS]),
+                                       path) : 0;
+
+        if (lua_websock || shared_lua_websock) {
+            /* TODO */ shared_lua_websock = 0;
+            conn->lua_websocket_state = lua_websocket_new(path, conn, !!shared_lua_websock);
             if (conn->lua_websocket_state) {
                 send_websocket_handshake(conn);
                 if (lua_websocket_ready(conn)) {
@@ -4933,6 +5287,7 @@ int mg_upload(struct mg_connection *conn, const char *destination_dir)
         return num_uploaded_files;
     }
 
+    boundary[99]=0;
     boundary_len = (int)strlen(boundary);
     bl = boundary_len + 4;  /* \r\n--<boundary> */
     for (;;) {
@@ -4954,6 +5309,7 @@ int mg_upload(struct mg_connection *conn, const char *destination_dir)
                    parse the header properly instead. */
                 IGNORE_UNUSED_RESULT(sscanf(&buf[j], "Content-Disposition: %*s %*s filename=\"%1023[^\"]",
                                             fname));
+                fname[1023]=0;
                 j = i + 2;
             }
         }
@@ -5046,7 +5402,7 @@ static void redirect_to_https_port(struct mg_connection *conn, int ssl_index)
     if (host_header != NULL) {
         char *pos;
 
-        strncpy(host, host_header, hostlen);
+        mg_strlcpy(host, host_header, hostlen);
         host[hostlen - 1] = '\0';
         pos = strchr(host, ':');
         if (pos != NULL) {
@@ -5071,7 +5427,7 @@ void mg_set_request_handler(struct mg_context *ctx, const char *uri, mg_request_
 
     /* first see it the uri exists */
     for (tmp_rh = ctx->request_handlers;
-         tmp_rh != NULL && strcmp(uri, tmp_rh->uri);
+         tmp_rh != NULL && 0!=strcmp(uri, tmp_rh->uri);
          lastref = tmp_rh, tmp_rh = tmp_rh->next) {
         /* first try for an exact match */
         if (urilen == tmp_rh->uri_len && !strcmp(tmp_rh->uri,uri)) {
@@ -5087,8 +5443,8 @@ void mg_set_request_handler(struct mg_context *ctx, const char *uri, mg_request_
                     lastref->next = tmp_rh->next;
                 else
                     ctx->request_handlers = tmp_rh->next;
-                free(tmp_rh->uri);
-                free(tmp_rh);
+                mg_free(tmp_rh->uri);
+                mg_free(tmp_rh);
             }
             return;
         }
@@ -5109,13 +5465,13 @@ void mg_set_request_handler(struct mg_context *ctx, const char *uri, mg_request_
         return;
     }
 
-    tmp_rh = (struct mg_request_handler_info *)malloc(sizeof(struct mg_request_handler_info));
+    tmp_rh = (struct mg_request_handler_info *)mg_malloc(sizeof(struct mg_request_handler_info));
     if (tmp_rh == NULL) {
         mg_cry(fc(ctx), "%s", "Cannot create new request handler struct, OOM");
         return;
     }
     tmp_rh->uri = mg_strdup(uri);
-    tmp_rh->uri_len = strlen(uri);
+    tmp_rh->uri_len = urilen;
     tmp_rh->handler = handler;
     tmp_rh->cbdata = cbdata;
 
@@ -5151,6 +5507,11 @@ static int use_request_handler(struct mg_connection *conn)
 
             return tmp_rh->handler(conn, tmp_rh->cbdata);
         }
+
+        /* try for pattern match */
+        if (match_prefix(tmp_rh->uri, tmp_rh->uri_len, uri) > 0) {
+           return tmp_rh->handler(conn, tmp_rh->cbdata);
+        }
 
     }
 
@@ -5202,6 +5563,8 @@ static void handle_request(struct mg_connection *conn)
                use_request_handler(conn)) {
         /* Do nothing, callback has served the request */
     } else if (!is_script_resource && !strcmp(ri->request_method, "OPTIONS")) {
+        /* Scripts should support the OPTIONS method themselves, to allow a maximum flexibility.
+           Lua and CGI scripts may fully support CORS this way (including preflights). */
         send_options(conn);
     } else if (conn->ctx->config[DOCUMENT_ROOT] == NULL) {
         send_http_error(conn, 404, "Not Found", "Not Found");
@@ -5254,6 +5617,14 @@ static void handle_request(struct mg_connection *conn)
             send_http_error(conn, 403, "Directory Listing Denied",
                             "Directory listing denied");
         }
+    } else {
+        handle_file_based_request(conn, path, &file);
+    }
+}
+
+static void handle_file_based_request(struct mg_connection *conn, const char *path, struct file *file)
+{
+    if (0) {
 #ifdef USE_LUA
     } else if (match_prefix(conn->ctx->config[LUA_SERVER_PAGE_EXTENSIONS],
                             (int)strlen(conn->ctx->config[LUA_SERVER_PAGE_EXTENSIONS]),
@@ -5270,25 +5641,17 @@ static void handle_request(struct mg_connection *conn)
     } else if (match_prefix(conn->ctx->config[CGI_EXTENSIONS],
                             (int)strlen(conn->ctx->config[CGI_EXTENSIONS]),
                             path) > 0) {
-        /* TODO: check unsupported methods -> 501
-        if (strcmp(ri->request_method, "POST") &&
-            strcmp(ri->request_method, "HEAD") &&
-            strcmp(ri->request_method, "GET")) {
-            send_http_error(conn, 501, "Not Implemented",
-                            "Method %s is not implemented", ri->request_method);
-        } else {
-            handle_cgi_request(conn, path);
-        } */
+        /* CGI scripts may support all HTTP methods */
         handle_cgi_request(conn, path);
 #endif /* !NO_CGI */
     } else if (match_prefix(conn->ctx->config[SSI_EXTENSIONS],
                             (int)strlen(conn->ctx->config[SSI_EXTENSIONS]),
                             path) > 0) {
         handle_ssi_file_request(conn, path);
-    } else if (is_not_modified(conn, &file)) {
+    } else if ((!conn->in_error_handler) && is_not_modified(conn, file)) {
         send_http_error(conn, 304, "Not Modified", "%s", "");
     } else {
-        handle_file_request(conn, path, &file);
+        handle_static_file_request(conn, path, file);
     }
 }
 
@@ -5297,8 +5660,12 @@ static void close_all_listening_sockets(struct mg_context *ctx)
     int i;
     for (i = 0; i < ctx->num_listening_sockets; i++) {
         closesocket(ctx->listening_sockets[i].sock);
+        ctx->listening_sockets[i].sock = INVALID_SOCKET;
     }
-    free(ctx->listening_sockets);
+    mg_free(ctx->listening_sockets);
+    ctx->listening_sockets=0;
+    mg_free(ctx->listening_ports);
+    ctx->listening_ports=0;
 }
 
 static int is_valid_port(unsigned int port)
@@ -5311,9 +5678,10 @@ static int is_valid_port(unsigned int port)
    TODO(lsm): add parsing of the IPv6 address */
 static int parse_port_string(const struct vec *vec, struct socket *so)
 {
-    unsigned int a, b, c, d, ch, len, port;
+    unsigned int a, b, c, d, port;
+    int  ch, len;
 #if defined(USE_IPV6)
-    char buf[100];
+    char buf[100]={0};
 #endif
 
     /* MacOS needs that. If we do not zero it, subsequent bind() will fail.
@@ -5327,7 +5695,6 @@ static int parse_port_string(const struct vec *vec, struct socket *so)
         so->lsa.sin.sin_addr.s_addr = htonl((a << 24) | (b << 16) | (c << 8) | d);
         so->lsa.sin.sin_port = htons((uint16_t) port);
 #if defined(USE_IPV6)
-
     } else if (sscanf(vec->ptr, "[%49[^]]]:%d%n", buf, &port, &len) == 2 &&
                inet_pton(AF_INET6, buf, &so->lsa.sin6.sin6_addr)) {
         /* IPv6 address, e.g. [3ffe:2a00:100:7031::1]:8080 */
@@ -5341,6 +5708,7 @@ static int parse_port_string(const struct vec *vec, struct socket *so)
         port = len = 0;   /* Parsing failure. Make port invalid. */
     }
 
+    assert((len>=0) && ((unsigned)len<=(unsigned)vec->len)); /* sscanf and the option splitting code ensure this condition */
     ch = vec->ptr[len];  /* Next character after the port number */
     so->is_ssl = ch == 's';
     so->ssl_redir = ch == 'r';
@@ -5361,11 +5729,11 @@ static int set_ports_option(struct mg_context *ctx)
     struct socket so, *ptr;
 
     in_port_t *portPtr;
-    struct sockaddr_in sin;
+    union usa usa;
     socklen_t len;
 
-    memset(&sin, 0, sizeof(sin));
-    len = sizeof(sin);
+    memset(&usa, 0, sizeof(usa));
+    len = sizeof(usa);
 
     while (success && (list = next_option(list, &vec, NULL)) != NULL) {
         if (!parse_port_string(&vec, &so)) {
@@ -5389,22 +5757,25 @@ static int set_ports_option(struct mg_context *ctx)
                    bind(so.sock, &so.lsa.sa, so.lsa.sa.sa_family == AF_INET ?
                         sizeof(so.lsa.sin) : sizeof(so.lsa)) != 0 ||
                    listen(so.sock, SOMAXCONN) != 0 ||
-                   getsockname(so.sock, (struct sockaddr *)&sin, &len) != 0) {
+                   getsockname(so.sock, &(usa.sa), &len) != 0) {
             mg_cry(fc(ctx), "%s: cannot bind to %.*s: %d (%s)", __func__,
                    (int) vec.len, vec.ptr, ERRNO, strerror(errno));
             if (so.sock != INVALID_SOCKET) {
                 closesocket(so.sock);
+                so.sock = INVALID_SOCKET;
             }
             success = 0;
-        } else if ((ptr = (struct socket *) realloc(ctx->listening_sockets,
+        } else if ((ptr = (struct socket *) mg_realloc(ctx->listening_sockets,
                           (ctx->num_listening_sockets + 1) *
                           sizeof(ctx->listening_sockets[0]))) == NULL) {
             closesocket(so.sock);
+            so.sock = INVALID_SOCKET;
             success = 0;
-        } else if ((portPtr = (in_port_t*) realloc(ctx->listening_ports,
+        } else if ((portPtr = (in_port_t*) mg_realloc(ctx->listening_ports,
                           (ctx->num_listening_sockets + 1) *
                           sizeof(ctx->listening_ports[0]))) == NULL) {
             closesocket(so.sock);
+            so.sock = INVALID_SOCKET;
             success = 0;
         }
         else {
@@ -5412,7 +5783,7 @@ static int set_ports_option(struct mg_context *ctx)
             ctx->listening_sockets = ptr;
             ctx->listening_sockets[ctx->num_listening_sockets] = so;
             ctx->listening_ports = portPtr;
-            ctx->listening_ports[ctx->num_listening_sockets] = ntohs(sin.sin_port);
+            ctx->listening_ports[ctx->num_listening_sockets] = ntohs(usa.sin.sin_port);
             ctx->num_listening_sockets++;
         }
     }
@@ -5453,7 +5824,7 @@ static void log_access(const struct mg_connection *conn)
     if (tm != NULL) {
         strftime(date, sizeof(date), "%d/%b/%Y:%H:%M:%S %z", tm);
     } else {
-        strncpy(date, "01/Jan/1970:00:00:00 +0000", sizeof(date));
+        mg_strlcpy(date, "01/Jan/1970:00:00:00 +0000", sizeof(date));
         date[sizeof(date) - 1] = '\0';
     }
 
@@ -5650,7 +6021,7 @@ static int set_ssl_option(struct mg_context *ctx)
     /* Initialize locking callbacks, needed for thread safety.
        http://www.openssl.org/support/faq.html#PROG1 */
     size = sizeof(pthread_mutex_t) * CRYPTO_num_locks();
-    if ((ssl_mutexes = (pthread_mutex_t *) malloc((size_t)size)) == NULL) {
+    if ((ssl_mutexes = (pthread_mutex_t *) mg_malloc((size_t)size)) == NULL) {
         mg_cry(fc(ctx), "%s: cannot allocate mutexes: %s", __func__, ssl_error());
         return 0;
     }
@@ -5738,6 +6109,7 @@ static void close_socket_gracefully(struct mg_connection *conn)
 
     /* Now we know that our FIN is ACK-ed, safe to close */
     closesocket(conn->client.sock);
+    conn->client.sock = INVALID_SOCKET;
 }
 
 static void close_connection(struct mg_connection *conn)
@@ -5781,13 +6153,10 @@ void mg_close_connection(struct mg_connection *conn)
 #endif
     close_connection(conn);
     (void) pthread_mutex_destroy(&conn->mutex);
-    free(conn);
+    mg_free(conn);
 }
 
 struct mg_connection *mg_connect(const char *host, int port, int use_ssl,
-                                 char *ebuf, size_t ebuf_len);
-
-struct mg_connection *mg_connect(const char *host, int port, int use_ssl,
                                  char *ebuf, size_t ebuf_len)
 {
     static struct mg_context fake_ctx;
@@ -5797,15 +6166,17 @@ struct mg_connection *mg_connect(const char *host, int port, int use_ssl,
     if ((sock = conn2(&fake_ctx, host, port, use_ssl, ebuf,
                       ebuf_len)) == INVALID_SOCKET) {
     } else if ((conn = (struct mg_connection *)
-                       calloc(1, sizeof(*conn) + MAX_REQUEST_SIZE)) == NULL) {
+                       mg_calloc(1, sizeof(*conn) + MAX_REQUEST_SIZE)) == NULL) {
         snprintf(ebuf, ebuf_len, "calloc(): %s", strerror(ERRNO));
         closesocket(sock);
+        sock = INVALID_SOCKET;
 #ifndef NO_SSL
     } else if (use_ssl && (conn->client_ssl_ctx =
                                SSL_CTX_new(SSLv23_client_method())) == NULL) {
         snprintf(ebuf, ebuf_len, "SSL_CTX_new error");
         closesocket(sock);
-        free(conn);
+        sock = INVALID_SOCKET;
+        mg_free(conn);
         conn = NULL;
 #endif /* NO_SSL */
     } else {
@@ -5860,13 +6231,19 @@ static int getreq(struct mg_connection *conn, char *ebuf, size_t ebuf_len)
                                   &conn->request_info) <= 0) {
         snprintf(ebuf, ebuf_len, "Bad request: [%.*s]", conn->data_len, conn->buf);
     } else {
-        /* Request is valid */
+        /* Message is a valid request or response */
         if ((cl = get_header(&conn->request_info, "Content-Length")) != NULL) {
+            /* Request/response has content length set */
             conn->content_len = strtoll(cl, NULL, 10);
         } else if (!mg_strcasecmp(conn->request_info.request_method, "POST") ||
                    !mg_strcasecmp(conn->request_info.request_method, "PUT")) {
+            /* POST or PUT request without content length set */
+            conn->content_len = -1;
+        } else if (!mg_strncasecmp(conn->request_info.request_method, "HTTP/", 5)) {
+            /* Response without content length set */
             conn->content_len = -1;
         } else {
+            /* Other request */
             conn->content_len = 0;
         }
         conn->birth_time = time(NULL);
@@ -5931,7 +6308,7 @@ static void process_new_connection(struct mg_connection *conn)
             log_access(conn);
         }
         if (ri->remote_user != NULL) {
-            free((void *) ri->remote_user);
+            mg_free((void *) ri->remote_user);
             /* Important! When having connections with and without auth
                would cause double free and then crash */
             ri->remote_user = NULL;
@@ -5998,7 +6375,7 @@ static void *worker_thread_run(void *thread_func_param)
     tls.pthread_cond_helper_mutex = CreateEvent(NULL, FALSE, FALSE, NULL);
 #endif
 
-    conn = (struct mg_connection *) calloc(1, sizeof(*conn) + MAX_REQUEST_SIZE);
+    conn = (struct mg_connection *) mg_calloc(1, sizeof(*conn) + MAX_REQUEST_SIZE);
     if (conn == NULL) {
         mg_cry(fc(ctx), "%s", "Cannot create new connection struct, OOM");
     } else {
@@ -6050,7 +6427,7 @@ static void *worker_thread_run(void *thread_func_param)
 #if defined(_WIN32) && !defined(__SYMBIAN32__)
     CloseHandle(tls.pthread_cond_helper_mutex);
 #endif
-    free(conn);
+    mg_free(conn);
 
     DEBUG_TRACE(("exiting"));
     return NULL;
@@ -6120,6 +6497,7 @@ static void accept_new_connection(const struct socket *listener,
         sockaddr_to_string(src_addr, sizeof(src_addr), &so.rsa);
         mg_cry(fc(ctx), "%s: %s is not allowed to connect", __func__, src_addr);
         closesocket(so.sock);
+        so.sock = INVALID_SOCKET;
     } else {
         /* Put so socket structure into the queue */
         DEBUG_TRACE(("Accepted socket %d", (int) so.sock));
@@ -6171,13 +6549,18 @@ static void master_thread_run(void *thread_func_param)
     }
 #endif
 
+    /* Initialize thread local storage */
 #if defined(_WIN32) && !defined(__SYMBIAN32__)
     tls.pthread_cond_helper_mutex = CreateEvent(NULL, FALSE, FALSE, NULL);
 #endif
     tls.is_master = 1;
     pthread_setspecific(sTlsKey, &tls);
 
-    pfd = (struct pollfd *) calloc(ctx->num_listening_sockets, sizeof(pfd[0]));
+    /* Server starts *now* */
+    ctx->start_time = (unsigned long)time(NULL);
+
+    /* Allocate memory for the listening sockets, and start the server */
+    pfd = (struct pollfd *) mg_calloc(ctx->num_listening_sockets, sizeof(pfd[0]));
     while (pfd != NULL && ctx->stop_flag == 0) {
         for (i = 0; i < ctx->num_listening_sockets; i++) {
             pfd[i].fd = ctx->listening_sockets[i].sock;
@@ -6197,7 +6580,7 @@ static void master_thread_run(void *thread_func_param)
             }
         }
     }
-    free(pfd);
+    mg_free(pfd);
     DEBUG_TRACE(("stopping workers"));
 
     /* Stop signal received: somebody called mg_stop. Quit. */
@@ -6219,12 +6602,6 @@ static void master_thread_run(void *thread_func_param)
         mg_join_thread(ctx->workerthreadids[i]);
     }
 
-    /* All threads exited, no sync is needed. Destroy mutex and condvars */
-    (void) pthread_mutex_destroy(&ctx->mutex);
-    (void) pthread_cond_destroy(&ctx->cond);
-    (void) pthread_cond_destroy(&ctx->sq_empty);
-    (void) pthread_cond_destroy(&ctx->sq_full);
-
 #if !defined(NO_SSL)
     uninitialize_ssl(ctx);
 #endif
@@ -6265,21 +6642,27 @@ static void free_context(struct mg_context *ctx)
     if (ctx == NULL)
         return;
 
+    /* All threads exited, no sync is needed. Destroy mutex and condvars */
+    (void) pthread_mutex_destroy(&ctx->mutex);
+    (void) pthread_cond_destroy(&ctx->cond);
+    (void) pthread_cond_destroy(&ctx->sq_empty);
+    (void) pthread_cond_destroy(&ctx->sq_full);
+
     /* Deallocate config parameters */
     for (i = 0; i < NUM_OPTIONS; i++) {
         if (ctx->config[i] != NULL)
 #ifdef WIN32
 #pragma warning(suppress: 6001)
 #endif
-            free(ctx->config[i]);
+            mg_free(ctx->config[i]);
     }
 
     /* Deallocate request handlers */
     while (ctx->request_handlers) {
         tmp_rh = ctx->request_handlers;
         ctx->request_handlers = tmp_rh->next;
-        free(tmp_rh->uri);
-        free(tmp_rh);
+        mg_free(tmp_rh->uri);
+        mg_free(tmp_rh);
     }
 
 #ifndef NO_SSL
@@ -6288,14 +6671,14 @@ static void free_context(struct mg_context *ctx)
         SSL_CTX_free(ctx->ssl_ctx);
     }
     if (ssl_mutexes != NULL) {
-        free(ssl_mutexes);
+        mg_free(ssl_mutexes);
         ssl_mutexes = NULL;
     }
 #endif /* !NO_SSL */
 
     /* Deallocate worker thread ID array */
     if (ctx->workerthreadids != NULL) {
-        free(ctx->workerthreadids);
+        mg_free(ctx->workerthreadids);
     }
 
     /* Deallocate the tls variable */
@@ -6304,8 +6687,11 @@ static void free_context(struct mg_context *ctx)
         pthread_key_delete(sTlsKey);
     }
 
+    /* deallocate system name string */
+    mg_free(ctx->systemName);
+
     /* Deallocate context itself */
-    free(ctx);
+    mg_free(ctx);
 }
 
 void mg_stop(struct mg_context *ctx)
@@ -6324,6 +6710,35 @@ void mg_stop(struct mg_context *ctx)
 #endif /* _WIN32 && !__SYMBIAN32__ */
 }
 
+void get_system_name(char **sysName)
+{
+#if defined(_WIN32)
+#if !defined(__SYMBIAN32__)
+    char name[128];
+    DWORD dwVersion = 0;
+    DWORD dwMajorVersion = 0;
+    DWORD dwMinorVersion = 0;
+    DWORD dwBuild = 0;
+
+    dwVersion = GetVersion();
+
+    dwMajorVersion = (DWORD)(LOBYTE(LOWORD(dwVersion)));
+    dwMinorVersion = (DWORD)(HIBYTE(LOWORD(dwVersion)));
+    dwBuild = ((dwVersion < 0x80000000) ? (DWORD)(HIWORD(dwVersion)) : 0);
+
+    sprintf(name, "Windows %d.%d", dwMajorVersion, dwMinorVersion);
+    *sysName = mg_strdup(name);
+#else
+    *sysName = mg_strdup("Symbian");
+#endif
+#else
+    struct utsname name;
+    memset(&name, 0, sizeof(name));
+    uname(&name);
+    *sysName = mg_strdup(name.sysname);
+#endif
+}
+
 struct mg_context *mg_start(const struct mg_callbacks *callbacks,
                             void *user_data,
                             const char **options)
@@ -6342,26 +6757,33 @@ struct mg_context *mg_start(const struct mg_callbacks *callbacks,
 
     /* Check if the config_options and the corresponding enum have compatible sizes. */
     /* Could use static_assert, once it is verified that all compilers support this. */
-    assert(sizeof(config_options)/sizeof(config_options[0]) == NUM_OPTIONS*2+1);
+    assert(sizeof(config_options)/sizeof(config_options[0]) == NUM_OPTIONS+1);
 
     /* Allocate context and initialize reasonable general case defaults.
        TODO(lsm): do proper error handling here. */
-    if ((ctx = (struct mg_context *) calloc(1, sizeof(*ctx))) == NULL) {
+    if ((ctx = (struct mg_context *) mg_calloc(1, sizeof(*ctx))) == NULL) {
         return NULL;
     }
 
     if (sTlsInit==0) {
         if (0 != pthread_key_create(&sTlsKey, NULL)) {
             mg_cry(fc(ctx), "Cannot initialize thread local storage");
+            mg_free(ctx);
             return NULL;
         }
         sTlsInit++;
     }
 
-    ctx->callbacks = *callbacks;
+    if (callbacks) {
+        ctx->callbacks = *callbacks;
+    }
     ctx->user_data = user_data;
     ctx->request_handlers = 0;
 
+#if defined(USE_LUA) && defined(USE_WEBSOCKET)
+    ctx->shared_lua_websockets = 0;
+#endif
+
     while (options && (name = *options++) != NULL) {
         if ((i = get_option_index(name)) == -1) {
             mg_cry(fc(ctx), "Invalid option: %s", name);
@@ -6374,20 +6796,22 @@ struct mg_context *mg_start(const struct mg_callbacks *callbacks,
         }
         if (ctx->config[i] != NULL) {
             mg_cry(fc(ctx), "warning: %s: duplicate option", name);
-            free(ctx->config[i]);
+            mg_free(ctx->config[i]);
         }
         ctx->config[i] = mg_strdup(value);
         DEBUG_TRACE(("[%s] -> [%s]", name, value));
     }
 
     /* Set default value if needed */
-    for (i = 0; config_options[i * 2] != NULL; i++) {
-        default_value = config_options[i * 2 + 1];
+    for (i = 0; config_options[i].name != NULL; i++) {
+        default_value = config_options[i].default_value;
         if (ctx->config[i] == NULL && default_value != NULL) {
             ctx->config[i] = mg_strdup(default_value);
         }
     }
 
+    get_system_name(&ctx->systemName);
+
     /* NOTE(lsm): order is important here. SSL certificates must
        be initialized before listening ports. UID must be set last. */
     if (!set_gpass_option(ctx) ||
@@ -6424,7 +6848,7 @@ struct mg_context *mg_start(const struct mg_callbacks *callbacks,
 
     if (workerthreadcount > 0) {
         ctx->workerthreadcount = workerthreadcount;
-        ctx->workerthreadids = calloc(workerthreadcount, sizeof(pthread_t));
+        ctx->workerthreadids = mg_calloc(workerthreadcount, sizeof(pthread_t));
         if (ctx->workerthreadids == NULL) {
             mg_cry(fc(ctx), "Not enough memory for worker thread ID array");
             free_context(ctx);
diff --git a/src/client/Client.cc b/src/client/Client.cc
index 49c9c6f..5f750e6 100644
--- a/src/client/Client.cc
+++ b/src/client/Client.cc
@@ -532,7 +532,7 @@ void Client::update_inode_file_bits(Inode *in,
 
       // truncate cached file data
       if (prior_size > size) {
-	_invalidate_inode_cache(in, truncate_size, prior_size - truncate_size, true);
+	_invalidate_inode_cache(in, truncate_size, prior_size - truncate_size);
       }
     }
 
@@ -758,7 +758,7 @@ Dentry *Client::insert_dentry_inode(Dir *dir, const string& dname, LeaseStat *dl
   
   if (!dn || dn->inode == 0) {
     in->get();
-    if (old_dentry)
+    if (old_dentry && old_dentry->dir)
       unlink(old_dentry, dir == old_dentry->dir);  // keep dir open if its the same dir
     dn = link(dir, dname, in, dn);
     put_inode(in);
@@ -1445,31 +1445,31 @@ int Client::encode_inode_release(Inode *in, MetaRequest *req,
 	   << " mds:" << mds << ", drop:" << drop << ", unless:" << unless
 	   << ", have:" << ", force:" << force << ")" << dendl;
   int released = 0;
-  Cap *caps = NULL;
-  if (in->caps.count(mds))
-    caps = in->caps[mds];
-  if (caps &&
-      (drop & caps->issued) &&
-      !(unless & caps->issued)) {
-    ldout(cct, 25) << "Dropping caps. Initial " << ccap_string(caps->issued) << dendl;
-    caps->issued &= ~drop;
-    caps->implemented &= ~drop;
-    released = 1;
-    force = 1;
-    ldout(cct, 25) << "Now have: " << ccap_string(caps->issued) << dendl;
-  }
-  if (force && caps) {
-    ceph_mds_request_release rel;
-    rel.ino = in->ino;
-    rel.cap_id = caps->cap_id;
-    rel.seq = caps->seq;
-    rel.issue_seq = caps->issue_seq;
-    rel.mseq = caps->mseq;
-    rel.caps = caps->issued;
-    rel.wanted = caps->wanted;
-    rel.dname_len = 0;
-    rel.dname_seq = 0;
-    req->cap_releases.push_back(MClientRequest::Release(rel,""));
+  if (in->caps.count(mds)) {
+    Cap *caps = in->caps[mds];
+    drop &= ~(in->dirty_caps | get_caps_used(in));
+    if ((drop & caps->issued) &&
+	!(unless & caps->issued)) {
+      ldout(cct, 25) << "Dropping caps. Initial " << ccap_string(caps->issued) << dendl;
+      caps->issued &= ~drop;
+      caps->implemented &= ~drop;
+      released = 1;
+      force = 1;
+      ldout(cct, 25) << "Now have: " << ccap_string(caps->issued) << dendl;
+    }
+    if (force) {
+      ceph_mds_request_release rel;
+      rel.ino = in->ino;
+      rel.cap_id = caps->cap_id;
+      rel.seq = caps->seq;
+      rel.issue_seq = caps->issue_seq;
+      rel.mseq = caps->mseq;
+      rel.caps = caps->issued;
+      rel.wanted = caps->wanted;
+      rel.dname_len = 0;
+      rel.dname_seq = 0;
+      req->cap_releases.push_back(MClientRequest::Release(rel,""));
+    }
   }
   ldout(cct, 25) << "encode_inode_release exit(in:" << *in << ") released:"
 	   << released << dendl;
@@ -1610,9 +1610,10 @@ void Client::handle_client_session(MClientSession *m)
   case CEPH_SESSION_OPEN:
     renew_caps(session);
     session->state = MetaSession::STATE_OPEN;
-    if (!unmounting) {
+    if (unmounting)
+      mount_cond.Signal();
+    else
       connect_mds_targets(from);
-    }
     signal_context_list(session->waiting_for_open);
     break;
 
@@ -2203,8 +2204,12 @@ Dentry* Client::link(Dir *dir, const string& name, Inode *in, Dentry *dn)
   if (in) {    // link to inode
     dn->inode = in;
     in->get();
-    if (in->dir)
-      dn->get();  // dir -> dn pin
+    if (in->is_dir()) {
+      if (in->dir)
+	dn->get(); // dir -> dn pin
+      if (in->ll_ref)
+	dn->get(); // ll_ref -> dn pin
+    }
 
     assert(in->dn_set.count(dn) == 0);
 
@@ -2231,8 +2236,12 @@ void Client::unlink(Dentry *dn, bool keepdir)
 
   // unlink from inode
   if (in) {
-    if (in->dir)
-      dn->put();        // dir -> dn pin
+    if (in->is_dir()) {
+      if (in->dir)
+	dn->put(); // dir -> dn pin
+      if (in->ll_ref)
+	dn->put(); // ll_ref -> dn pin
+    }
     dn->inode = 0;
     assert(in->dn_set.count(dn));
     in->dn_set.erase(dn);
@@ -2264,41 +2273,57 @@ void Client::get_cap_ref(Inode *in, int cap)
     ldout(cct, 5) << "get_cap_ref got first FILE_BUFFER ref on " << *in << dendl;
     in->get();
   }
+  if ((cap & CEPH_CAP_FILE_CACHE) &&
+      in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) {
+    ldout(cct, 5) << "get_cap_ref got first FILE_CACHE ref on " << *in << dendl;
+    in->get();
+  }
   in->get_cap_ref(cap);
 }
 
 void Client::put_cap_ref(Inode *in, int cap)
 {
-  bool last = in->put_cap_ref(cap);
+  int last = in->put_cap_ref(cap);  // bitmask of the caps in 'cap' whose ref count just dropped to zero
   if (last) {
+    int put_nref = 0;  // inode refs to release in a single put_inode() call below
+    int drop = last & ~in->caps_issued();  // caps that lost their last ref and are not currently issued
     if (in->snapid == CEPH_NOSNAP) {
-      if ((cap & CEPH_CAP_FILE_WR) &&
+      if ((last & CEPH_CAP_FILE_WR) &&
 	  !in->cap_snaps.empty() &&
 	  in->cap_snaps.rbegin()->second->writing) {
 	ldout(cct, 10) << "put_cap_ref finishing pending cap_snap on " << *in << dendl;
 	in->cap_snaps.rbegin()->second->writing = 0;
-	finish_cap_snap(in, in->cap_snaps.rbegin()->second, in->caps_used());
+	finish_cap_snap(in, in->cap_snaps.rbegin()->second, get_caps_used(in));
 	signal_cond_list(in->waitfor_caps);  // wake up blocked sync writers
       }
-      if (cap & CEPH_CAP_FILE_BUFFER) {
+      if (last & CEPH_CAP_FILE_BUFFER) {
 	for (map<snapid_t,CapSnap*>::iterator p = in->cap_snaps.begin();
 	    p != in->cap_snaps.end();
 	    ++p)
 	  p->second->dirty_data = 0;
-	check_caps(in, false);
 	signal_cond_list(in->waitfor_commit);
 	ldout(cct, 5) << "put_cap_ref dropped last FILE_BUFFER ref on " << *in << dendl;
-	put_inode(in);
+	++put_nref;  // pairs with the in->get() taken in get_cap_ref() on the first FILE_BUFFER ref
       }
     }
-    if (cap & CEPH_CAP_FILE_CACHE) {
-      check_caps(in, false);
+    if (last & CEPH_CAP_FILE_CACHE) {
       ldout(cct, 5) << "put_cap_ref dropped last FILE_CACHE ref on " << *in << dendl;
+      ++put_nref;  // pairs with the in->get() taken in get_cap_ref() on the first FILE_CACHE ref
+      // release clean pages too, if we don't want RDCACHE
+      if (!(in->caps_wanted() & CEPH_CAP_FILE_CACHE))
+	drop |= CEPH_CAP_FILE_CACHE;
+    }
+    if (drop) {  // something can be given up: invalidate the cache or just re-check caps
+      if (drop & CEPH_CAP_FILE_CACHE)
+	_invalidate_inode_cache(in);  // schedules the invalidate with keep_caps=false, which runs check_caps()
+      else
+	check_caps(in, false);
     }
+    if (put_nref)
+      put_inode(in, put_nref);  // done last, after every other use of 'in' in this function
   }
 }
 
-
 int Client::get_caps(Inode *in, int need, int want, int *phave, loff_t endoff)
 {
   while (1) {
@@ -2338,6 +2363,14 @@ int Client::get_caps(Inode *in, int need, int want, int *phave, loff_t endoff)
   }
 }
 
+int Client::get_caps_used(Inode *in)  // in->caps_used(), with FILE_CACHE added while the object set is non-empty
+{
+  unsigned used = in->caps_used();
+  if (!(used & CEPH_CAP_FILE_CACHE) &&
+      !objectcacher->set_is_empty(&in->oset))  // set_is_empty() reports in->oset as non-empty
+    used |= CEPH_CAP_FILE_CACHE;  // count FILE_CACHE as used even though caps_used() did not report it
+  return used;  // NOTE(review): 'used' is unsigned but the return type is int
+}
 
 void Client::cap_delay_requeue(Inode *in)
 {
@@ -2386,7 +2419,7 @@ void Client::send_cap(Inode *in, MetaSession *session, Cap *cap,
 				   in->ino,
 				   0,
 				   cap->cap_id, cap->seq,
-				   cap->issued,
+				   cap->implemented,
 				   want,
 				   flush,
 				   cap->mseq);
@@ -2434,10 +2467,10 @@ void Client::send_cap(Inode *in, MetaSession *session, Cap *cap,
 void Client::check_caps(Inode *in, bool is_delayed)
 {
   unsigned wanted = in->caps_wanted();
-  unsigned used = in->caps_used();
+  unsigned used = get_caps_used(in);
   unsigned cap_used;
 
-  int retain = wanted | CEPH_CAP_PIN;
+  int retain = wanted | used | CEPH_CAP_PIN;
   if (!unmounting) {
     if (wanted)
       retain |= CEPH_CAP_ANY;
@@ -2548,7 +2581,7 @@ struct C_SnapFlush : public Context {
 
 void Client::queue_cap_snap(Inode *in, snapid_t seq)
 {
-  int used = in->caps_used();
+  int used = get_caps_used(in);
   int dirty = in->caps_dirty();
   ldout(cct, 10) << "queue_cap_snap " << *in << " seq " << seq << " used " << ccap_string(used) << dendl;
 
@@ -2741,9 +2774,8 @@ void Client::_async_invalidate(Inode *in, int64_t off, int64_t len, bool keep_ca
   ino_invalidate_cb(ino_invalidate_cb_handle, in->vino(), off, len);
 
   client_lock.Lock();
-  if (!keep_caps) {
-    put_cap_ref(in, CEPH_CAP_FILE_CACHE);
-  }
+  if (!keep_caps)
+    check_caps(in, false);
   put_inode(in);
   client_lock.Unlock();
   ldout(cct, 10) << "_async_invalidate " << off << "~" << len << (keep_caps ? " keep_caps" : "") << " done" << dendl;
@@ -2755,11 +2787,10 @@ void Client::_schedule_invalidate_callback(Inode *in, int64_t off, int64_t len,
     // we queue the invalidate, which calls the callback and decrements the ref
     async_ino_invalidator.queue(new C_Client_CacheInvalidate(this, in, off, len, keep_caps));
   else if (!keep_caps)
-    // if not set, we just decrement the cap ref here
-    in->put_cap_ref(CEPH_CAP_FILE_CACHE);
+    check_caps(in, false);
 }
 
-void Client::_invalidate_inode_cache(Inode *in, bool keep_caps)
+void Client::_invalidate_inode_cache(Inode *in)
 {
   ldout(cct, 10) << "_invalidate_inode_cache " << *in << dendl;
 
@@ -2767,10 +2798,10 @@ void Client::_invalidate_inode_cache(Inode *in, bool keep_caps)
   if (cct->_conf->client_oc)
     objectcacher->release_set(&in->oset);
 
-  _schedule_invalidate_callback(in, 0, 0, keep_caps);
+  _schedule_invalidate_callback(in, 0, 0, false);
 }
 
-void Client::_invalidate_inode_cache(Inode *in, int64_t off, int64_t len, bool keep_caps)
+void Client::_invalidate_inode_cache(Inode *in, int64_t off, int64_t len)
 {
   ldout(cct, 10) << "_invalidate_inode_cache " << *in << " " << off << "~" << len << dendl;
 
@@ -2781,14 +2812,14 @@ void Client::_invalidate_inode_cache(Inode *in, int64_t off, int64_t len, bool k
     objectcacher->discard_set(&in->oset, ls);
   }
 
-  _schedule_invalidate_callback(in, off, len, keep_caps);
+  _schedule_invalidate_callback(in, off, len, true);
 }
 
 void Client::_release(Inode *in)
 {
   ldout(cct, 20) << "_release " << *in << dendl;
-  if (in->cap_refs[CEPH_CAP_FILE_CACHE]) {
-    _invalidate_inode_cache(in, false);
+  if (in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) {  // condition inverted by this patch: act only when no FILE_CACHE ref is held
+    _invalidate_inode_cache(in);  // whole-inode overload; it passes keep_caps=false to the invalidate callback
   }
 }
 
@@ -2860,12 +2891,7 @@ void Client::_flushed(Inode *in)
 {
   ldout(cct, 10) << "_flushed " << *in << dendl;
 
-  // release clean pages too, if we dont hold RDCACHE reference
-  if (in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) {
-    _invalidate_inode_cache(in, true);
-  }
-
-  put_cap_ref(in, CEPH_CAP_FILE_BUFFER);
+  put_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER);
 }
 
 
@@ -3047,7 +3073,7 @@ void Client::trim_caps(MetaSession *s, int max)
       int mine = cap->issued | cap->implemented;
       int oissued = in->auth_cap ? in->auth_cap->issued : 0;
       // disposable non-auth cap
-      if (!(in->caps_used() & ~oissued & mine)) {
+      if (!(get_caps_used(in) & ~oissued & mine)) {
 	ldout(cct, 20) << " removing unused, unneeded non-auth cap on " << *in << dendl;
 	remove_cap(cap, true);
 	trimmed++;
@@ -3074,6 +3100,17 @@ void Client::trim_caps(MetaSession *s, int max)
     }
   }
   s->s_cap_iterator = NULL;
+
+  // notify kernel to invalidate top level directory entries. As a side effect,
+  // unused inodes underneath these entries get pruned.
+  if (dentry_invalidate_cb && s->caps.size() > max) {
+    for (ceph::unordered_map<string, Dentry*>::iterator p = root->dir->dentries.begin();
+	 p != root->dir->dentries.end();
+	 ++p) {
+      if (p->second->inode)
+	_schedule_invalidate_dentry_callback(p->second, false);
+    }
+  }
 }
 
 void Client::mark_caps_dirty(Inode *in, int caps)
@@ -3135,10 +3172,8 @@ void Client::flush_caps(Inode *in, MetaSession *session)
   Cap *cap = in->auth_cap;
   assert(cap->session == session);
 
-  int wanted = in->caps_wanted();
-  int retain = wanted | CEPH_CAP_PIN;
-
-  send_cap(in, session, cap, in->caps_used(), wanted, retain, in->flushing_caps);
+  send_cap(in, session, cap, get_caps_used(in), in->caps_wanted(),
+	   (cap->issued | cap->implemented), in->flushing_caps);
 }
 
 void Client::wait_sync_caps(uint64_t want)
@@ -3664,9 +3699,14 @@ private:
   vinodeno_t ino;
   string name;
 public:
-  C_Client_DentryInvalidate(Client *c, Dentry *dn) :
-			    client(c), dirino(dn->dir->parent_inode->vino()),
-			    ino(dn->inode->vino()), name(dn->name) { }
+  C_Client_DentryInvalidate(Client *c, Dentry *dn, bool del) :  // snapshots dirino/ino/name from dn for the later finish() call
+    client(c), name(dn->name) {
+      dirino = dn->dir->parent_inode->vino();
+      if (del)  // del == true: hand the dentry's real inode number to the callback
+	ino = dn->inode->vino();
+      else
+	ino.ino = inodeno_t();  // default inodeno_t; fuse_ll's dentry_invalidate_cb turns this into fino = 0
+  }
   void finish(int r) {
     client->_async_dentry_invalidate(dirino, ino, name);
   }
@@ -3679,10 +3719,10 @@ void Client::_async_dentry_invalidate(vinodeno_t dirino, vinodeno_t ino, string&
   dentry_invalidate_cb(dentry_invalidate_cb_handle, dirino, ino, name);
 }
 
-void Client::_schedule_invalidate_dentry_callback(Dentry *dn)
+void Client::_schedule_invalidate_dentry_callback(Dentry *dn, bool del)
 {
   if (dentry_invalidate_cb && dn->inode->ll_ref > 0)
-    async_dentry_invalidator.queue(new C_Client_DentryInvalidate(this, dn));
+    async_dentry_invalidator.queue(new C_Client_DentryInvalidate(this, dn, del));
 }
 
 void Client::_invalidate_inode_parents(Inode *in)
@@ -3692,7 +3732,7 @@ void Client::_invalidate_inode_parents(Inode *in)
     Dentry *dn = *q++;
     // FIXME: we play lots of unlink/link tricks when handling MDS replies,
     //        so in->dn_set doesn't always reflect the state of kernel's dcache.
-    _schedule_invalidate_dentry_callback(dn);
+    _schedule_invalidate_dentry_callback(dn, true);
     unlink(dn, false);
   }
 }
@@ -3700,7 +3740,7 @@ void Client::_invalidate_inode_parents(Inode *in)
 void Client::handle_cap_grant(MetaSession *session, Inode *in, Cap *cap, MClientCaps *m)
 {
   int mds = session->mds_num;
-  int used = in->caps_used();
+  int used = get_caps_used(in);
   int wanted = in->caps_wanted();
 
   const int old_caps = cap->issued;
@@ -3724,7 +3764,7 @@ void Client::handle_cap_grant(MetaSession *session, Inode *in, Cap *cap, MClient
     in->gid = m->head.gid;
   }
   bool deleted_inode = false;
-  if ((issued & CEPH_CAP_LINK_EXCL) == 0) {
+  if ((issued & CEPH_CAP_LINK_EXCL) == 0 && in->nlink != (int32_t)m->head.nlink) {
     in->nlink = m->head.nlink;
     if (in->nlink == 0 &&
 	(new_caps & (CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL)))
@@ -3764,12 +3804,12 @@ void Client::handle_cap_grant(MetaSession *session, Inode *in, Cap *cap, MClient
     cap->issued = new_caps;
     cap->implemented |= new_caps;
 
-    if ((~cap->issued & old_caps) & CEPH_CAP_FILE_CACHE)
-      _release(in);
     
     if (((used & ~new_caps) & CEPH_CAP_FILE_BUFFER) &&
 	!_flush(in)) {
       // waitin' for flush
+    } else if ((old_caps & ~new_caps) & CEPH_CAP_FILE_CACHE) {
+      _release(in);
     } else {
       cap->wanted = 0; // don't let check_caps skip sending a response to MDS
       check = true;
@@ -3796,7 +3836,7 @@ void Client::handle_cap_grant(MetaSession *session, Inode *in, Cap *cap, MClient
   }
 
   if (check)
-    check_caps(in, true);
+    check_caps(in, false);
 
   // wake up waiters
   if (new_caps)
@@ -5789,7 +5829,13 @@ int Client::_release_fh(Fh *f)
   if (in->snapid == CEPH_NOSNAP) {
     if (in->put_open_ref(f->mode)) {
       _flush(in);
-      check_caps(in, false);
+      // release clean pages too, if we don't want RDCACHE
+      if (in->cap_refs[CEPH_CAP_FILE_CACHE] == 0 &&
+	  !(in->caps_wanted() & CEPH_CAP_FILE_CACHE) &&
+	  !objectcacher->set_is_empty(&in->oset))
+	_invalidate_inode_cache(in);
+      else
+	check_caps(in, false);
     }
   } else {
     assert(in->snap_cap_refs > 0);
@@ -6014,6 +6060,14 @@ int Client::_read(Fh *f, int64_t offset, uint64_t size, bufferlist *bl)
 
   //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;
 
+  bool movepos = false;
+  if (offset < 0) {
+    lock_fh_pos(f);
+    offset = f->pos;
+    movepos = true;
+  }
+  loff_t start_pos = offset;
+
   if (in->inline_version == 0) {
     int r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, -1, -1, true);
     if (r < 0)
@@ -6021,18 +6075,12 @@ int Client::_read(Fh *f, int64_t offset, uint64_t size, bufferlist *bl)
     assert(in->inline_version > 0);
   }
 
+retry:
   int have;
   int r = get_caps(in, CEPH_CAP_FILE_RD, CEPH_CAP_FILE_CACHE, &have, -1);
   if (r < 0)
     return r;
 
-  bool movepos = false;
-  if (offset < 0) {
-    lock_fh_pos(f);
-    offset = f->pos;
-    movepos = true;
-  }
-
   Mutex uninline_flock("Clinet::_read_uninline_data flock");
   Cond uninline_cond;
   bool uninline_done = false;
@@ -6075,25 +6123,33 @@ int Client::_read(Fh *f, int64_t offset, uint64_t size, bufferlist *bl)
       _flush_range(in, offset, size);
     }
     r = _read_async(f, offset, size, bl);
+    if (r < 0)
+      goto done;
   } else {
-    r = _read_sync(f, offset, size, bl);
-  }
-
-  // don't move pointer if the read failed
-  if (r < 0) {
-    goto done;
-  }
+    bool checkeof = false;
+    r = _read_sync(f, offset, size, bl, &checkeof);
+    if (r < 0)
+      goto done;
+    if (checkeof) {
+      offset += r;
+      size -= r;
 
-success:
+      put_cap_ref(in, CEPH_CAP_FILE_RD);
+      have = 0;
+      // reverify size
+      r = _getattr(in, CEPH_STAT_CAP_SIZE);
+      if (r < 0)
+	goto done;
 
-  if (movepos) {
-    // adjust fd pos
-    f->pos = offset+bl->length();
-    unlock_fh_pos(f);
+      // eof?  short read.
+      if ((uint64_t)offset < in->size)
+	goto retry;
+    }
   }
 
+success:
   // adjust readahead state
-  if (f->last_pos != offset) {
+  if (f->last_pos != start_pos) {
     f->nr_consec_read = f->consec_read_bytes = 0;
   } else {
     f->nr_consec_read++;
@@ -6101,9 +6157,15 @@ success:
   f->consec_read_bytes += bl->length();
   ldout(cct, 10) << "readahead nr_consec_read " << f->nr_consec_read
 	   << " for " << f->consec_read_bytes << " bytes" 
-	   << " .. last_pos " << f->last_pos << " .. offset " << offset
-	   << dendl;
-  f->last_pos = offset+bl->length();
+	   << " .. last_pos " << f->last_pos << " .. offset "
+	   << start_pos << dendl;
+
+  f->last_pos = start_pos + bl->length();
+  if (movepos) {
+    // adjust fd pos
+    f->pos = f->last_pos;
+    unlock_fh_pos(f);
+  }
 
 done:
   // done!
@@ -6125,8 +6187,9 @@ done:
       r = uninline_ret;
   }
 
-  put_cap_ref(in, CEPH_CAP_FILE_RD);
-  return r;
+  if (have)
+    put_cap_ref(in, CEPH_CAP_FILE_RD);
+  return r < 0 ? r : bl->length();
 }
 
 int Client::_read_async(Fh *f, uint64_t off, uint64_t len, bufferlist *bl)
@@ -6145,10 +6208,6 @@ int Client::_read_async(Fh *f, uint64_t off, uint64_t len, bufferlist *bl)
     readahead = false;
   }
 
-  // we will populate the cache here
-  if (in->cap_refs[CEPH_CAP_FILE_CACHE] == 0)
-    in->get_cap_ref(CEPH_CAP_FILE_CACHE);
-  
   ldout(cct, 10) << "readahead=" << readahead << " nr_consec=" << f->nr_consec_read
 	   << " max_byes=" << conf->client_readahead_max_bytes
 	   << " max_periods=" << conf->client_readahead_max_periods << dendl;
@@ -6196,7 +6255,7 @@ int Client::_read_async(Fh *f, uint64_t off, uint64_t len, bufferlist *bl)
 					NULL, 0, onfinish);
 	if (r == 0) {
 	  ldout(cct, 20) << "readahead initiated, c " << onfinish << dendl;
-	  in->get();
+	  get_cap_ref(in, CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
 	} else {
 	  ldout(cct, 20) << "readahead was no-op, already cached" << dendl;
 	  delete onfinish;
@@ -6214,12 +6273,14 @@ int Client::_read_async(Fh *f, uint64_t off, uint64_t len, bufferlist *bl)
   r = objectcacher->file_read(&in->oset, &in->layout, in->snapid,
 			      off, len, bl, 0, onfinish);
   if (r == 0) {
+    get_cap_ref(in, CEPH_CAP_FILE_CACHE);
     client_lock.Unlock();
     flock.Lock();
     while (!done)
       cond.Wait(flock);
     flock.Unlock();
     client_lock.Lock();
+    put_cap_ref(in, CEPH_CAP_FILE_CACHE);
     r = rvalue;
   } else {
     // it was cached.
@@ -6228,7 +6289,8 @@ int Client::_read_async(Fh *f, uint64_t off, uint64_t len, bufferlist *bl)
   return r;
 }
 
-int Client::_read_sync(Fh *f, uint64_t off, uint64_t len, bufferlist *bl)
+int Client::_read_sync(Fh *f, uint64_t off, uint64_t len, bufferlist *bl,
+		       bool *checkeof)
 {
   Inode *in = f->inode;
   uint64_t pos = off;
@@ -6287,14 +6349,8 @@ int Client::_read_sync(Fh *f, uint64_t off, uint64_t len, bufferlist *bl)
 	  return read;
       }
 
-      // reverify size
-      r = _getattr(in, CEPH_STAT_CAP_SIZE);
-      if (r < 0)
-	return r;
-
-      // eof?  short read.
-      if (pos >= in->size)
-	return read;
+      *checkeof = true;
+      return read;
     }
   }
   return read;
@@ -6451,7 +6507,7 @@ int Client::_write(Fh *f, int64_t offset, uint64_t size, const char *buf)
   if (cct->_conf->client_oc && (have & CEPH_CAP_FILE_BUFFER)) {
     // do buffered write
     if (!in->oset.dirty_or_tx)
-      get_cap_ref(in, CEPH_CAP_FILE_BUFFER);
+      get_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER);
 
     get_cap_ref(in, CEPH_CAP_FILE_BUFFER);
 
@@ -7015,8 +7071,13 @@ int Client::ll_walk(const char* name, Inode **i, struct stat *attr)
 
 void Client::_ll_get(Inode *in)
 {
-  if (in->ll_ref == 0)
+  if (in->ll_ref == 0) {  // first ll reference: take the inode ref and, for a linked dir, a dentry pin
     in->get();
+    if (in->is_dir() && !in->dn_set.empty()) {  // same "ll_ref -> dn pin" that Client::link() takes for dirs with ll_ref
+      assert(in->dn_set.size() == 1); // dirs can't be hard-linked
+      in->get_first_parent()->get(); // pin dentry
+    }
+  }
   in->ll_get();
   ldout(cct, 20) << "_ll_get " << in << " " << in->ino << " -> " << in->ll_ref << dendl;
 }
@@ -7026,6 +7087,10 @@ int Client::_ll_put(Inode *in, int num)
   in->ll_put(num);
   ldout(cct, 20) << "_ll_put " << in << " " << in->ino << " " << num << " -> " << in->ll_ref << dendl;
   if (in->ll_ref == 0) {
+    if (in->is_dir() && !in->dn_set.empty()) {
+      assert(in->dn_set.size() == 1); // dirs can't be hard-linked
+      in->get_first_parent()->put(); // unpin dentry
+    }
     put_inode(in);
     return 0;
   } else {
@@ -7065,8 +7130,8 @@ bool Client::ll_forget(Inode *in, int count)
     ldout(cct, 1) << "WARNING: ll_forget on " << ino << " " << count
 		  << ", which only has ll_ref=" << in->ll_ref << dendl;
     _ll_put(in, in->ll_ref);
-      last = true;
-    } else {
+    last = true;
+  } else {
     if (_ll_put(in, count) == 0)
       last = true;
   }
@@ -8550,7 +8615,7 @@ int Client::_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
       unsafe_sync_write++;
       get_cap_ref(in, CEPH_CAP_FILE_BUFFER);
 
-      _invalidate_inode_cache(in, offset, length, true);
+      _invalidate_inode_cache(in, offset, length);
       r = filer->zero(in->ino, &in->layout,
                       in->snaprealm->get_snap_context(),
                       offset, length,
diff --git a/src/client/Client.h b/src/client/Client.h
index 458dd4c..4a3d753 100644
--- a/src/client/Client.h
+++ b/src/client/Client.h
@@ -455,6 +455,7 @@ protected:
   void kick_flushing_caps(MetaSession *session);
   void kick_maxsize_requests(MetaSession *session);
   int get_caps(Inode *in, int need, int want, int *have, loff_t endoff);
+  int get_caps_used(Inode *in);
 
   void maybe_update_snaprealm(SnapRealm *realm, snapid_t snap_created, snapid_t snap_highwater, 
 			      vector<snapid_t>& snaps);
@@ -479,13 +480,13 @@ protected:
   void finish_cap_snap(Inode *in, CapSnap *capsnap, int used);
   void _flushed_cap_snap(Inode *in, snapid_t seq);
 
-  void _schedule_invalidate_dentry_callback(Dentry *dn);
+  void _schedule_invalidate_dentry_callback(Dentry *dn, bool del);
   void _async_dentry_invalidate(vinodeno_t dirino, vinodeno_t ino, string& name);
   void _invalidate_inode_parents(Inode *in);
 
   void _schedule_invalidate_callback(Inode *in, int64_t off, int64_t len, bool keep_caps);
-  void _invalidate_inode_cache(Inode *in, bool keep_caps);
-  void _invalidate_inode_cache(Inode *in, int64_t off, int64_t len, bool keep_caps);
+  void _invalidate_inode_cache(Inode *in);
+  void _invalidate_inode_cache(Inode *in, int64_t off, int64_t len);
   void _async_invalidate(Inode *in, int64_t off, int64_t len, bool keep_caps);
   void _release(Inode *in);
   
@@ -565,11 +566,11 @@ private:
 	inode(i) { }
     void finish(int r) {
       lsubdout(client->cct, client, 20) << "C_Readahead on " << inode << dendl;
-      client->put_inode(inode, 1);
+      client->put_cap_ref(inode, CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
     }
   };
 
-  int _read_sync(Fh *f, uint64_t off, uint64_t len, bufferlist *bl);
+  int _read_sync(Fh *f, uint64_t off, uint64_t len, bufferlist *bl, bool *checkeof);
   int _read_async(Fh *f, uint64_t off, uint64_t len, bufferlist *bl);
 
   // internal interface
diff --git a/src/client/Inode.cc b/src/client/Inode.cc
index 0d9d9d9..2f6389d 100644
--- a/src/client/Inode.cc
+++ b/src/client/Inode.cc
@@ -14,7 +14,7 @@ ostream& operator<<(ostream &out, Inode &in)
       << " cap_refs=" << in.cap_refs
       << " open=" << in.open_by_mode
       << " mode=" << oct << in.mode << dec
-      << " size=" << in.size
+      << " size=" << in.size << "/" << in.max_size
       << " mtime=" << in.mtime
       << " caps=" << ccap_string(in.caps_issued());
   if (!in.caps.empty()) {
@@ -111,12 +111,12 @@ void Inode::get_cap_ref(int cap)
   }
 }
 
-bool Inode::put_cap_ref(int cap)
+int Inode::put_cap_ref(int cap)
 {
   // if cap is always a single bit (which it seems to be)
   // all this logic is equivalent to:
   // if (--cap_refs[c]) return false; else return true;
-  bool last = false;
+  int last = 0;
   int n = 0;
   while (cap) {
     if (cap & 1) {
@@ -126,7 +126,7 @@ bool Inode::put_cap_ref(int cap)
 	assert(cap_refs[c] > 0);
       }
       if (--cap_refs[c] == 0)
-        last = true;
+        last |= c;
       //cout << "inode " << *this << " put " << cap_string(c) << " " << (cap_refs[c]+1) << " -> " << cap_refs[c] << std::endl;
     }
     cap >>= 1;
diff --git a/src/client/Inode.h b/src/client/Inode.h
index d8e001f..221a91a 100644
--- a/src/client/Inode.h
+++ b/src/client/Inode.h
@@ -251,7 +251,7 @@ class Inode {
   bool put_open_ref(int mode);
 
   void get_cap_ref(int cap);
-  bool put_cap_ref(int cap);
+  int put_cap_ref(int cap);
   bool is_any_caps();
   bool cap_is_valid(Cap* cap);
   int caps_issued(int *implemented = 0);
diff --git a/src/client/SyntheticClient.cc b/src/client/SyntheticClient.cc
index 44dd2a8..cd0f03e 100644
--- a/src/client/SyntheticClient.cc
+++ b/src/client/SyntheticClient.cc
@@ -33,6 +33,7 @@ using namespace std;
 #include <math.h>
 #include <sys/statvfs.h>
 
+#include "common/errno.h"
 #include "include/assert.h"
 
 #define dout_subsys ceph_subsys_client
@@ -318,16 +319,14 @@ int SyntheticClient::run()
   dout(15) << "initing" << dendl;
   int err = client->init();
   if (err < 0) {
-    char buf[80];
-    dout(0) << "failed to initialize: " << strerror_r(-err, buf, sizeof(buf)) << dendl;
+    dout(0) << "failed to initialize: " << cpp_strerror(err) << dendl;
     return -1;
   }
 
   dout(15) << "mounting" << dendl;
   err = client->mount("");
   if (err < 0) {
-    char buf[80];
-    dout(0) << "failed to mount: " << strerror_r(-err, buf, sizeof(buf)) << dendl;
+    dout(0) << "failed to mount: " << cpp_strerror(err) << dendl;
     client->shutdown();
     return -1;
   }
diff --git a/src/client/fuse_ll.cc b/src/client/fuse_ll.cc
index 14cb6d4..7f419c3 100644
--- a/src/client/fuse_ll.cc
+++ b/src/client/fuse_ll.cc
@@ -680,7 +680,9 @@ static void dentry_invalidate_cb(void *handle, vinodeno_t dirino,
   CephFuse::Handle *cfuse = (CephFuse::Handle *)handle;
   fuse_ino_t fdirino = cfuse->make_fake_ino(dirino.ino, dirino.snapid);
 #if FUSE_VERSION >= FUSE_MAKE_VERSION(2, 9)
-  fuse_ino_t fino = cfuse->make_fake_ino(ino.ino, ino.snapid);
+  fuse_ino_t fino = 0;
+  if (ino.ino != inodeno_t())
+    fino = cfuse->make_fake_ino(ino.ino, ino.snapid);
   fuse_lowlevel_notify_delete(cfuse->ch, fdirino, fino, name.c_str(), name.length());
 #elif FUSE_VERSION >= FUSE_MAKE_VERSION(2, 8)
   fuse_lowlevel_notify_inval_entry(cfuse->ch, fdirino, name.c_str(), name.length());
diff --git a/src/cls/lock/cls_lock.cc b/src/cls/lock/cls_lock.cc
index 5f27c3c..b4772e0 100644
--- a/src/cls/lock/cls_lock.cc
+++ b/src/cls/lock/cls_lock.cc
@@ -21,6 +21,7 @@
 #include "include/utime.h"
 #include "objclass/objclass.h"
 
+#include "common/errno.h"
 #include "common/Clock.h"
 
 #include "cls/lock/cls_lock_types.h"
@@ -175,7 +176,7 @@ static int lock_obj(cls_method_context_t hctx,
   // see if there's already a locker
   int r = read_lock(hctx, name, &linfo);
   if (r < 0 && r != -ENOENT) {
-    CLS_ERR("Could not read lock info: %s", strerror(r));
+    CLS_ERR("Could not read lock info: %s", cpp_strerror(r).c_str());
     return r;
   }
   map<locker_id_t, locker_info_t>& lockers = linfo.lockers;
@@ -282,7 +283,7 @@ static int remove_lock(cls_method_context_t hctx,
   lock_info_t linfo;
   int r = read_lock(hctx, name, &linfo);
   if (r < 0) {
-    CLS_ERR("Could not read list of current lockers off disk: %s", strerror(r));
+    CLS_ERR("Could not read list of current lockers off disk: %s", cpp_strerror(r).c_str());
     return r;
   }
 
@@ -381,7 +382,7 @@ static int get_info(cls_method_context_t hctx, bufferlist *in, bufferlist *out)
   lock_info_t linfo;
   int r = read_lock(hctx, op.name, &linfo);
   if (r < 0) {
-    CLS_ERR("Could not read lock info: %s", strerror(r));
+    CLS_ERR("Could not read lock info: %s", cpp_strerror(r).c_str());
     return r;
   }
 
diff --git a/src/cls/rbd/cls_rbd.cc b/src/cls/rbd/cls_rbd.cc
index 9348d5d..709036c 100644
--- a/src/cls/rbd/cls_rbd.cc
+++ b/src/cls/rbd/cls_rbd.cc
@@ -37,6 +37,7 @@
 #include <sstream>
 #include <vector>
 
+#include "common/errno.h"
 #include "objclass/objclass.h"
 #include "include/rbd_types.h"
 
@@ -115,7 +116,11 @@ static int snap_read_header(cls_method_context_t hctx, bufferlist& bl)
     if (rc < 0)
       return rc;
 
+    if (bl.length() < sizeof(*header))
+      return -EINVAL;
+
     header = (struct rbd_obj_header_ondisk *)bl.c_str();
+    assert(header);
 
     if ((snap_count != header->snap_count) ||
         (snap_names_len != header->snap_names_len)) {
@@ -290,7 +295,7 @@ int get_features(cls_method_context_t hctx, bufferlist *in, bufferlist *out)
   if (snap_id == CEPH_NOSNAP) {
     int r = read_key(hctx, "features", &features);
     if (r < 0) {
-      CLS_ERR("failed to read features off disk: %s", strerror(r));
+      CLS_ERR("failed to read features off disk: %s", cpp_strerror(r).c_str());
       return r;
     }
   } else {
@@ -359,14 +364,14 @@ int get_size(cls_method_context_t hctx, bufferlist *in, bufferlist *out)
 
   int r = read_key(hctx, "order", &order);
   if (r < 0) {
-    CLS_ERR("failed to read the order off of disk: %s", strerror(r));
+    CLS_ERR("failed to read the order off of disk: %s", cpp_strerror(r).c_str());
     return r;
   }
 
   if (snap_id == CEPH_NOSNAP) {
     r = read_key(hctx, "size", &size);
     if (r < 0) {
-      CLS_ERR("failed to read the image's size off of disk: %s", strerror(r));
+      CLS_ERR("failed to read the image's size off of disk: %s", cpp_strerror(r).c_str());
       return r;
     }
   } else {
@@ -409,7 +414,7 @@ int set_size(cls_method_context_t hctx, bufferlist *in, bufferlist *out)
   uint64_t orig_size;
   int r = read_key(hctx, "size", &orig_size);
   if (r < 0) {
-    CLS_ERR("Could not read image's size off disk: %s", strerror(r));
+    CLS_ERR("Could not read image's size off disk: %s", cpp_strerror(r).c_str());
     return r;
   }
 
@@ -615,7 +620,7 @@ int get_stripe_unit_count(cls_method_context_t hctx, bufferlist *in, bufferlist
     uint8_t order;
     r = read_key(hctx, "order", &order);
     if (r < 0) {
-      CLS_ERR("failed to read the order off of disk: %s", strerror(r));
+      CLS_ERR("failed to read the order off of disk: %s", cpp_strerror(r).c_str());
       return -EIO;
     }
     stripe_unit = 1ull << order;
@@ -674,7 +679,7 @@ int set_stripe_unit_count(cls_method_context_t hctx, bufferlist *in, bufferlist
   uint8_t order;
   r = read_key(hctx, "order", &order);
   if (r < 0) {
-    CLS_ERR("failed to read the order off of disk: %s", strerror(r));
+    CLS_ERR("failed to read the order off of disk: %s", cpp_strerror(r).c_str());
     return r;
   }
   if ((1ull << order) % stripe_unit || stripe_unit > (1ull << order)) {
@@ -1121,7 +1126,7 @@ int get_snapcontext(cls_method_context_t hctx, bufferlist *in, bufferlist *out)
   uint64_t snap_seq;
   r = read_key(hctx, "snap_seq", &snap_seq);
   if (r < 0) {
-    CLS_ERR("could not read the image's snap_seq off disk: %s", strerror(r));
+    CLS_ERR("could not read the image's snap_seq off disk: %s", cpp_strerror(r).c_str());
     return r;
   }
 
@@ -1147,7 +1152,7 @@ int get_object_prefix(cls_method_context_t hctx, bufferlist *in, bufferlist *out
   int r = read_key(hctx, "object_prefix", &object_prefix);
   if (r < 0) {
     CLS_ERR("failed to read the image's object prefix off of disk: %s",
-            strerror(r));
+            cpp_strerror(r).c_str());
     return r;
   }
 
@@ -1218,7 +1223,7 @@ int snapshot_add(cls_method_context_t hctx, bufferlist *in, bufferlist *out)
   uint64_t cur_snap_seq;
   int r = read_key(hctx, "snap_seq", &cur_snap_seq);
   if (r < 0) {
-    CLS_ERR("Could not read image's snap_seq off disk: %s", strerror(r));
+    CLS_ERR("Could not read image's snap_seq off disk: %s", cpp_strerror(r).c_str());
     return r;
   }
 
@@ -1229,12 +1234,12 @@ int snapshot_add(cls_method_context_t hctx, bufferlist *in, bufferlist *out)
 
   r = read_key(hctx, "size", &snap_meta.image_size);
   if (r < 0) {
-    CLS_ERR("Could not read image's size off disk: %s", strerror(r));
+    CLS_ERR("Could not read image's size off disk: %s", cpp_strerror(r).c_str());
     return r;
   }
   r = read_key(hctx, "features", &snap_meta.features);
   if (r < 0) {
-    CLS_ERR("Could not read image's features off disk: %s", strerror(r));
+    CLS_ERR("Could not read image's features off disk: %s", cpp_strerror(r).c_str());
     return r;
   }
 
diff --git a/src/common/Preforker.h b/src/common/Preforker.h
index 00a4d6a..c28fd13 100644
--- a/src/common/Preforker.h
+++ b/src/common/Preforker.h
@@ -9,6 +9,7 @@
 #include <errno.h>
 #include <unistd.h>
 #include "common/safe_io.h"
+#include "common/errno.h"
 
 /**
  * pre-fork fork/daemonize helper class
@@ -68,7 +69,7 @@ public:
       ::close(2);
       r = 0;
     } else if (err) {
-      cerr << "[" << getpid() << "]: " << cpp_strerror(-err) << std::endl;
+      cerr << "[" << getpid() << "]: " << cpp_strerror(err) << std::endl;
     } else {
       // wait for child to exit
       waitpid(childpid, NULL, 0);
diff --git a/src/common/RWLock.h b/src/common/RWLock.h
index 6fabc8c..f901ac0 100644
--- a/src/common/RWLock.h
+++ b/src/common/RWLock.h
@@ -24,7 +24,7 @@ class RWLock
 {
   mutable pthread_rwlock_t L;
   const char *name;
-  int id;
+  mutable int id;
 
 public:
   RWLock(const RWLock& other);
@@ -40,25 +40,25 @@ public:
     pthread_rwlock_destroy(&L);
   }
 
-  void unlock() {
+  void unlock() const {
     if (g_lockdep) id = lockdep_will_unlock(name, id);
     pthread_rwlock_unlock(&L);
   }
 
   // read
-  void get_read() {
+  void get_read() const {
     if (g_lockdep) id = lockdep_will_lock(name, id);
     pthread_rwlock_rdlock(&L);
     if (g_lockdep) id = lockdep_locked(name, id);
   }
-  bool try_get_read() {
+  bool try_get_read() const {
     if (pthread_rwlock_tryrdlock(&L) == 0) {
       if (g_lockdep) id = lockdep_locked(name, id);
       return true;
     }
     return false;
   }
-  void put_read() {
+  void put_read() const {
     unlock();
   }
 
@@ -81,10 +81,10 @@ public:
 
 public:
   class RLocker {
-    RWLock &m_lock;
+    const RWLock &m_lock;
 
   public:
-    RLocker(RWLock& lock) : m_lock(lock) {
+    RLocker(const RWLock& lock) : m_lock(lock) {
       m_lock.get_read();
     }
     ~RLocker() {
diff --git a/src/common/buffer.cc b/src/common/buffer.cc
index 71f665d..35c5d36 100644
--- a/src/common/buffer.cc
+++ b/src/common/buffer.cc
@@ -20,9 +20,9 @@
 #include "common/simple_spin.h"
 #include "common/strtol.h"
 #include "include/atomic.h"
+#include "common/Mutex.h"
 #include "include/types.h"
 #include "include/compat.h"
-#include "include/Spinlock.h"
 
 #include <errno.h>
 #include <fstream>
@@ -123,12 +123,16 @@ static uint32_t simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZE
     unsigned len;
     atomic_t nref;
 
-    Spinlock crc_lock;
+    mutable Mutex crc_lock;
     map<pair<size_t, size_t>, pair<uint32_t, uint32_t> > crc_map;
 
-    raw(unsigned l) : data(NULL), len(l), nref(0)
+    raw(unsigned l)
+      : data(NULL), len(l), nref(0),
+	crc_lock("buffer::raw::crc_lock", false, false)
     { }
-    raw(char *c, unsigned l) : data(c), len(l), nref(0)
+    raw(char *c, unsigned l)
+      : data(c), len(l), nref(0),
+	crc_lock("buffer::raw::crc_lock", false, false)
     { }
     virtual ~raw() {};
 
@@ -159,7 +163,7 @@ static uint32_t simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZE
     }
     bool get_crc(const pair<size_t, size_t> &fromto,
 		 pair<uint32_t, uint32_t> *crc) const {
-      Spinlock::Locker l(crc_lock);
+      Mutex::Locker l(crc_lock);
       map<pair<size_t, size_t>, pair<uint32_t, uint32_t> >::const_iterator i =
 	crc_map.find(fromto);
       if (i == crc_map.end())
@@ -169,11 +173,11 @@ static uint32_t simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZE
     }
     void set_crc(const pair<size_t, size_t> &fromto,
 		 const pair<uint32_t, uint32_t> &crc) {
-      Spinlock::Locker l(crc_lock);
+      Mutex::Locker l(crc_lock);
       crc_map[fromto] = crc;
     }
     void invalidate_crc() {
-      Spinlock::Locker l(crc_lock);
+      Mutex::Locker l(crc_lock);
       crc_map.clear();
     }
   };
diff --git a/src/common/common_init.cc b/src/common/common_init.cc
index 6538b18..1f97e29 100644
--- a/src/common/common_init.cc
+++ b/src/common/common_init.cc
@@ -121,7 +121,6 @@ void common_init_finish(CephContext *cct, int flags)
 
   if (cct->_conf->lockdep) {
     g_lockdep = true;
-    ldout(cct,0) << "lockdep is enabled" << dendl;
     lockdep_register_ceph_context(cct);
   }
 }
diff --git a/src/common/config.cc b/src/common/config.cc
index 4b85b6b..0ee7f58 100644
--- a/src/common/config.cc
+++ b/src/common/config.cc
@@ -25,6 +25,7 @@
 #include "include/stringify.h"
 #include "msg/msg_types.h"
 #include "osd/osd_types.h"
+#include "common/errno.h"
 
 #include "include/assert.h"
 
@@ -439,7 +440,7 @@ int md_config_t::parse_argv(std::vector<const char*>& args)
 	  show_config_value_arg << "': option not found" << std::endl;
       else
 	std::cerr << "failed to get config option '" <<
-	  show_config_value_arg << "': " << strerror(-r) << std::endl;
+	  show_config_value_arg << "': " << cpp_strerror(r) << std::endl;
       _exit(1);
     }
     string s = buf;
diff --git a/src/common/config_opts.h b/src/common/config_opts.h
index ff674cf..a065a77 100644
--- a/src/common/config_opts.h
+++ b/src/common/config_opts.h
@@ -176,6 +176,7 @@ OPTION(mon_osd_report_timeout, OPT_INT, 900)    // grace period before declaring
 OPTION(mon_force_standby_active, OPT_BOOL, true) // should mons force standby-replay mds to be active
 OPTION(mon_warn_on_old_mons, OPT_BOOL, true) // should mons set health to WARN if part of quorum is old?
 OPTION(mon_warn_on_legacy_crush_tunables, OPT_BOOL, true) // warn if crush tunables are not optimal
+OPTION(mon_warn_on_osd_down_out_interval_zero, OPT_BOOL, true) // warn if 'mon_osd_down_out_interval == 0'
 OPTION(mon_min_osdmap_epochs, OPT_INT, 500)
 OPTION(mon_max_pgmap_epochs, OPT_INT, 500)
 OPTION(mon_max_log_epochs, OPT_INT, 500)
@@ -219,6 +220,7 @@ OPTION(mon_leveldb_compression, OPT_BOOL, false) // monitor's leveldb uses compr
 OPTION(mon_leveldb_paranoid, OPT_BOOL, false)   // monitor's leveldb paranoid flag
 OPTION(mon_leveldb_log, OPT_STR, "")
 OPTION(mon_leveldb_size_warn, OPT_U64, 40*1024*1024*1024) // issue a warning when the monitor's leveldb goes over 40GB (in bytes)
+OPTION(mon_force_quorum_join, OPT_BOOL, false) // force monitor to join quorum even if it has been previously removed from the map
 OPTION(paxos_stash_full_interval, OPT_INT, 25)   // how often (in commits) to stash a full copy of the PaxosService state
 OPTION(paxos_max_join_drift, OPT_INT, 10) // max paxos iterations before we must first sync the monitor stores
 OPTION(paxos_propose_interval, OPT_DOUBLE, 1.0)  // gather updates for this long before proposing a map update
diff --git a/src/common/errno.cc b/src/common/errno.cc
index a981ab7..da06991 100644
--- a/src/common/errno.cc
+++ b/src/common/errno.cc
@@ -1,17 +1,30 @@
 #include "common/errno.h"
+#include "acconfig.h"
 
 #include <sstream>
 #include <string>
+
 #include <string.h>
 
 std::string cpp_strerror(int err)
 {
   char buf[128];
+  char *errmsg;
 
   if (err < 0)
     err = -err;
   std::ostringstream oss;
-  oss << "(" << err << ") " << strerror_r(err, buf, sizeof(buf));
+  buf[0] = '\0';
+
+  // strerror_r returns char * on Linux, and does not always fill buf
+#ifdef STRERROR_R_CHAR_P
+  errmsg = strerror_r(err, buf, sizeof(buf));
+#else
+  strerror_r(err, buf, sizeof(buf));
+  errmsg = buf;
+#endif
+
+  oss << "(" << err << ") " << errmsg;
 
   return oss.str();
 }
diff --git a/src/common/obj_bencher.cc b/src/common/obj_bencher.cc
index 7067599..9098918 100644
--- a/src/common/obj_bencher.cc
+++ b/src/common/obj_bencher.cc
@@ -51,20 +51,6 @@ static std::string generate_object_name(int objnum, int pid = 0)
   return oss.str();
 }
 
-static std::string generate_metadata_name(int pid = 0)
-{
-  if (!pid)
-    pid = getpid();
-
-  char hostname[30];
-  gethostname(hostname, sizeof(hostname)-1);
-  hostname[sizeof(hostname)-1] = 0;
-
-  std::ostringstream oss;
-  oss << BENCH_PREFIX << "_" << hostname << "_" << pid << "_metadata";
-  return oss.str();
-}
-
 static void sanitize_object_contents (bench_data *data, int length) {
   memset(data->object_contents, 'z', length);
 }
@@ -167,15 +153,18 @@ void *ObjBencher::status_printer(void *_bencher) {
 int ObjBencher::aio_bench(
   int operation, int secondsToRun,
   int maxObjectsToCreate,
-  int concurrentios, int op_size, bool cleanup) {
+  int concurrentios, int op_size, bool cleanup, const char* run_name) {
   int object_size = op_size;
   int num_objects = 0;
   int r = 0;
   int prevPid = 0;
 
+  // default metadata object is used if user does not specify one
+  const std::string run_name_meta = (run_name == NULL ? BENCH_LASTRUN_METADATA : std::string(run_name));
+
   //get data from previous write run, if available
   if (operation != OP_WRITE) {
-    r = fetch_bench_metadata(BENCH_LASTRUN_METADATA, &object_size, &num_objects, &prevPid);
+    r = fetch_bench_metadata(run_name_meta, &object_size, &num_objects, &prevPid);
     if (r < 0) {
       if (r == -ENOENT)
 	cerr << "Must write data before running a read benchmark!" << std::endl;
@@ -205,7 +194,7 @@ int ObjBencher::aio_bench(
   sanitize_object_contents(&data, data.object_size);
 
   if (OP_WRITE == operation) {
-    r = write_bench(secondsToRun, maxObjectsToCreate, concurrentios);
+    r = write_bench(secondsToRun, maxObjectsToCreate, concurrentios, run_name_meta);
     if (r != 0) goto out;
   }
   else if (OP_SEQ_READ == operation) {
@@ -218,7 +207,7 @@ int ObjBencher::aio_bench(
   }
 
   if (OP_WRITE == operation && cleanup) {
-    r = fetch_bench_metadata(BENCH_LASTRUN_METADATA, &object_size, &num_objects, &prevPid);
+    r = fetch_bench_metadata(run_name_meta, &object_size, &num_objects, &prevPid);
     if (r < 0) {
       if (r == -ENOENT)
 	cerr << "Should never happen: bench metadata missing for current run!" << std::endl;
@@ -229,11 +218,8 @@ int ObjBencher::aio_bench(
     if (r != 0) goto out;
 
     // lastrun file
-    r = sync_remove(BENCH_LASTRUN_METADATA);
+    r = sync_remove(run_name_meta);
     if (r != 0) goto out;
-
-    // prefix-based file
-    r = sync_remove(generate_metadata_name());
   }
 
  out:
@@ -299,7 +285,7 @@ int ObjBencher::fetch_bench_metadata(const std::string& metadata_file, int* obje
 }
 
 int ObjBencher::write_bench(int secondsToRun, int maxObjectsToCreate,
-			    int concurrentios) {
+			    int concurrentios, const string& run_name_meta) {
   if (maxObjectsToCreate > 0 && concurrentios > maxObjectsToCreate)
     concurrentios = maxObjectsToCreate;
   out(cout) << "Maintaining " << concurrentios << " concurrent writes of "
@@ -480,11 +466,8 @@ int ObjBencher::write_bench(int secondsToRun, int maxObjectsToCreate,
   ::encode(data.finished, b_write);
   ::encode(getpid(), b_write);
 
-  // lastrun file
-  sync_write(BENCH_LASTRUN_METADATA, b_write, sizeof(int)*3);
-
-  // PID-specific run
-  sync_write(generate_metadata_name(), b_write, sizeof(int)*3);
+  // persist meta-data for further cleanup or read
+  sync_write(run_name_meta, b_write, sizeof(int)*3);
 
   completions_done();
 
@@ -584,7 +567,7 @@ int ObjBencher::seq_read_bench(int seconds_to_run, int num_objects, int concurre
     completion_wait(slot);
     lock.Lock();
     r = completion_ret(slot);
-    if (r != 0) {
+    if (r < 0) {
       cerr << "read got " << r << std::endl;
       lock.Unlock();
       goto ERR;
@@ -627,7 +610,7 @@ int ObjBencher::seq_read_bench(int seconds_to_run, int num_objects, int concurre
     completion_wait(slot);
     lock.Lock();
     r = completion_ret(slot);
-    if (r != 0) {
+    if (r < 0) {
       cerr << "read got " << r << std::endl;
       lock.Unlock();
       goto ERR;
@@ -771,7 +754,7 @@ int ObjBencher::rand_read_bench(int seconds_to_run, int num_objects, int concurr
     completion_wait(slot);
     lock.Lock();
     r = completion_ret(slot);
-    if (r != 0) {
+    if (r < 0) {
       cerr << "read got " << r << std::endl;
       lock.Unlock();
       goto ERR;
@@ -814,7 +797,7 @@ int ObjBencher::rand_read_bench(int seconds_to_run, int num_objects, int concurr
     completion_wait(slot);
     lock.Lock();
     r = completion_ret(slot);
-    if (r != 0) {
+    if (r < 0) {
       cerr << "read got " << r << std::endl;
       lock.Unlock();
       goto ERR;
@@ -869,19 +852,19 @@ int ObjBencher::rand_read_bench(int seconds_to_run, int num_objects, int concurr
   return -5;
 }
 
-int ObjBencher::clean_up(const std::string& prefix, int concurrentios) {
+int ObjBencher::clean_up(const char* prefix, int concurrentios, const char* run_name) {
   int r = 0;
   int object_size;
   int num_objects;
   int prevPid;
 
-  std::string metadata_name = prefix;
-  metadata_name.append("_metadata");
+  // default meta object if user does not specify one
+  const std::string run_name_meta = (run_name == NULL ? BENCH_LASTRUN_METADATA : std::string(run_name));
 
-  r = fetch_bench_metadata(metadata_name, &object_size, &num_objects, &prevPid);
+  r = fetch_bench_metadata(run_name_meta, &object_size, &num_objects, &prevPid);
   if (r < 0) {
     // if the metadata file is not found we should try to do a linear search on the prefix
-    if (r == -ENOENT) {
+    if (r == -ENOENT && prefix != NULL) {
       return clean_up_slow(prefix, concurrentios);
     }
     else {
@@ -892,7 +875,7 @@ int ObjBencher::clean_up(const std::string& prefix, int concurrentios) {
   r = clean_up(num_objects, prevPid, concurrentios);
   if (r != 0) return r;
 
-  r = sync_remove(metadata_name);
+  r = sync_remove(run_name_meta);
   if (r != 0) return r;
 
   return 0;
diff --git a/src/common/obj_bencher.h b/src/common/obj_bencher.h
index b87821a..216e265 100644
--- a/src/common/obj_bencher.h
+++ b/src/common/obj_bencher.h
@@ -63,7 +63,7 @@ protected:
 
   int fetch_bench_metadata(const std::string& metadata_file, int* object_size, int* num_objects, int* prevPid);
 
-  int write_bench(int secondsToRun, int maxObjects, int concurrentios);
+  int write_bench(int secondsToRun, int maxObjects, int concurrentios, const string& run_name_meta);
   int seq_read_bench(int secondsToRun, int concurrentios, int num_objects, int writePid);
   int rand_read_bench(int secondsToRun, int num_objects, int concurrentios, int writePid);
 
@@ -97,8 +97,8 @@ public:
   virtual ~ObjBencher() {}
   int aio_bench(
     int operation, int secondsToRun, int maxObjectsToCreate,
-    int concurrentios, int op_size, bool cleanup);
-  int clean_up(const std::string& prefix, int concurrentios);
+    int concurrentios, int op_size, bool cleanup, const char* run_name);
+  int clean_up(const char* prefix, int concurrentios, const char* run_name);
 
   void set_show_time(bool dt) {
     show_time = dt;
diff --git a/src/crush/CrushCompiler.cc b/src/crush/CrushCompiler.cc
index d75dddf..b52a55a 100644
--- a/src/crush/CrushCompiler.cc
+++ b/src/crush/CrushCompiler.cc
@@ -14,6 +14,7 @@
 #include <cctype>
 
 #include <typeinfo>
+#include "common/errno.h"
 
 // -------------
 
@@ -374,7 +375,7 @@ int CrushCompiler::parse_tunable(iter_t const& i)
 
   /*
 
-    current crop of tunables are all now "safe".  reenable this when we
+    current crop of tunables are all now "safe".  re-enable this when we
     add new ones that are ... new.
 
   if (!unsafe_tunables) {
@@ -563,7 +564,7 @@ int CrushCompiler::parse_bucket(iter_t const& i)
     if (r == -EEXIST)
       err << "Duplicate bucket id " << id << std::endl;
     else
-      err << "add_bucket failed " << strerror(-r) << std::endl;
+      err << "add_bucket failed " << cpp_strerror(r) << std::endl;
     return r;
   }
   r = crush.set_item_name(id, name.c_str());
diff --git a/src/crush/CrushWrapper.cc b/src/crush/CrushWrapper.cc
index 61b8a8a..b5ecbc6 100644
--- a/src/crush/CrushWrapper.cc
+++ b/src/crush/CrushWrapper.cc
@@ -1,6 +1,7 @@
 
 #include "common/debug.h"
 #include "common/Formatter.h"
+#include "common/errno.h"
 
 #include "CrushWrapper.h"
 
@@ -453,8 +454,7 @@ int CrushWrapper::insert_item(CephContext *cct, int item, float weight, string n
       int empty = 0, newid;
       int r = add_bucket(0, CRUSH_BUCKET_STRAW, CRUSH_HASH_DEFAULT, p->first, 1, &cur, &empty, &newid);
       if (r < 0) {
-        char buf[128]; 
-        ldout(cct, 1) << "add_bucket failure error: " << strerror_r(-r, buf, sizeof(buf)) << dendl;
+        ldout(cct, 1) << "add_bucket failure error: " << cpp_strerror(r) << dendl;
         return r;
       }
       set_item_name(newid, q->second);
diff --git a/src/erasure-code/ErasureCodePlugin.cc b/src/erasure-code/ErasureCodePlugin.cc
index 6349441..da075d2 100644
--- a/src/erasure-code/ErasureCodePlugin.cc
+++ b/src/erasure-code/ErasureCodePlugin.cc
@@ -18,6 +18,7 @@
 #include <dlfcn.h>
 
 #include "ErasureCodePlugin.h"
+#include "common/errno.h"
 
 #define PLUGIN_PREFIX "libec_"
 #define PLUGIN_SUFFIX ".so"
@@ -107,7 +108,7 @@ int ErasureCodePluginRegistry::load(const std::string &plugin_name,
     int r = erasure_code_init(name.c_str());
     if (r != 0) {
       ss << "erasure_code_init(" << plugin_name
-	 << "): " << strerror(-r);
+	 << "): " << cpp_strerror(r);
       dlclose(library);
       return r;
     }
diff --git a/src/erasure-code/jerasure/ErasureCodePluginJerasure.cc b/src/erasure-code/jerasure/ErasureCodePluginJerasure.cc
index b5da3c0..3d5d3e6 100644
--- a/src/erasure-code/jerasure/ErasureCodePluginJerasure.cc
+++ b/src/erasure-code/jerasure/ErasureCodePluginJerasure.cc
@@ -63,8 +63,27 @@ public:
   }
 };
 
+extern "C" {
+#include "galois.h"
+
+extern gf_t *gfp_array[];
+extern int  gfp_is_composite[];
+}
+
 int __erasure_code_init(char *plugin_name)
 {
   ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance();
+  int w[] = { 4, 8, 16, 32 };
+  for(int i = 0; i < 4; i++) {
+    if (gfp_array[w[i]] == NULL) {
+      gfp_array[w[i]] = (gf_t*)malloc(sizeof(gf_t));
+      assert(gfp_array[w[i]]);
+      gfp_is_composite[w[i]] = 0;
+      if (!gf_init_easy(gfp_array[w[i]], w[i])) {
+	derr << "failed to gf_init_easy(" << w[i] << ")" << dendl;
+	return -EINVAL;
+      }
+    }
+  }
   return instance.add(plugin_name, new ErasureCodePluginJerasure());
 }
diff --git a/src/include/encoding.h b/src/include/encoding.h
index d097482..434d158 100644
--- a/src/include/encoding.h
+++ b/src/include/encoding.h
@@ -303,7 +303,7 @@ inline void encode(const boost::optional<T> &p, bufferlist &bl)
   __u8 present = static_cast<bool>(p);
   ::encode(present, bl);
   if (p)
-    ::encode(p.get(), bl);
+    encode(p.get(), bl);
 }
 
 template<typename T>
@@ -314,7 +314,7 @@ inline void decode(boost::optional<T> &p, bufferlist::iterator &bp)
   if (present) {
     T t;
     p = t;
-    ::decode(p.get(), bp);
+    decode(p.get(), bp);
   }
 }
 
diff --git a/src/include/memory.h b/src/include/memory.h
index ee28d69..596627c 100644
--- a/src/include/memory.h
+++ b/src/include/memory.h
@@ -10,6 +10,7 @@
 namespace ceph {
   using std::shared_ptr;
   using std::weak_ptr;
+  using std::static_pointer_cast;
 }
 
 #else
@@ -19,6 +20,7 @@ namespace ceph {
 namespace ceph {
   using std::tr1::shared_ptr;
   using std::tr1::weak_ptr;
+  using std::tr1::static_pointer_cast;
 }
 
 #endif
diff --git a/src/include/rados.h b/src/include/rados.h
index 49391d9..0d02b24 100644
--- a/src/include/rados.h
+++ b/src/include/rados.h
@@ -372,6 +372,8 @@ enum {
 	CEPH_OSD_FLAG_SKIPRWLOCKS =   0x10000,  /* skip rw locks */
 	CEPH_OSD_FLAG_IGNORE_OVERLAY =0x20000,  /* ignore pool overlay */
 	CEPH_OSD_FLAG_FLUSH =         0x40000,  /* this is part of flush */
+	CEPH_OSD_FLAG_MAP_SNAP_CLONE =0x80000,  /* map snap direct to clone id
+						 */
 };
 
 enum {
@@ -401,6 +403,8 @@ enum {
 	CEPH_OSD_COPY_FROM_FLAG_FLUSH = 1,     /* part of a flush operation */
 	CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY = 2,  /* ignore pool overlay */
 	CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE = 4, /* ignore osd cache logic */
+	CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE = 8, /* map snap direct to
+						     * cloneid */
 };
 
 enum {
diff --git a/src/include/rados/librados.h b/src/include/rados/librados.h
index 0a3ef2b..3e68292 100644
--- a/src/include/rados/librados.h
+++ b/src/include/rados/librados.h
@@ -915,6 +915,20 @@ int rados_ioctx_snap_remove(rados_ioctx_t io, const char *snapname);
  * @param snapname which snapshot to rollback to
  * @returns 0 on success, negative error code on failure
  */
+int rados_ioctx_snap_rollback(rados_ioctx_t io, const char *oid,
+		   const char *snapname);
+
+/**
+ * Rollback an object to a pool snapshot *DEPRECATED*
+ *
+ * Deprecated interface which is now rados_ioctx_snap_rollback()
+ * This function could go away in the future
+ *
+ * @param io the pool in which the object is stored
+ * @param oid the name of the object to rollback
+ * @param snapname which snapshot to rollback to
+ * @returns 0 on success, negative error code on failure
+ */
 int rados_rollback(rados_ioctx_t io, const char *oid,
 		   const char *snapname);
 
@@ -1064,8 +1078,7 @@ uint64_t rados_get_last_version(rados_ioctx_t io);
  * @param buf data to write
  * @param len length of the data, in bytes
  * @param off byte offset in the object to begin writing at
- * @returns number of bytes written on success, negative error code on
- * failure
+ * @returns 0 on success, negative error code on failure
  */
 int rados_write(rados_ioctx_t io, const char *oid, const char *buf, size_t len, uint64_t off);
 
@@ -1111,8 +1124,7 @@ int rados_clone_range(rados_ioctx_t io, const char *dst, uint64_t dst_off,
  * @param oid the name of the object
  * @param buf the data to append
  * @param len length of buf (in bytes)
- * @returns number of bytes written on success, negative error code on
- * failure
+ * @returns 0 on success, negative error code on failure
  */
 int rados_append(rados_ioctx_t io, const char *oid, const char *buf, size_t len);
 
@@ -2290,7 +2302,7 @@ void rados_read_op_omap_get_vals_by_keys(rados_read_op_t read_op,
 					 int *prval);
 
 /**
- * Perform a write operation synchronously
+ * Perform a read operation synchronously
  * @param read_op operation to perform
  * @io the ioctx that the object is in
  * @oid the object id
@@ -2302,7 +2314,7 @@ int rados_read_op_operate(rados_read_op_t read_op,
 			  int flags);
 
 /**
- * Perform a write operation asynchronously
+ * Perform a read operation asynchronously
  * @param read_op operation to perform
  * @io the ioctx that the object is in
  * @param completion what to do when operation has been attempted
diff --git a/src/include/rados/librados.hpp b/src/include/rados/librados.hpp
index dde6273..e7aeb4f 100644
--- a/src/include/rados/librados.hpp
+++ b/src/include/rados/librados.hpp
@@ -267,6 +267,15 @@ namespace librados
     void selfmanaged_snap_rollback(uint64_t snapid);
 
     /**
+     * Rollback an object to the specified snapshot id
+     *
+     * Used with pool snapshots
+     *
+     * @param snapid [in] snapshot id specified
+     */
+    void snap_rollback(uint64_t snapid);
+
+    /**
      * set keys and values according to map
      *
      * @param map [in] keys and values to set
@@ -614,6 +623,8 @@ namespace librados
 
     int snap_list(std::vector<snap_t> *snaps);
 
+    int snap_rollback(const std::string& oid, const char *snapname);
+    // Deprecated name kept for backward compatibility - same as snap_rollback()
     int rollback(const std::string& oid, const char *snapname);
 
     int selfmanaged_snap_create(uint64_t *snapid);
diff --git a/src/include/rados/memory.h b/src/include/rados/memory.h
index ee28d69..596627c 100644
--- a/src/include/rados/memory.h
+++ b/src/include/rados/memory.h
@@ -10,6 +10,7 @@
 namespace ceph {
   using std::shared_ptr;
   using std::weak_ptr;
+  using std::static_pointer_cast;
 }
 
 #else
@@ -19,6 +20,7 @@ namespace ceph {
 namespace ceph {
   using std::tr1::shared_ptr;
   using std::tr1::weak_ptr;
+  using std::tr1::static_pointer_cast;
 }
 
 #endif
diff --git a/src/include/utime.h b/src/include/utime.h
index 5bebc70..3108ecd 100644
--- a/src/include/utime.h
+++ b/src/include/utime.h
@@ -36,7 +36,7 @@ public:
   friend class Clock;
  
  public:
-  bool is_zero() {
+  bool is_zero() const {
     return (tv.tv_sec == 0) && (tv.tv_nsec == 0);
   }
   void normalize() {
@@ -130,14 +130,14 @@ public:
   operator double() const {
     return (double)sec() + ((double)nsec() / 1000000000.0L);
   }
-  operator ceph_timespec() {
+  operator ceph_timespec() const {
     ceph_timespec ts;
     ts.tv_sec = sec();
     ts.tv_nsec = nsec();
     return ts;
   }
 
-  void sleep() {
+  void sleep() const {
     struct timespec ts;
     to_timespec(&ts);
     nanosleep(&ts, NULL);
diff --git a/src/librados/AioCompletionImpl.h b/src/librados/AioCompletionImpl.h
index 521f3bc..fd8ea48 100644
--- a/src/librados/AioCompletionImpl.h
+++ b/src/librados/AioCompletionImpl.h
@@ -40,7 +40,7 @@ struct librados::AioCompletionImpl {
   // for read
   bool is_read;
   bufferlist bl;
-  unsigned maxlen;
+  bufferlist *blp;
 
   IoCtxImpl *io;
   ceph_tid_t aio_write_seq;
@@ -53,7 +53,7 @@ struct librados::AioCompletionImpl {
 			callback_safe(0),
 			callback_complete_arg(0),
 			callback_safe_arg(0),
-			is_read(false), maxlen(0),
+			is_read(false), blp(NULL),
 			io(NULL), aio_write_seq(0), aio_write_list_item(this) { }
 
   int set_complete_callback(void *cb_arg, rados_callback_t cb) {
diff --git a/src/librados/IoCtxImpl.cc b/src/librados/IoCtxImpl.cc
index 4f5387a..6fc22ad 100644
--- a/src/librados/IoCtxImpl.cc
+++ b/src/librados/IoCtxImpl.cc
@@ -460,11 +460,7 @@ int librados::IoCtxImpl::write(const object_t& oid, bufferlist& bl,
   bufferlist mybl;
   mybl.substr_of(bl, 0, len);
   op.write(off, mybl);
-  int r =  operate(oid, &op, NULL);
-  if (r < 0)
-    return r;
-
-  return len;
+  return operate(oid, &op, NULL);
 }
 
 int librados::IoCtxImpl::append(const object_t& oid, bufferlist& bl, size_t len)
@@ -474,11 +470,7 @@ int librados::IoCtxImpl::append(const object_t& oid, bufferlist& bl, size_t len)
   bufferlist mybl;
   mybl.substr_of(bl, 0, len);
   op.append(mybl);
-  int r = operate(oid, &op, NULL);
-  if (r < 0)
-    return r;
-
-  return len;
+  return operate(oid, &op, NULL);
 }
 
 int librados::IoCtxImpl::write_full(const object_t& oid, bufferlist& bl)
@@ -636,6 +628,7 @@ int librados::IoCtxImpl::aio_read(const object_t oid, AioCompletionImpl *c,
 
   c->is_read = true;
   c->io = this;
+  c->blp = pbl;
 
   Mutex::Locker l(*lock);
   objecter->read(oid, oloc,
@@ -655,9 +648,9 @@ int librados::IoCtxImpl::aio_read(const object_t oid, AioCompletionImpl *c,
 
   c->is_read = true;
   c->io = this;
-  c->maxlen = len;
   c->bl.clear();
   c->bl.push_back(buffer::create_static(len, buf));
+  c->blp = &c->bl;
 
   Mutex::Locker l(*lock);
   objecter->read(oid, oloc,
@@ -1250,8 +1243,8 @@ void librados::IoCtxImpl::C_aio_Ack::finish(int r)
     c->safe = true;
   c->cond.Signal();
 
-  if (c->bl.length() > 0) {
-    c->rval = c->bl.length();
+  if (r == 0 && c->blp && c->blp->length() > 0) {
+    c->rval = c->blp->length();
   }
 
   if (c->callback_complete) {
diff --git a/src/librados/librados.cc b/src/librados/librados.cc
index 80f2c63..2358fb4 100644
--- a/src/librados/librados.cc
+++ b/src/librados/librados.cc
@@ -453,6 +453,13 @@ void librados::ObjectWriteOperation::selfmanaged_snap_rollback(snap_t snapid)
   o->rollback(snapid);
 }
 
+// You must specify the snapid, not the name normally used with pool snapshots
+void librados::ObjectWriteOperation::snap_rollback(snap_t snapid)
+{
+  ::ObjectOperation *o = (::ObjectOperation *)impl;
+  o->rollback(snapid);
+}
+
 void librados::ObjectWriteOperation::set_alloc_hint(
                                             uint64_t expected_object_size,
                                             uint64_t expected_write_size)
@@ -1144,11 +1151,17 @@ int librados::IoCtx::snap_list(std::vector<snap_t> *snaps)
   return io_ctx_impl->snap_list(snaps);
 }
 
-int librados::IoCtx::rollback(const std::string& oid, const char *snapname)
+int librados::IoCtx::snap_rollback(const std::string& oid, const char *snapname)
 {
   return io_ctx_impl->rollback(oid, snapname);
 }
 
+// Deprecated name kept for backward compatibility
+int librados::IoCtx::rollback(const std::string& oid, const char *snapname)
+{
+  return snap_rollback(oid, snapname);
+}
+
 int librados::IoCtx::selfmanaged_snap_create(uint64_t *snapid)
 {
   return io_ctx_impl->selfmanaged_snap_create(snapid);
@@ -2035,7 +2048,7 @@ extern "C" int rados_pool_list(rados_t cluster, char *buf, size_t len)
   if (r < 0)
     return r;
 
-  if (!buf)
+  if (len > 0 && !buf)
     return -EINVAL;
 
   char *b = buf;
@@ -2492,13 +2505,20 @@ extern "C" int rados_ioctx_snap_remove(rados_ioctx_t io, const char *snapname)
   return ctx->snap_remove(snapname);
 }
 
-extern "C" int rados_rollback(rados_ioctx_t io, const char *oid,
+extern "C" int rados_ioctx_snap_rollback(rados_ioctx_t io, const char *oid,
 			      const char *snapname)
 {
   librados::IoCtxImpl *ctx = (librados::IoCtxImpl *)io;
   return ctx->rollback(oid, snapname);
 }
 
+// Deprecated name kept for backward compatibility
+extern "C" int rados_rollback(rados_ioctx_t io, const char *oid,
+			      const char *snapname)
+{
+  return rados_ioctx_snap_rollback(io, oid, snapname);
+}
+
 extern "C" int rados_ioctx_selfmanaged_snap_create(rados_ioctx_t io,
 					     uint64_t *snapid)
 {
diff --git a/src/librbd/internal.cc b/src/librbd/internal.cc
index 8056fab..127be38 100644
--- a/src/librbd/internal.cc
+++ b/src/librbd/internal.cc
@@ -2884,9 +2884,6 @@ reprotect_and_return_err:
     ldout(cct, 20) << "aio_write " << ictx << " off = " << off << " len = "
 		   << len << " buf = " << (void*)buf << dendl;
 
-    if (!len)
-      return 0;
-
     int r = ictx_check(ictx);
     if (r < 0)
       return r;
@@ -2912,14 +2909,16 @@ reprotect_and_return_err:
 
     // map
     vector<ObjectExtent> extents;
-    Striper::file_to_extents(ictx->cct, ictx->format_string, &ictx->layout, off, mylen, 0, extents);
+    if (len > 0) {
+      Striper::file_to_extents(ictx->cct, ictx->format_string,
+			       &ictx->layout, off, mylen, 0, extents);
+    }
 
     c->get();
     c->init_time(ictx, AIO_TYPE_WRITE);
     for (vector<ObjectExtent>::iterator p = extents.begin(); p != extents.end(); ++p) {
       ldout(cct, 20) << " oid " << p->oid << " " << p->offset << "~" << p->length
 		     << " from " << p->buffer_extents << dendl;
-
       // assemble extent
       bufferlist bl;
       for (vector<pair<uint64_t,uint64_t> >::iterator q = p->buffer_extents.begin();
@@ -2966,9 +2965,6 @@ reprotect_and_return_err:
     ldout(cct, 20) << "aio_discard " << ictx << " off = " << off << " len = "
 		   << len << dendl;
 
-    if (!len)
-      return 0;
-
     int r = ictx_check(ictx);
     if (r < 0)
       return r;
@@ -2992,7 +2988,10 @@ reprotect_and_return_err:
 
     // map
     vector<ObjectExtent> extents;
-    Striper::file_to_extents(ictx->cct, ictx->format_string, &ictx->layout, off, len, 0, extents);
+    if (len > 0) {
+      Striper::file_to_extents(ictx->cct, ictx->format_string,
+			       &ictx->layout, off, len, 0, extents);
+    }
 
     c->get();
     c->init_time(ictx, AIO_TYPE_DISCARD);
@@ -3086,6 +3085,8 @@ reprotect_and_return_err:
       r = clip_io(ictx, p->first, &len);
       if (r < 0)
 	return r;
+      if (len == 0)
+	continue;
 
       Striper::file_to_extents(ictx->cct, ictx->format_string, &ictx->layout,
 			       p->first, len, 0, object_extents, buffer_ofs);
diff --git a/src/mds/CDentry.h b/src/mds/CDentry.h
index 39a4c55..41ed83d 100644
--- a/src/mds/CDentry.h
+++ b/src/mds/CDentry.h
@@ -242,11 +242,11 @@ public:
     return get_projected_linkage()->inode;
   }
 
-  bool use_projected(client_t client, Mutation *mut) {
+  bool use_projected(client_t client, const MutationRef& mut) const {
     return lock.can_read_projected(client) || 
       lock.get_xlock_by() == mut;
   }
-  linkage_t *get_linkage(client_t client, Mutation *mut) {
+  linkage_t *get_linkage(client_t client, const MutationRef& mut) {
     return use_projected(client, mut) ? get_projected_linkage() : get_linkage();
   }
 
diff --git a/src/mds/CDir.cc b/src/mds/CDir.cc
index 35573b6..a423ef7 100644
--- a/src/mds/CDir.cc
+++ b/src/mds/CDir.cc
@@ -1050,7 +1050,7 @@ void CDir::assimilate_dirty_rstat_inodes()
   dout(10) << "assimilate_dirty_rstat_inodes done" << dendl;
 }
 
-void CDir::assimilate_dirty_rstat_inodes_finish(Mutation *mut, EMetaBlob *blob)
+void CDir::assimilate_dirty_rstat_inodes_finish(MutationRef& mut, EMetaBlob *blob)
 {
   if (!state_test(STATE_ASSIMRSTAT))
     return;
@@ -1775,7 +1775,7 @@ void CDir::_omap_fetched(bufferlist& hdrbl, map<string, bufferlist>& omap,
  * @param want - min version i want committed
  * @param c - callback for completion
  */
-void CDir::commit(version_t want, Context *c, bool ignore_authpinnability)
+void CDir::commit(version_t want, Context *c, bool ignore_authpinnability, int op_prio)
 {
   dout(10) << "commit want " << want << " on " << *this << dendl;
   if (want == 0) want = get_version();
@@ -1797,7 +1797,7 @@ void CDir::commit(version_t want, Context *c, bool ignore_authpinnability)
   waiting_for_commit[want].push_back(c);
   
   // ok.
-  _commit(want);
+  _commit(want, op_prio);
 }
 
 class C_Dir_Committed : public Context {
@@ -1815,13 +1815,16 @@ public:
  * Flush out the modified dentries in this dir. Keep the bufferlist
  * below max_write_size;
  */
-void CDir::_omap_commit()
+void CDir::_omap_commit(int op_prio)
 {
   dout(10) << "_omap_commit" << dendl;
 
   unsigned max_write_size = cache->max_dir_commit_size;
   unsigned write_size = 0;
 
+  if (op_prio < 0)
+    op_prio = CEPH_MSG_PRIO_DEFAULT;
+
   // snap purge?
   const set<snapid_t> *snaps = NULL;
   SnapRealm *realm = inode->find_snaprealm();
@@ -1877,7 +1880,7 @@ void CDir::_omap_commit()
 
     if (write_size >= max_write_size) {
       ObjectOperation op;
-      op.priority = CEPH_MSG_PRIO_LOW; // set priority lower than journal!
+      op.priority = op_prio;
       op.tmap_to_omap(true); // convert tmap to omap
 
       if (!to_set.empty())
@@ -1895,7 +1898,7 @@ void CDir::_omap_commit()
   }
 
   ObjectOperation op;
-  op.priority = CEPH_MSG_PRIO_LOW; // set priority lower than journal!
+  op.priority = op_prio;
   op.tmap_to_omap(true); // convert tmap to omap
 
   /*
@@ -1968,7 +1971,7 @@ void CDir::_encode_dentry(CDentry *dn, bufferlist& bl,
   }
 }
 
-void CDir::_commit(version_t want)
+void CDir::_commit(version_t want, int op_prio)
 {
   dout(10) << "_commit want " << want << " on " << *this << dendl;
 
@@ -2008,7 +2011,7 @@ void CDir::_commit(version_t want)
   
   if (cache->mds->logger) cache->mds->logger->inc(l_mds_dir_c);
 
-   _omap_commit();
+   _omap_commit(op_prio);
 }
 
 
@@ -2090,7 +2093,7 @@ void CDir::_committed(version_t v)
     ++n;
     if (p->first > committed_version) {
       dout(10) << " there are waiters for " << p->first << ", committing again" << dendl;
-      _commit(p->first);
+      _commit(p->first, -1);
       break;
     }
     cache->mds->queue_waiters(p->second);
diff --git a/src/mds/CDir.h b/src/mds/CDir.h
index dc1037a..f5762e2 100644
--- a/src/mds/CDir.h
+++ b/src/mds/CDir.h
@@ -179,7 +179,7 @@ public:
   void resync_accounted_fragstat();
   void resync_accounted_rstat();
   void assimilate_dirty_rstat_inodes();
-  void assimilate_dirty_rstat_inodes_finish(Mutation *mut, EMetaBlob *blob);
+  void assimilate_dirty_rstat_inodes_finish(MutationRef& mut, EMetaBlob *blob);
 
 protected:
   version_t projected_version;
@@ -498,14 +498,15 @@ protected:
 
   // -- commit --
   map<version_t, list<Context*> > waiting_for_commit;
-  void _commit(version_t want);
-  void _omap_commit();
+  void _commit(version_t want, int op_prio);
+  void _omap_commit(int op_prio);
   void _encode_dentry(CDentry *dn, bufferlist& bl, const set<snapid_t> *snaps);
   void _committed(version_t v);
 public:
   void wait_for_commit(Context *c, version_t v=0);
   void commit_to(version_t want);
-  void commit(version_t want, Context *c, bool ignore_authpinnability=false);
+  void commit(version_t want, Context *c,
+	      bool ignore_authpinnability=false, int op_prio=-1);
 
   // -- dirtyness --
   version_t get_committing_version() { return committing_version; }
diff --git a/src/mds/CInode.cc b/src/mds/CInode.cc
index e475dfb..53a0f44 100644
--- a/src/mds/CInode.cc
+++ b/src/mds/CInode.cc
@@ -1238,6 +1238,8 @@ void CInode::encode_lock_state(int type, bufferlist& bl)
       if (!is_dir()) {
 	::encode(inode.layout, bl);
 	::encode(inode.size, bl);
+	::encode(inode.truncate_seq, bl);
+	::encode(inode.truncate_size, bl);
 	::encode(inode.client_ranges, bl);
 	::encode(inode.inline_data, bl);
 	::encode(inode.inline_version, bl);
@@ -1439,6 +1441,8 @@ void CInode::decode_lock_state(int type, bufferlist& bl)
       if (!is_dir()) {
 	::decode(inode.layout, p);
 	::decode(inode.size, p);
+	::decode(inode.truncate_seq, p);
+	::decode(inode.truncate_size, p);
 	::decode(inode.client_ranges, p);
 	::decode(inode.inline_data, p);
 	::decode(inode.inline_version, p);
@@ -1678,9 +1682,9 @@ void CInode::start_scatter(ScatterLock *lock)
 struct C_Inode_FragUpdate : public Context {
   CInode *in;
   CDir *dir;
-  Mutation *mut;
+  MutationRef mut;
 
-  C_Inode_FragUpdate(CInode *i, CDir *d, Mutation *m) : in(i), dir(d), mut(m) {}
+  C_Inode_FragUpdate(CInode *i, CDir *d, MutationRef& m) : in(i), dir(d), mut(m) {}
   void finish(int r) {
     in->_finish_frag_update(dir, mut);
   }    
@@ -1701,7 +1705,7 @@ void CInode::finish_scatter_update(ScatterLock *lock, CDir *dir,
       dout(10) << "finish_scatter_update " << fg << " journaling accounted scatterstat update v" << inode_version << dendl;
 
       MDLog *mdlog = mdcache->mds->mdlog;
-      Mutation *mut = new Mutation;
+      MutationRef mut(new MutationImpl);
       mut->ls = mdlog->get_current_segment();
 
       inode_t *pi = get_projected_inode();
@@ -1742,12 +1746,11 @@ void CInode::finish_scatter_update(ScatterLock *lock, CDir *dir,
   }
 }
 
-void CInode::_finish_frag_update(CDir *dir, Mutation *mut)
+void CInode::_finish_frag_update(CDir *dir, MutationRef& mut)
 {
   dout(10) << "_finish_frag_update on " << *dir << dendl;
   mut->apply();
   mut->cleanup();
-  delete mut;
 }
 
 
@@ -1952,7 +1955,7 @@ void CInode::finish_scatter_gather_update(int type)
   }
 }
 
-void CInode::finish_scatter_gather_update_accounted(int type, Mutation *mut, EMetaBlob *metablob)
+void CInode::finish_scatter_gather_update_accounted(int type, MutationRef& mut, EMetaBlob *metablob)
 {
   dout(10) << "finish_scatter_gather_update_accounted " << type << " on " << *this << dendl;
   assert(is_auth());
@@ -2993,6 +2996,7 @@ int CInode::encode_inodestat(bufferlist& bl, Session *session,
       issue = cap->pending();
       cap->set_last_issue();
       cap->set_last_issue_stamp(ceph_clock_now(g_ceph_context));
+      cap->clear_new();
       e.cap.caps = issue;
       e.cap.wanted = cap->wanted();
       e.cap.cap_id = cap->get_cap_id();
diff --git a/src/mds/CInode.h b/src/mds/CInode.h
index efc9825..cb1add3 100644
--- a/src/mds/CInode.h
+++ b/src/mds/CInode.h
@@ -663,7 +663,7 @@ public:
   void encode_lock_state(int type, bufferlist& bl);
   void decode_lock_state(int type, bufferlist& bl);
 
-  void _finish_frag_update(CDir *dir, Mutation *mut);
+  void _finish_frag_update(CDir *dir, MutationRef& mut);
 
   void clear_dirty_scattered(int type);
   bool is_dirty_scattered();
@@ -673,7 +673,7 @@ public:
   void finish_scatter_update(ScatterLock *lock, CDir *dir,
 			     version_t inode_version, version_t dir_accounted_version);
   void finish_scatter_gather_update(int type);
-  void finish_scatter_gather_update_accounted(int type, Mutation *mut, EMetaBlob *metablob);
+  void finish_scatter_gather_update_accounted(int type, MutationRef& mut, EMetaBlob *metablob);
 
   // -- snap --
   void open_snaprealm(bool no_split=false);
diff --git a/src/mds/Capability.h b/src/mds/Capability.h
index c105a38..3d97648 100644
--- a/src/mds/Capability.h
+++ b/src/mds/Capability.h
@@ -137,15 +137,7 @@ private:
 
 public:
   int pending() { return _pending; }
-  int issued() {
-    if (0) {
-      //#warning capability debug sanity check, remove me someday
-      unsigned o = _issued;
-      _calc_issued();
-      assert(o == _issued);
-    }
-    return _issued;
-  }
+  int issued() { return _issued; }
   bool is_null() { return !_pending && _revokes.empty(); }
 
   ceph_seq_t issue(unsigned c) {
@@ -192,9 +184,14 @@ public:
       // can i forget any revocations?
       while (!_revokes.empty() && _revokes.front().seq < seq)
 	_revokes.pop_front();
-      if (!_revokes.empty() && _revokes.front().seq == seq)
-	_revokes.begin()->before = caps;
-      _calc_issued();
+      if (!_revokes.empty()) {
+	if (_revokes.front().seq == seq)
+	  _revokes.begin()->before = caps;
+	_calc_issued();
+      } else {
+	// seq < last_sent
+	_issued = caps | _pending;
+      }
     }
     //check_rdcaps_list();
   }
diff --git a/src/mds/Locker.cc b/src/mds/Locker.cc
index 6a274bb..e5fe00c 100644
--- a/src/mds/Locker.cc
+++ b/src/mds/Locker.cc
@@ -169,7 +169,7 @@ void Locker::include_snap_rdlocks_wlayout(set<SimpleLock*>& rdlocks, CInode *in,
 
 /* If this function returns false, the mdr has been placed
  * on the appropriate wait list */
-bool Locker::acquire_locks(MDRequest *mdr,
+bool Locker::acquire_locks(MDRequestRef& mdr,
 			   set<SimpleLock*> &rdlocks,
 			   set<SimpleLock*> &wrlocks,
 			   set<SimpleLock*> &xlocks,
@@ -288,7 +288,7 @@ bool Locker::acquire_locks(MDRequest *mdr,
     
     if (!object->is_auth()) {
       if (!mdr->locks.empty())
-	mds->locker->drop_locks(mdr);
+	mds->locker->drop_locks(mdr.get());
       if (object->is_ambiguous_auth()) {
 	// wait
 	dout(10) << " ambiguous auth, waiting to authpin " << *object << dendl;
@@ -301,7 +301,7 @@ bool Locker::acquire_locks(MDRequest *mdr,
     }
     if (!object->can_auth_pin()) {
       // wait
-      mds->locker->drop_locks(mdr);
+      mds->locker->drop_locks(mdr.get());
       mdr->drop_local_auth_pins();
       if (auth_pin_nonblock) {
 	dout(10) << " can't auth_pin (freezing?) " << *object << ", nonblocking" << dendl;
@@ -393,7 +393,7 @@ bool Locker::acquire_locks(MDRequest *mdr,
 	    mdr->remote_wrlocks[have] != (*remote_wrlocks)[have]) {
 	  dout(10) << " unlocking remote_wrlock on wrong mds." << mdr->remote_wrlocks[have]
 		   << " " << *have << " " << *have->get_parent() << dendl;
-	  remote_wrlock_finish(have, mdr->remote_wrlocks[have], mdr);
+	  remote_wrlock_finish(have, mdr->remote_wrlocks[have], mdr.get());
 	}
       }
       if (need_wrlock || need_remote_wrlock) {
@@ -422,7 +422,7 @@ bool Locker::acquire_locks(MDRequest *mdr,
 	else if (need_remote_wrlock) // acquire remote_wrlock first
 	  dout(10) << " unlocking out-of-order " << *lock << " " << *lock->get_parent() << dendl;
 	bool need_issue = false;
-	wrlock_finish(lock, mdr, &need_issue);
+	wrlock_finish(lock, mdr.get(), &need_issue);
 	if (need_issue)
 	  issue_set.insert(static_cast<CInode*>(lock->get_parent()));
       }
@@ -434,15 +434,15 @@ bool Locker::acquire_locks(MDRequest *mdr,
       dout(10) << " unlocking out-of-order " << *stray << " " << *stray->get_parent() << dendl;
       bool need_issue = false;
       if (mdr->xlocks.count(stray)) {
-	xlock_finish(stray, mdr, &need_issue);
+	xlock_finish(stray, mdr.get(), &need_issue);
       } else if (mdr->rdlocks.count(stray)) {
-	rdlock_finish(stray, mdr, &need_issue);
+	rdlock_finish(stray, mdr.get(), &need_issue);
       } else {
 	// may have acquired both wrlock and remore wrlock
 	if (mdr->wrlocks.count(stray))
-	  wrlock_finish(stray, mdr, &need_issue);
+	  wrlock_finish(stray, mdr.get(), &need_issue);
 	if (mdr->remote_wrlocks.count(stray))
-	  remote_wrlock_finish(stray, mdr->remote_wrlocks[stray], mdr);
+	  remote_wrlock_finish(stray, mdr->remote_wrlocks[stray], mdr.get());
       }
       if (need_issue)
 	issue_set.insert(static_cast<CInode*>(stray->get_parent()));
@@ -450,7 +450,7 @@ bool Locker::acquire_locks(MDRequest *mdr,
 
     // lock
     if (mdr->locking && *p != mdr->locking) {
-      cancel_locking(mdr, &issue_set);
+      cancel_locking(mdr.get(), &issue_set);
     }
     if (xlocks.count(*p)) {
       if (!xlock_start(*p, mdr)) 
@@ -465,7 +465,7 @@ bool Locker::acquire_locks(MDRequest *mdr,
 	if (need_remote_wrlock && !(*p)->can_wrlock(mdr->get_client())) {
 	  // can't take the wrlock because the scatter lock is gathering. need to
 	  // release the remote wrlock, so that the gathering process can finish.
-	  remote_wrlock_finish(*p, mdr->remote_wrlocks[*p], mdr);
+	  remote_wrlock_finish(*p, mdr->remote_wrlocks[*p], mdr.get());
 	  remote_wrlock_start(*p, (*remote_wrlocks)[*p], mdr);
 	  goto out;
 	}
@@ -487,15 +487,15 @@ bool Locker::acquire_locks(MDRequest *mdr,
     dout(10) << " unlocking extra " << *stray << " " << *stray->get_parent() << dendl;
     bool need_issue = false;
     if (mdr->xlocks.count(stray)) {
-      xlock_finish(stray, mdr, &need_issue);
+      xlock_finish(stray, mdr.get(), &need_issue);
     } else if (mdr->rdlocks.count(stray)) {
-      rdlock_finish(stray, mdr, &need_issue);
+      rdlock_finish(stray, mdr.get(), &need_issue);
     } else {
       // may have acquired both wrlock and remore wrlock
       if (mdr->wrlocks.count(stray))
-	wrlock_finish(stray, mdr, &need_issue);
+	wrlock_finish(stray, mdr.get(), &need_issue);
       if (mdr->remote_wrlocks.count(stray))
-	remote_wrlock_finish(stray, mdr->remote_wrlocks[stray], mdr);
+	remote_wrlock_finish(stray, mdr->remote_wrlocks[stray], mdr.get());
     }
     if (need_issue)
       issue_set.insert(static_cast<CInode*>(stray->get_parent()));
@@ -510,7 +510,7 @@ bool Locker::acquire_locks(MDRequest *mdr,
 }
 
 
-void Locker::set_xlocks_done(Mutation *mut, bool skip_dentry)
+void Locker::set_xlocks_done(MutationImpl *mut, bool skip_dentry)
 {
   for (set<SimpleLock*>::iterator p = mut->xlocks.begin();
        p != mut->xlocks.end();
@@ -523,7 +523,7 @@ void Locker::set_xlocks_done(Mutation *mut, bool skip_dentry)
   }
 }
 
-void Locker::_drop_rdlocks(Mutation *mut, set<CInode*> *pneed_issue)
+void Locker::_drop_rdlocks(MutationImpl *mut, set<CInode*> *pneed_issue)
 {
   while (!mut->rdlocks.empty()) {
     bool ni = false;
@@ -534,7 +534,7 @@ void Locker::_drop_rdlocks(Mutation *mut, set<CInode*> *pneed_issue)
   }
 }
 
-void Locker::_drop_non_rdlocks(Mutation *mut, set<CInode*> *pneed_issue)
+void Locker::_drop_non_rdlocks(MutationImpl *mut, set<CInode*> *pneed_issue)
 {
   set<int> slaves;
 
@@ -581,7 +581,7 @@ void Locker::_drop_non_rdlocks(Mutation *mut, set<CInode*> *pneed_issue)
   }
 }
 
-void Locker::cancel_locking(Mutation *mut, set<CInode*> *pneed_issue)
+void Locker::cancel_locking(MutationImpl *mut, set<CInode*> *pneed_issue)
 {
   SimpleLock *lock = mut->locking;
   assert(lock);
@@ -602,7 +602,7 @@ void Locker::cancel_locking(Mutation *mut, set<CInode*> *pneed_issue)
   mut->finish_locking(lock);
 }
 
-void Locker::drop_locks(Mutation *mut, set<CInode*> *pneed_issue)
+void Locker::drop_locks(MutationImpl *mut, set<CInode*> *pneed_issue)
 {
   // leftover locks
   set<CInode*> my_need_issue;
@@ -619,7 +619,7 @@ void Locker::drop_locks(Mutation *mut, set<CInode*> *pneed_issue)
   mut->done_locking = false;
 }
 
-void Locker::drop_non_rdlocks(Mutation *mut, set<CInode*> *pneed_issue)
+void Locker::drop_non_rdlocks(MutationImpl *mut, set<CInode*> *pneed_issue)
 {
   set<CInode*> my_need_issue;
   if (!pneed_issue)
@@ -631,7 +631,7 @@ void Locker::drop_non_rdlocks(Mutation *mut, set<CInode*> *pneed_issue)
     issue_caps_set(*pneed_issue);
 }
 
-void Locker::drop_rdlocks(Mutation *mut, set<CInode*> *pneed_issue)
+void Locker::drop_rdlocks(MutationImpl *mut, set<CInode*> *pneed_issue)
 {
   set<CInode*> my_need_issue;
   if (!pneed_issue)
@@ -1149,7 +1149,7 @@ bool Locker::rdlock_try(SimpleLock *lock, client_t client, Context *con)
   return false;
 }
 
-bool Locker::rdlock_start(SimpleLock *lock, MDRequest *mut, bool as_anon)
+bool Locker::rdlock_start(SimpleLock *lock, MDRequestRef& mut, bool as_anon)
 {
   dout(7) << "rdlock_start  on " << *lock << " on " << *lock->get_parent() << dendl;  
 
@@ -1218,7 +1218,7 @@ void Locker::nudge_log(SimpleLock *lock)
     mds->mdlog->flush();
 }
 
-void Locker::rdlock_finish(SimpleLock *lock, Mutation *mut, bool *pneed_issue)
+void Locker::rdlock_finish(SimpleLock *lock, MutationImpl *mut, bool *pneed_issue)
 {
   // drop ref
   lock->put_rdlock();
@@ -1261,7 +1261,7 @@ bool Locker::rdlock_try_set(set<SimpleLock*>& locks)
   return true;
 }
 
-void Locker::rdlock_take_set(set<SimpleLock*>& locks, Mutation *mut)
+void Locker::rdlock_take_set(set<SimpleLock*>& locks, MutationRef& mut)
 {
   dout(10) << "rdlock_take_set " << locks << dendl;
   for (set<SimpleLock*>::iterator p = locks.begin(); p != locks.end(); ++p) {
@@ -1274,7 +1274,7 @@ void Locker::rdlock_take_set(set<SimpleLock*>& locks, Mutation *mut)
 // ------------------
 // wrlock
 
-void Locker::wrlock_force(SimpleLock *lock, Mutation *mut)
+void Locker::wrlock_force(SimpleLock *lock, MutationRef& mut)
 {
   if (lock->get_type() == CEPH_LOCK_IVERSION ||
       lock->get_type() == CEPH_LOCK_DVERSION)
@@ -1287,7 +1287,7 @@ void Locker::wrlock_force(SimpleLock *lock, Mutation *mut)
   mut->locks.insert(lock);
 }
 
-bool Locker::wrlock_start(SimpleLock *lock, MDRequest *mut, bool nowait)
+bool Locker::wrlock_start(SimpleLock *lock, MDRequestRef& mut, bool nowait)
 {
   if (lock->get_type() == CEPH_LOCK_IVERSION ||
       lock->get_type() == CEPH_LOCK_DVERSION)
@@ -1351,7 +1351,7 @@ bool Locker::wrlock_start(SimpleLock *lock, MDRequest *mut, bool nowait)
   return false;
 }
 
-void Locker::wrlock_finish(SimpleLock *lock, Mutation *mut, bool *pneed_issue)
+void Locker::wrlock_finish(SimpleLock *lock, MutationImpl *mut, bool *pneed_issue)
 {
   if (lock->get_type() == CEPH_LOCK_IVERSION ||
       lock->get_type() == CEPH_LOCK_DVERSION)
@@ -1376,7 +1376,7 @@ void Locker::wrlock_finish(SimpleLock *lock, Mutation *mut, bool *pneed_issue)
 
 // remote wrlock
 
-void Locker::remote_wrlock_start(SimpleLock *lock, int target, MDRequest *mut)
+void Locker::remote_wrlock_start(SimpleLock *lock, int target, MDRequestRef& mut)
 {
   dout(7) << "remote_wrlock_start mds." << target << " on " << *lock << " on " << *lock->get_parent() << dendl;
 
@@ -1401,7 +1401,8 @@ void Locker::remote_wrlock_start(SimpleLock *lock, int target, MDRequest *mut)
   mut->more()->waiting_on_slave.insert(target);
 }
 
-void Locker::remote_wrlock_finish(SimpleLock *lock, int target, Mutation *mut)
+void Locker::remote_wrlock_finish(SimpleLock *lock, int target,
+                                  MutationImpl *mut)
 {
   // drop ref
   mut->remote_wrlocks.erase(lock);
@@ -1423,7 +1424,7 @@ void Locker::remote_wrlock_finish(SimpleLock *lock, int target, Mutation *mut)
 // ------------------
 // xlock
 
-bool Locker::xlock_start(SimpleLock *lock, MDRequest *mut)
+bool Locker::xlock_start(SimpleLock *lock, MDRequestRef& mut)
 {
   if (lock->get_type() == CEPH_LOCK_IVERSION ||
       lock->get_type() == CEPH_LOCK_DVERSION)
@@ -1522,7 +1523,7 @@ void Locker::_finish_xlock(SimpleLock *lock, client_t xlocker, bool *pneed_issue
   eval_gather(lock, true, pneed_issue);
 }
 
-void Locker::xlock_finish(SimpleLock *lock, Mutation *mut, bool *pneed_issue)
+void Locker::xlock_finish(SimpleLock *lock, MutationImpl *mut, bool *pneed_issue)
 {
   if (lock->get_type() == CEPH_LOCK_IVERSION ||
       lock->get_type() == CEPH_LOCK_DVERSION)
@@ -1577,7 +1578,7 @@ void Locker::xlock_finish(SimpleLock *lock, Mutation *mut, bool *pneed_issue)
   }
 }
 
-void Locker::xlock_export(SimpleLock *lock, Mutation *mut)
+void Locker::xlock_export(SimpleLock *lock, MutationImpl *mut)
 {
   dout(10) << "xlock_export on " << *lock << " " << *lock->get_parent() << dendl;
 
@@ -1594,7 +1595,7 @@ void Locker::xlock_export(SimpleLock *lock, Mutation *mut)
   lock->set_state(LOCK_LOCK);
 }
 
-void Locker::xlock_import(SimpleLock *lock, Mutation *mut)
+void Locker::xlock_import(SimpleLock *lock)
 {
   dout(10) << "xlock_import on " << *lock << " " << *lock->get_parent() << dendl;
   lock->get_parent()->auth_pin(lock);
@@ -1613,12 +1614,13 @@ version_t Locker::issue_file_data_version(CInode *in)
 struct C_Locker_FileUpdate_finish : public Context {
   Locker *locker;
   CInode *in;
-  Mutation *mut;
+  MutationRef mut;
   bool share;
   client_t client;
   Capability *cap;
   MClientCaps *ack;
-  C_Locker_FileUpdate_finish(Locker *l, CInode *i, Mutation *m, bool e=false, client_t c=-1,
+  C_Locker_FileUpdate_finish(Locker *l, CInode *i, MutationRef& m,
+                             bool e=false, client_t c=-1,
 			     Capability *cp = 0,
 			     MClientCaps *ac = 0) : 
     locker(l), in(i), mut(m), share(e), client(c), cap(cp),
@@ -1630,7 +1632,7 @@ struct C_Locker_FileUpdate_finish : public Context {
   }
 };
 
-void Locker::file_update_finish(CInode *in, Mutation *mut, bool share, client_t client, 
+void Locker::file_update_finish(CInode *in, MutationRef& mut, bool share, client_t client,
 				Capability *cap, MClientCaps *ack)
 {
   dout(10) << "file_update_finish on " << *in << dendl;
@@ -1643,7 +1645,7 @@ void Locker::file_update_finish(CInode *in, Mutation *mut, bool share, client_t
     mds->send_message_client_counted(ack, client);
 
   set<CInode*> need_issue;
-  drop_locks(mut, &need_issue);
+  drop_locks(mut.get(), &need_issue);
 
   if (!in->is_head() && !in->client_snap_caps.empty()) {
     dout(10) << " client_snap_caps " << in->client_snap_caps << dendl;
@@ -1672,14 +1674,14 @@ void Locker::file_update_finish(CInode *in, Mutation *mut, bool share, client_t
       issue_caps(in, cap);
     }
   
-    if (share && in->is_auth() && in->filelock.is_stable())
+    if (share && in->is_auth() &&
+	(in->filelock.gcaps_allowed(CAP_LONER) & (CEPH_CAP_GWR|CEPH_CAP_GBUFFER)))
       share_inode_max_size(in);
   }
   issue_caps_set(need_issue);
 
   // auth unpin after issuing caps
   mut->cleanup();
-  delete mut;
 }
 
 Capability* Locker::issue_new_caps(CInode *in,
@@ -1708,6 +1710,7 @@ Capability* Locker::issue_new_caps(CInode *in,
     // new cap
     cap = in->add_client_cap(my_client, session, realm);
     cap->set_wanted(my_want);
+    cap->mark_new();
     cap->inc_suppress(); // suppress file cap messages for new cap (we'll bundle with the open() reply)
     is_new = true;
   } else {
@@ -1841,7 +1844,11 @@ bool Locker::issue_caps(CInode *in, Capability *only_cap)
 	seq = cap->issue((wanted|likes) & allowed);
       int after = cap->pending();
 
-      if (seq > 0) {
+      if (cap->is_new()) {
+	// haven't send caps to client yet
+	if (before & ~after)
+	  cap->confirm_receipt(seq, after);
+      } else {
         dout(7) << "   sending MClientCaps to client." << it->first
 		<< " seq " << cap->get_last_seq()
 		<< " new pending " << ccap_string(after) << " was " << ccap_string(before) 
@@ -2065,7 +2072,6 @@ void Locker::calc_new_client_ranges(CInode *in, uint64_t size, map<client_t,clie
     if ((p->second->issued() | p->second->wanted()) & (CEPH_CAP_FILE_WR|CEPH_CAP_FILE_BUFFER)) {
       client_writeable_range_t& nr = new_ranges[p->first];
       nr.range.first = 0;
-      nr.follows = latest->client_ranges[p->first].follows;
       if (latest->client_ranges.count(p->first)) {
 	client_writeable_range_t& oldr = latest->client_ranges[p->first];
 	nr.range.last = MAX(ms, oldr.range.last);
@@ -2143,7 +2149,7 @@ bool Locker::check_inode_max_size(CInode *in, bool force_wrlock,
     }
   }
 
-  Mutation *mut = new Mutation;
+  MutationRef mut(new MutationImpl);
   mut->ls = mds->mdlog->get_current_segment();
     
   inode_t *pi = in->project_inode();
@@ -2186,7 +2192,7 @@ bool Locker::check_inode_max_size(CInode *in, bool force_wrlock,
     metablob->add_primary_dentry(parent, in, true);
   } else {
     metablob->add_dir_context(in->get_projected_parent_dn()->get_dir());
-    mdcache->journal_dirty_inode(mut, metablob, in);
+    mdcache->journal_dirty_inode(mut.get(), metablob, in);
   }
   mds->mdlog->submit_entry(le, new C_Locker_FileUpdate_finish(this, in, mut, true));
   wrlock_force(&in->filelock, mut);  // wrlock for duration of journal
@@ -2538,11 +2544,12 @@ public:
     locker(l), client(c), item(it) { }
   void finish(int r) {
     string dname;
-    locker->process_request_cap_release(NULL, client, item, dname);
+    MDRequestRef null_ref;
+    locker->process_request_cap_release(null_ref, client, item, dname);
   }
 };
 
-void Locker::process_request_cap_release(MDRequest *mdr, client_t client, const ceph_mds_request_release& item,
+void Locker::process_request_cap_release(MDRequestRef& mdr, client_t client, const ceph_mds_request_release& item,
 					 const string &dname)
 {
   inodeno_t ino = (uint64_t)item.ino;
@@ -2652,7 +2659,7 @@ void Locker::kick_issue_caps(CInode *in, client_t client, ceph_seq_t seq)
   issue_caps(in, cap);
 }
 
-void Locker::kick_cap_releases(MDRequest *mdr)
+void Locker::kick_cap_releases(MDRequestRef& mdr)
 {
   client_t client = mdr->get_client();
   for (map<vinodeno_t,ceph_seq_t>::iterator p = mdr->cap_releases.begin();
@@ -2693,7 +2700,7 @@ void Locker::_do_snap_update(CInode *in, snapid_t snap, int dirty, snapid_t foll
 
   EUpdate *le = new EUpdate(mds->mdlog, "snap flush");
   mds->mdlog->start_entry(le);
-  Mutation *mut = new Mutation;
+  MutationRef mut(new MutationImpl);
   mut->ls = mds->mdlog->get_current_segment();
 
   // normal metadata updates that we can apply to the head as well.
@@ -2746,7 +2753,7 @@ void Locker::_do_snap_update(CInode *in, snapid_t snap, int dirty, snapid_t foll
 
   mut->auth_pin(in);
   mdcache->predirty_journal_parents(mut, &le->metablob, in, 0, PREDIRTY_PRIMARY, 0, follows);
-  mdcache->journal_dirty_inode(mut, &le->metablob, in, follows);
+  mdcache->journal_dirty_inode(mut.get(), &le->metablob, in, follows);
 
   mds->mdlog->submit_entry(le);
   mds->mdlog->wait_for_safe(new C_Locker_FileUpdate_finish(this, in, mut, false,
@@ -2949,7 +2956,7 @@ bool Locker::_do_cap_update(CInode *in, Capability *cap,
   inode_t *pi = in->project_inode(px);
   pi->version = in->pre_dirty();
 
-  Mutation *mut = new Mutation;
+  MutationRef mut(new MutationImpl);
   mut->ls = mds->mdlog->get_current_segment();
 
   _update_cap_fields(in, dirty, m, pi);
@@ -2984,7 +2991,7 @@ bool Locker::_do_cap_update(CInode *in, Capability *cap,
   
   mut->auth_pin(in);
   mdcache->predirty_journal_parents(mut, &le->metablob, in, 0, PREDIRTY_PRIMARY, 0, follows);
-  mdcache->journal_dirty_inode(mut, &le->metablob, in, follows);
+  mdcache->journal_dirty_inode(mut.get(), &le->metablob, in, follows);
 
   mds->mdlog->submit_entry(le);
   mds->mdlog->wait_for_safe(new C_Locker_FileUpdate_finish(this, in, mut, change_max, 
@@ -3557,11 +3564,12 @@ bool Locker::simple_sync(SimpleLock *lock, bool *need_issue)
       }
     }
     
+    bool need_recover = false;
     if (lock->get_type() == CEPH_LOCK_IFILE) {
       assert(in);
       if (in->state_test(CInode::STATE_NEEDSRECOVER)) {
         mds->mdcache->queue_file_recover(in);
-        mds->mdcache->do_file_recover();
+	need_recover = true;
         gather++;
       }
     }
@@ -3575,6 +3583,8 @@ bool Locker::simple_sync(SimpleLock *lock, bool *need_issue)
 
     if (gather) {
       lock->get_parent()->auth_pin(lock);
+      if (need_recover)
+	mds->mdcache->do_file_recover();
       return false;
     }
   }
@@ -3695,11 +3705,12 @@ void Locker::simple_lock(SimpleLock *lock, bool *need_issue)
     }
   }
 
+  bool need_recover = false;
   if (lock->get_type() == CEPH_LOCK_IFILE) {
     assert(in);
     if(in->state_test(CInode::STATE_NEEDSRECOVER)) {
       mds->mdcache->queue_file_recover(in);
-      mds->mdcache->do_file_recover();
+      need_recover = true;
       gather++;
     }
   }
@@ -3730,6 +3741,8 @@ void Locker::simple_lock(SimpleLock *lock, bool *need_issue)
 
   if (gather) {
     lock->get_parent()->auth_pin(lock);
+    if (need_recover)
+      mds->mdcache->do_file_recover();
   } else {
     lock->set_state(LOCK_LOCK);
     lock->finish_waiters(ScatterLock::WAIT_XLOCK|ScatterLock::WAIT_WR|ScatterLock::WAIT_STABLE);
@@ -3827,7 +3840,7 @@ void Locker::scatter_writebehind(ScatterLock *lock)
   dout(10) << "scatter_writebehind " << in->inode.mtime << " on " << *lock << " on " << *in << dendl;
 
   // journal
-  Mutation *mut = new Mutation;
+  MutationRef mut(new MutationImpl);
   mut->ls = mds->mdlog->get_current_segment();
 
   // forcefully take a wrlock
@@ -3847,7 +3860,7 @@ void Locker::scatter_writebehind(ScatterLock *lock)
   mds->mdlog->start_entry(le);
 
   mdcache->predirty_journal_parents(mut, &le->metablob, in, 0, PREDIRTY_PRIMARY, false);
-  mdcache->journal_dirty_inode(mut, &le->metablob, in);
+  mdcache->journal_dirty_inode(mut.get(), &le->metablob, in);
   
   in->finish_scatter_gather_update_accounted(lock->get_type(), mut, &le->metablob);
 
@@ -3855,7 +3868,7 @@ void Locker::scatter_writebehind(ScatterLock *lock)
   mds->mdlog->wait_for_safe(new C_Locker_ScatterWB(this, lock, mut));
 }
 
-void Locker::scatter_writebehind_finish(ScatterLock *lock, Mutation *mut)
+void Locker::scatter_writebehind_finish(ScatterLock *lock, MutationRef& mut)
 {
   CInode *in = static_cast<CInode*>(lock->get_parent());
   dout(10) << "scatter_writebehind_finish on " << *lock << " on " << *in << dendl;
@@ -3876,9 +3889,8 @@ void Locker::scatter_writebehind_finish(ScatterLock *lock, Mutation *mut)
   }
 
   mut->apply();
-  drop_locks(mut);
+  drop_locks(mut.get());
   mut->cleanup();
-  delete mut;
 
   if (lock->is_stable())
     lock->finish_waiters(ScatterLock::WAIT_STABLE);
@@ -4148,7 +4160,7 @@ void Locker::scatter_tempsync(ScatterLock *lock, bool *need_issue)
 // ==========================================================================
 // local lock
 
-void Locker::local_wrlock_grab(LocalLock *lock, Mutation *mut)
+void Locker::local_wrlock_grab(LocalLock *lock, MutationRef& mut)
 {
   dout(7) << "local_wrlock_grab  on " << *lock
 	  << " on " << *lock->get_parent() << dendl;  
@@ -4161,7 +4173,7 @@ void Locker::local_wrlock_grab(LocalLock *lock, Mutation *mut)
   mut->locks.insert(lock);
 }
 
-bool Locker::local_wrlock_start(LocalLock *lock, MDRequest *mut)
+bool Locker::local_wrlock_start(LocalLock *lock, MDRequestRef& mut)
 {
   dout(7) << "local_wrlock_start  on " << *lock
 	  << " on " << *lock->get_parent() << dendl;  
@@ -4179,7 +4191,7 @@ bool Locker::local_wrlock_start(LocalLock *lock, MDRequest *mut)
   }
 }
 
-void Locker::local_wrlock_finish(LocalLock *lock, Mutation *mut)
+void Locker::local_wrlock_finish(LocalLock *lock, MutationImpl *mut)
 {
   dout(7) << "local_wrlock_finish  on " << *lock
 	  << " on " << *lock->get_parent() << dendl;  
@@ -4193,7 +4205,7 @@ void Locker::local_wrlock_finish(LocalLock *lock, Mutation *mut)
   }
 }
 
-bool Locker::local_xlock_start(LocalLock *lock, MDRequest *mut)
+bool Locker::local_xlock_start(LocalLock *lock, MDRequestRef& mut)
 {
   dout(7) << "local_xlock_start  on " << *lock
 	  << " on " << *lock->get_parent() << dendl;  
@@ -4210,7 +4222,7 @@ bool Locker::local_xlock_start(LocalLock *lock, MDRequest *mut)
   return true;
 }
 
-void Locker::local_xlock_finish(LocalLock *lock, Mutation *mut)
+void Locker::local_xlock_finish(LocalLock *lock, MutationImpl *mut)
 {
   dout(7) << "local_xlock_finish  on " << *lock
 	  << " on " << *lock->get_parent() << dendl;  
@@ -4390,15 +4402,18 @@ void Locker::scatter_mix(ScatterLock *lock, bool *need_issue)
 	issue_caps(in);
       gather++;
     }
+    bool need_recover = false;
     if (in->state_test(CInode::STATE_NEEDSRECOVER)) {
       mds->mdcache->queue_file_recover(in);
-      mds->mdcache->do_file_recover();
+      need_recover = true;
       gather++;
     }
 
-    if (gather)
+    if (gather) {
       lock->get_parent()->auth_pin(lock);
-    else {
+      if (need_recover)
+	mds->mdcache->do_file_recover();
+    } else {
       in->start_scatter(lock);
       lock->set_state(LOCK_MIX);
       lock->clear_scatter_wanted();
@@ -4462,14 +4477,17 @@ void Locker::file_excl(ScatterLock *lock, bool *need_issue)
       issue_caps(in);
     gather++;
   }
+  bool need_recover = false;
   if (in->state_test(CInode::STATE_NEEDSRECOVER)) {
     mds->mdcache->queue_file_recover(in);
-    mds->mdcache->do_file_recover();
+    need_recover = true;
     gather++;
   }
   
   if (gather) {
     lock->get_parent()->auth_pin(lock);
+    if (need_recover)
+      mds->mdcache->do_file_recover();
   } else {
     lock->set_state(LOCK_EXCL);
     if (need_issue)
diff --git a/src/mds/Locker.h b/src/mds/Locker.h
index 6056862..17b3c47 100644
--- a/src/mds/Locker.h
+++ b/src/mds/Locker.h
@@ -29,8 +29,6 @@ class Session;
 class CDir;
 class CInode;
 class CDentry;
-struct Mutation;
-struct MDRequest;
 class EMetaBlob;
 struct SnapRealm;
 
@@ -77,14 +75,14 @@ protected:
   void send_lock_message(SimpleLock *lock, int msg, const bufferlist &data);
 
   // -- locks --
-  void _drop_rdlocks(Mutation *mut, set<CInode*> *pneed_issue);
-  void _drop_non_rdlocks(Mutation *mut, set<CInode*> *pneed_issue);
+  void _drop_rdlocks(MutationImpl *mut, set<CInode*> *pneed_issue);
+  void _drop_non_rdlocks(MutationImpl *mut, set<CInode*> *pneed_issue);
 public:
   void include_snap_rdlocks(set<SimpleLock*>& rdlocks, CInode *in);
   void include_snap_rdlocks_wlayout(set<SimpleLock*>& rdlocks, CInode *in,
                                     ceph_file_layout **layout);
 
-  bool acquire_locks(MDRequest *mdr,
+  bool acquire_locks(MDRequestRef& mdr,
 		     set<SimpleLock*> &rdlocks,
 		     set<SimpleLock*> &wrlocks,
 		     set<SimpleLock*> &xlocks,
@@ -92,11 +90,11 @@ public:
 		     CInode *auth_pin_freeze=NULL,
 		     bool auth_pin_nonblock=false);
 
-  void cancel_locking(Mutation *mut, set<CInode*> *pneed_issue);
-  void drop_locks(Mutation *mut, set<CInode*> *pneed_issue=0);
-  void set_xlocks_done(Mutation *mut, bool skip_dentry=false);
-  void drop_non_rdlocks(Mutation *mut, set<CInode*> *pneed_issue=0);
-  void drop_rdlocks(Mutation *mut, set<CInode*> *pneed_issue=0);
+  void cancel_locking(MutationImpl *mut, set<CInode*> *pneed_issue);
+  void drop_locks(MutationImpl *mut, set<CInode*> *pneed_issue=0);
+  void set_xlocks_done(MutationImpl *mut, bool skip_dentry=false);
+  void drop_non_rdlocks(MutationImpl *mut, set<CInode*> *pneed_issue=0);
+  void drop_rdlocks(MutationImpl *mut, set<CInode*> *pneed_issue=0);
 
   void eval_gather(SimpleLock *lock, bool first=false, bool *need_issue=0, list<Context*> *pfinishers=0);
   void eval(SimpleLock *lock, bool *need_issue);
@@ -129,25 +127,25 @@ public:
 
   bool _rdlock_kick(SimpleLock *lock, bool as_anon);
   bool rdlock_try(SimpleLock *lock, client_t client, Context *c);
-  bool rdlock_start(SimpleLock *lock, MDRequest *mut, bool as_anon=false);
-  void rdlock_finish(SimpleLock *lock, Mutation *mut, bool *pneed_issue);
+  bool rdlock_start(SimpleLock *lock, MDRequestRef& mut, bool as_anon=false);
+  void rdlock_finish(SimpleLock *lock, MutationImpl *mut, bool *pneed_issue);
   bool can_rdlock_set(set<SimpleLock*>& locks);
   bool rdlock_try_set(set<SimpleLock*>& locks);
-  void rdlock_take_set(set<SimpleLock*>& locks, Mutation *mut);
+  void rdlock_take_set(set<SimpleLock*>& locks, MutationRef& mut);
 
-  void wrlock_force(SimpleLock *lock, Mutation *mut);
-  bool wrlock_start(SimpleLock *lock, MDRequest *mut, bool nowait=false);
-  void wrlock_finish(SimpleLock *lock, Mutation *mut, bool *pneed_issue);
+  void wrlock_force(SimpleLock *lock, MutationRef& mut);
+  bool wrlock_start(SimpleLock *lock, MDRequestRef& mut, bool nowait=false);
+  void wrlock_finish(SimpleLock *lock, MutationImpl *mut, bool *pneed_issue);
 
-  void remote_wrlock_start(SimpleLock *lock, int target, MDRequest *mut);
-  void remote_wrlock_finish(SimpleLock *lock, int target, Mutation *mut);
+  void remote_wrlock_start(SimpleLock *lock, int target, MDRequestRef& mut);
+  void remote_wrlock_finish(SimpleLock *lock, int target, MutationImpl *mut);
 
-  bool xlock_start(SimpleLock *lock, MDRequest *mut);
+  bool xlock_start(SimpleLock *lock, MDRequestRef& mut);
   void _finish_xlock(SimpleLock *lock, client_t xlocker, bool *pneed_issue);
-  void xlock_finish(SimpleLock *lock, Mutation *mut, bool *pneed_issue);
+  void xlock_finish(SimpleLock *lock, MutationImpl *mut, bool *pneed_issue);
 
-  void xlock_export(SimpleLock *lock, Mutation *mut);
-  void xlock_import(SimpleLock *lock, Mutation *mut);
+  void xlock_export(SimpleLock *lock, MutationImpl *mut);
+  void xlock_import(SimpleLock *lock);
 
 
   // simple
@@ -184,14 +182,15 @@ protected:
   class C_Locker_ScatterWB : public Context {
     Locker *locker;
     ScatterLock *lock;
-    Mutation *mut;
+    MutationRef mut;
   public:
-    C_Locker_ScatterWB(Locker *l, ScatterLock *sl, Mutation *m) : locker(l), lock(sl), mut(m) {}
+    C_Locker_ScatterWB(Locker *l, ScatterLock *sl, MutationRef& m) :
+      locker(l), lock(sl), mut(m) {}
     void finish(int r) { 
       locker->scatter_writebehind_finish(lock, mut); 
     }
   };
-  void scatter_writebehind_finish(ScatterLock *lock, Mutation *mut);
+  void scatter_writebehind_finish(ScatterLock *lock, MutationRef& mut);
 
   xlist<ScatterLock*> updated_scatterlocks;
 public:
@@ -209,10 +208,10 @@ public:
   // process_request_cap_release to preserve ordering.
   bool should_defer_client_cap_frozen(CInode *in);
 
-  void process_request_cap_release(MDRequest *mdr, client_t client, const ceph_mds_request_release& r,
+  void process_request_cap_release(MDRequestRef& mdr, client_t client, const ceph_mds_request_release& r,
 				   const string &dname);
 
-  void kick_cap_releases(MDRequest *mdr);
+  void kick_cap_releases(MDRequestRef& mdr);
   void kick_issue_caps(CInode *in, client_t client, ceph_seq_t seq);
 
   void remove_client_cap(CInode *in, client_t client);
@@ -231,12 +230,12 @@ public:
 
   // local
 public:
-  void local_wrlock_grab(LocalLock *lock, Mutation *mut);
+  void local_wrlock_grab(LocalLock *lock, MutationRef& mut);
 protected:
-  bool local_wrlock_start(LocalLock *lock, MDRequest *mut);
-  void local_wrlock_finish(LocalLock *lock, Mutation *mut);
-  bool local_xlock_start(LocalLock *lock, MDRequest *mut);
-  void local_xlock_finish(LocalLock *lock, Mutation *mut);
+  bool local_wrlock_start(LocalLock *lock, MDRequestRef& mut);
+  void local_wrlock_finish(LocalLock *lock, MutationImpl *mut);
+  bool local_xlock_start(LocalLock *lock, MDRequestRef& mut);
+  void local_xlock_finish(LocalLock *lock, MutationImpl *mut);
 
 
   // file
@@ -272,7 +271,7 @@ public:
 protected:
   void handle_inode_file_caps(class MInodeFileCaps *m);
 
-  void file_update_finish(CInode *in, Mutation *mut, bool share, client_t client, Capability *cap,
+  void file_update_finish(CInode *in, MutationRef& mut, bool share, client_t client, Capability *cap,
 			  MClientCaps *ack);
 public:
   void calc_new_client_ranges(CInode *in, uint64_t size, map<client_t, client_writeable_range_t>& new_ranges);
diff --git a/src/mds/LogSegment.h b/src/mds/LogSegment.h
index 2fffe3b..a2b8ace 100644
--- a/src/mds/LogSegment.h
+++ b/src/mds/LogSegment.h
@@ -67,7 +67,7 @@ class LogSegment {
   map<int,version_t> tablev;
 
   // try to expire
-  void try_to_expire(MDS *mds, C_GatherBuilder &gather_bld);
+  void try_to_expire(MDS *mds, C_GatherBuilder &gather_bld, int op_prio);
 
   // cons
   LogSegment(loff_t off) :
diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
index b75bab0..49f3308 100644
--- a/src/mds/MDCache.cc
+++ b/src/mds/MDCache.cc
@@ -411,11 +411,11 @@ void MDCache::create_mydir_hierarchy(C_Gather *gather)
 
 struct C_MDC_CreateSystemFile : public Context {
   MDCache *cache;
-  Mutation *mut;
+  MutationRef mut;
   CDentry *dn;
   version_t dpv;
   Context *fin;
-  C_MDC_CreateSystemFile(MDCache *c, Mutation *mu, CDentry *d, version_t v, Context *f) :
+  C_MDC_CreateSystemFile(MDCache *c, MutationRef& mu, CDentry *d, version_t v, Context *f) :
     cache(c), mut(mu), dn(d), dpv(v), fin(f) {}
   void finish(int r) {
     cache->_create_system_file_finish(mut, dn, dpv, fin);
@@ -444,7 +444,7 @@ void MDCache::_create_system_file(CDir *dir, const char *name, CInode *in, Conte
   SnapRealm *realm = dir->get_inode()->find_snaprealm();
   dn->first = in->first = realm->get_newest_seq() + 1;
 
-  Mutation *mut = new Mutation;
+  MutationRef mut(new MutationImpl);
 
   // force some locks.  hacky.
   mds->locker->wrlock_force(&dir->inode->filelock, mut);
@@ -459,7 +459,7 @@ void MDCache::_create_system_file(CDir *dir, const char *name, CInode *in, Conte
     le->metablob.add_primary_dentry(dn, in, true);
   } else {
     predirty_journal_parents(mut, &le->metablob, in, dir, PREDIRTY_DIR, 1);
-    journal_dirty_inode(mut, &le->metablob, in);
+    journal_dirty_inode(mut.get(), &le->metablob, in);
     dn->push_projected_linkage(in->ino(), in->d_type());
     le->metablob.add_remote_dentry(dn, true, in->ino(), in->d_type());
     le->metablob.add_root(true, in);
@@ -472,7 +472,7 @@ void MDCache::_create_system_file(CDir *dir, const char *name, CInode *in, Conte
   mds->mdlog->flush();
 }
 
-void MDCache::_create_system_file_finish(Mutation *mut, CDentry *dn, version_t dpv, Context *fin)
+void MDCache::_create_system_file_finish(MutationRef& mut, CDentry *dn, version_t dpv, Context *fin)
 {
   dout(10) << "_create_system_file_finish " << *dn << dendl;
   
@@ -491,9 +491,8 @@ void MDCache::_create_system_file_finish(Mutation *mut, CDentry *dn, version_t d
   }
 
   mut->apply();
-  mds->locker->drop_locks(mut);
+  mds->locker->drop_locks(mut.get());
   mut->cleanup();
-  delete mut;
 
   fin->complete(0);
 
@@ -799,9 +798,9 @@ void MDCache::try_subtree_merge(CDir *dir)
 class C_MDC_SubtreeMergeWB : public Context {
   MDCache *mdcache;
   CInode *in;
-  Mutation *mut;
+  MutationRef mut;
 public:
-  C_MDC_SubtreeMergeWB(MDCache *mdc, CInode *i, Mutation *m) : mdcache(mdc), in(i), mut(m) {}
+  C_MDC_SubtreeMergeWB(MDCache *mdc, CInode *i, MutationRef& m) : mdcache(mdc), in(i), mut(m) {}
   void finish(int r) { 
     mdcache->subtree_merge_writebehind_finish(in, mut);
   }
@@ -868,13 +867,13 @@ void MDCache::try_subtree_merge_at(CDir *dir, bool do_eval)
       inode_t *pi = in->project_inode();
       pi->version = in->pre_dirty();
       
-      Mutation *mut = new Mutation;
+      MutationRef mut(new MutationImpl);
       mut->ls = mds->mdlog->get_current_segment();
       EUpdate *le = new EUpdate(mds->mdlog, "subtree merge writebehind");
       mds->mdlog->start_entry(le);
 
       le->metablob.add_dir_context(in->get_parent_dn()->get_dir());
-      journal_dirty_inode(mut, &le->metablob, in);
+      journal_dirty_inode(mut.get(), &le->metablob, in);
       
       mds->mdlog->submit_entry(le);
       mds->mdlog->wait_for_safe(new C_MDC_SubtreeMergeWB(this, in, mut));
@@ -885,15 +884,14 @@ void MDCache::try_subtree_merge_at(CDir *dir, bool do_eval)
   show_subtrees(15);
 }
 
-void MDCache::subtree_merge_writebehind_finish(CInode *in, Mutation *mut)
+void MDCache::subtree_merge_writebehind_finish(CInode *in, MutationRef& mut)
 {
   dout(10) << "subtree_merge_writebehind_finish on " << in << dendl;
   in->pop_and_dirty_projected_inode(mut->ls);
 
   mut->apply();
-  mds->locker->drop_locks(mut);
+  mds->locker->drop_locks(mut.get());
   mut->cleanup();
-  delete mut;
 
   in->auth_unpin(this);
 }
@@ -1486,7 +1484,8 @@ CInode *MDCache::cow_inode(CInode *in, snapid_t last)
   return oldin;
 }
 
-void MDCache::journal_cow_dentry(Mutation *mut, EMetaBlob *metablob, CDentry *dn, snapid_t follows,
+void MDCache::journal_cow_dentry(MutationImpl *mut, EMetaBlob *metablob,
+                                 CDentry *dn, snapid_t follows,
 				 CInode **pcow_inode, CDentry::linkage_t *dnl)
 {
   if (!dn) {
@@ -1569,15 +1568,16 @@ void MDCache::journal_cow_dentry(Mutation *mut, EMetaBlob *metablob, CDentry *dn
 }
 
 
-void MDCache::journal_cow_inode(Mutation *mut, EMetaBlob *metablob, CInode *in, snapid_t follows,
+void MDCache::journal_cow_inode(MutationRef& mut, EMetaBlob *metablob,
+                                CInode *in, snapid_t follows,
 				CInode **pcow_inode)
 {
   dout(10) << "journal_cow_inode follows " << follows << " on " << *in << dendl;
   CDentry *dn = in->get_projected_parent_dn();
-  journal_cow_dentry(mut, metablob, dn, follows, pcow_inode);
+  journal_cow_dentry(mut.get(), metablob, dn, follows, pcow_inode);
 }
 
-void MDCache::journal_dirty_inode(Mutation *mut, EMetaBlob *metablob, CInode *in, snapid_t follows)
+void MDCache::journal_dirty_inode(MutationImpl *mut, EMetaBlob *metablob, CInode *in, snapid_t follows)
 {
   if (in->is_base()) {
     metablob->add_root(true, in, in->get_projected_inode());
@@ -1848,7 +1848,7 @@ void MDCache::project_rstat_frag_to_inode(nest_info_t& rstat, nest_info_t& accou
  * accounted_rstat on scatterlock sync may not match our current
  * rstat.  this is normal and expected.
  */
-void MDCache::predirty_journal_parents(Mutation *mut, EMetaBlob *blob,
+void MDCache::predirty_journal_parents(MutationRef mut, EMetaBlob *blob,
 				       CInode *in, CDir *parent,
 				       int flags, int linkunlink,
 				       snapid_t cfollows)
@@ -1996,11 +1996,14 @@ void MDCache::predirty_journal_parents(Mutation *mut, EMetaBlob *blob,
       }
     }
 
+    // can cast only because I'm passing nowait=true to wrlock_start, the sole user of mdmut
+    MDRequestRef mdmut =
+      ceph::static_pointer_cast<MDRequestImpl,MutationImpl>(mut);
     if (!stop &&
 	mut->wrlocks.count(&pin->nestlock) == 0 &&
 	(!pin->versionlock.can_wrlock() ||                   // make sure we can take versionlock, too
 	 //true
-	 !mds->locker->wrlock_start(&pin->nestlock, static_cast<MDRequest*>(mut), true) // can cast only because i'm passing nowait=true
+	 !mds->locker->wrlock_start(&pin->nestlock, mdmut, true)
 	 )) {  // ** do not initiate.. see above comment **
       dout(10) << "predirty_journal_parents can't wrlock one of " << pin->versionlock << " or " << pin->nestlock
 	       << " on " << *pin << dendl;
@@ -2129,7 +2132,7 @@ void MDCache::predirty_journal_parents(Mutation *mut, EMetaBlob *blob,
        p != lsi.end();
        ++p) {
     CInode *cur = *p;
-    journal_dirty_inode(mut, blob, cur);
+    journal_dirty_inode(mut.get(), blob, cur);
   }
  
 }
@@ -2491,19 +2494,20 @@ void MDCache::send_slave_resolves()
   } else {
     set<int> resolve_set;
     mds->mdsmap->get_mds_set(resolve_set, MDSMap::STATE_RESOLVE);
-    for (ceph::unordered_map<metareqid_t, MDRequest*>::iterator p = active_requests.begin();
+    for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin();
 	 p != active_requests.end();
 	 ++p) {
-      if (!p->second->is_slave() || !p->second->slave_did_prepare())
+      MDRequestRef& mdr = p->second;
+      if (!mdr->is_slave() || !mdr->slave_did_prepare())
 	continue;
-      int master = p->second->slave_to_mds;
+      int master = mdr->slave_to_mds;
       if (resolve_set.count(master) || is_ambiguous_slave_update(p->first, master)) {
-	dout(10) << " including uncommitted " << *p->second << dendl;
+	dout(10) << " including uncommitted " << *mdr << dendl;
 	if (!resolves.count(master))
 	  resolves[master] = new MMDSResolve;
-	if (p->second->has_more() && p->second->more()->is_inode_exporter) {
+	if (mdr->has_more() && mdr->more()->is_inode_exporter) {
 	  // re-send cap exports
-	  CInode *in = p->second->more()->rename_inode;
+	  CInode *in = mdr->more()->rename_inode;
 	  map<client_t, Capability::Export> cap_map;
 	  in->export_client_caps(cap_map);
 	  bufferlist bl;
@@ -2644,94 +2648,95 @@ void MDCache::handle_mds_failure(int who)
   migrator->handle_mds_failure_or_stop(who);
 
   // clean up any requests slave to/from this node
-  list<MDRequest*> finish;
-  for (ceph::unordered_map<metareqid_t, MDRequest*>::iterator p = active_requests.begin();
+  list<MDRequestRef> finish;
+  for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin();
        p != active_requests.end();
        ++p) {
+    MDRequestRef& mdr = p->second;;
     // slave to the failed node?
-    if (p->second->slave_to_mds == who) {
-      if (p->second->slave_did_prepare()) {
-	dout(10) << " slave request " << *p->second << " uncommitted, will resolve shortly" << dendl;
-	if (!p->second->more()->waiting_on_slave.empty()) {
-	  assert(p->second->more()->srcdn_auth_mds == mds->get_nodeid());
+    if (mdr->slave_to_mds == who) {
+      if (mdr->slave_did_prepare()) {
+	dout(10) << " slave request " << *mdr << " uncommitted, will resolve shortly" << dendl;
+	if (!mdr->more()->waiting_on_slave.empty()) {
+	  assert(mdr->more()->srcdn_auth_mds == mds->get_nodeid());
 	  // will rollback, no need to wait
-	  if (p->second->slave_request) {
-	    p->second->slave_request->put();
-	    p->second->slave_request = 0;
+	  if (mdr->slave_request) {
+	    mdr->slave_request->put();
+	    mdr->slave_request = 0;
 	  }
-	  p->second->more()->waiting_on_slave.clear();
+	  mdr->more()->waiting_on_slave.clear();
 	}
       } else {
-	dout(10) << " slave request " << *p->second << " has no prepare, finishing up" << dendl;
-	if (p->second->slave_request)
-	  p->second->aborted = true;
+	dout(10) << " slave request " << *mdr << " has no prepare, finishing up" << dendl;
+	if (mdr->slave_request)
+	  mdr->aborted = true;
 	else
-	  finish.push_back(p->second);
+	  finish.push_back(mdr);
       }
     }
 
-    if (p->second->is_slave() && p->second->slave_did_prepare()) {
-      if (p->second->more()->waiting_on_slave.count(who)) {
-	assert(p->second->more()->srcdn_auth_mds == mds->get_nodeid());
-	dout(10) << " slave request " << *p->second << " no longer need rename notity ack from mds."
+    if (mdr->is_slave() && mdr->slave_did_prepare()) {
+      if (mdr->more()->waiting_on_slave.count(who)) {
+	assert(mdr->more()->srcdn_auth_mds == mds->get_nodeid());
+	dout(10) << " slave request " << *mdr << " no longer need rename notity ack from mds."
 		 << who << dendl;
-	p->second->more()->waiting_on_slave.erase(who);
-	if (p->second->more()->waiting_on_slave.empty() && p->second->slave_request)
-	  mds->queue_waiter(new C_MDS_RetryRequest(this, p->second));
+	mdr->more()->waiting_on_slave.erase(who);
+	if (mdr->more()->waiting_on_slave.empty() && mdr->slave_request)
+	  mds->queue_waiter(new C_MDS_RetryRequest(this, mdr));
       }
 
-      if (p->second->more()->srcdn_auth_mds == who &&
-	  mds->mdsmap->is_clientreplay_or_active_or_stopping(p->second->slave_to_mds)) {
+      if (mdr->more()->srcdn_auth_mds == who &&
+	  mds->mdsmap->is_clientreplay_or_active_or_stopping(mdr->slave_to_mds)) {
 	// rename srcdn's auth mds failed, resolve even I'm a survivor.
-	dout(10) << " slave request " << *p->second << " uncommitted, will resolve shortly" << dendl;
-	add_ambiguous_slave_update(p->first, p->second->slave_to_mds);
+	dout(10) << " slave request " << *mdr << " uncommitted, will resolve shortly" << dendl;
+	add_ambiguous_slave_update(p->first, mdr->slave_to_mds);
       }
     }
     
     // failed node is slave?
-    if (p->second->is_master() && !p->second->committing) {
-      if (p->second->more()->srcdn_auth_mds == who) {
-	dout(10) << " master request " << *p->second << " waiting for rename srcdn's auth mds."
+    if (mdr->is_master() && !mdr->committing) {
+      if (mdr->more()->srcdn_auth_mds == who) {
+	dout(10) << " master request " << *mdr << " waiting for rename srcdn's auth mds."
 		 << who << " to recover" << dendl;
-	assert(p->second->more()->witnessed.count(who) == 0);
-	if (p->second->more()->is_ambiguous_auth)
-	  p->second->clear_ambiguous_auth();
+	assert(mdr->more()->witnessed.count(who) == 0);
+	if (mdr->more()->is_ambiguous_auth)
+	  mdr->clear_ambiguous_auth();
 	// rename srcdn's auth mds failed, all witnesses will rollback
-	p->second->more()->witnessed.clear();
+	mdr->more()->witnessed.clear();
 	pending_masters.erase(p->first);
       }
 
-      if (p->second->more()->witnessed.count(who)) {
-	int srcdn_auth = p->second->more()->srcdn_auth_mds;
-	if (srcdn_auth >= 0 && p->second->more()->waiting_on_slave.count(srcdn_auth)) {
-	  dout(10) << " master request " << *p->second << " waiting for rename srcdn's auth mds."
-		   << p->second->more()->srcdn_auth_mds << " to reply" << dendl;
+      if (mdr->more()->witnessed.count(who)) {
+	int srcdn_auth = mdr->more()->srcdn_auth_mds;
+	if (srcdn_auth >= 0 && mdr->more()->waiting_on_slave.count(srcdn_auth)) {
+	  dout(10) << " master request " << *mdr << " waiting for rename srcdn's auth mds."
+		   << mdr->more()->srcdn_auth_mds << " to reply" << dendl;
 	  // waiting for the slave (rename srcdn's auth mds), delay sending resolve ack
 	  // until either the request is committing or the slave also fails.
-	  assert(p->second->more()->waiting_on_slave.size() == 1);
+	  assert(mdr->more()->waiting_on_slave.size() == 1);
 	  pending_masters.insert(p->first);
 	} else {
-	  dout(10) << " master request " << *p->second << " no longer witnessed by slave mds."
+	  dout(10) << " master request " << *mdr << " no longer witnessed by slave mds."
 		   << who << " to recover" << dendl;
 	  if (srcdn_auth >= 0)
-	    assert(p->second->more()->witnessed.count(srcdn_auth) == 0);
+	    assert(mdr->more()->witnessed.count(srcdn_auth) == 0);
 
 	  // discard this peer's prepare (if any)
-	  p->second->more()->witnessed.erase(who);
+	  mdr->more()->witnessed.erase(who);
 	}
       }
       
-      if (p->second->more()->waiting_on_slave.count(who)) {
-	dout(10) << " master request " << *p->second << " waiting for slave mds." << who
+      if (mdr->more()->waiting_on_slave.count(who)) {
+	dout(10) << " master request " << *mdr << " waiting for slave mds." << who
 		 << " to recover" << dendl;
 	// retry request when peer recovers
-	p->second->more()->waiting_on_slave.erase(who);
-	if (p->second->more()->waiting_on_slave.empty())
-	  mds->wait_for_active_peer(who, new C_MDS_RetryRequest(this, p->second));
+	mdr->more()->waiting_on_slave.erase(who);
+	if (mdr->more()->waiting_on_slave.empty())
+	  mds->wait_for_active_peer(who, new C_MDS_RetryRequest(this, mdr));
       }
 
-      if (p->second->locking && p->second->locking_target_mds == who)
-	p->second->finish_locking(p->second->locking);
+      if (mdr->locking && mdr->locking_target_mds == who)
+	mdr->finish_locking(mdr->locking);
     }
   }
 
@@ -3088,7 +3093,7 @@ void MDCache::handle_resolve_ack(MMDSResolveAck *ack)
 
       finish_uncommitted_slave_update(p->first, from);
     } else {
-      MDRequest *mdr = request_get(p->first);
+      MDRequestRef mdr = request_get(p->first);
       // information about master imported caps
       if (p->second.length() > 0)
 	mdr->more()->inode_import.claim(p->second);
@@ -3109,21 +3114,22 @@ void MDCache::handle_resolve_ack(MMDSResolveAck *ack)
 
       // perform rollback (and journal a rollback entry)
       // note: this will hold up the resolve a bit, until the rollback entries journal.
+      MDRequestRef null_ref;
       switch (su->origop) {
       case ESlaveUpdate::LINK:
-	mds->server->do_link_rollback(su->rollback, from, 0);
+	mds->server->do_link_rollback(su->rollback, from, null_ref);
 	break;
       case ESlaveUpdate::RENAME:
-	mds->server->do_rename_rollback(su->rollback, from, 0);
+	mds->server->do_rename_rollback(su->rollback, from, null_ref);
 	break;
       case ESlaveUpdate::RMDIR:
-	mds->server->do_rmdir_rollback(su->rollback, from, 0);
+	mds->server->do_rmdir_rollback(su->rollback, from, null_ref);
 	break;
       default:
 	assert(0);
       }
     } else {
-      MDRequest *mdr = request_get(*p);
+      MDRequestRef mdr = request_get(*p);
       mdr->aborted = true;
       if (mdr->slave_request) {
 	if (mdr->more()->slave_commit) // journaling slave prepare ?
@@ -3694,69 +3700,70 @@ void MDCache::rejoin_send_rejoins()
   if (!mds->is_rejoin()) {
     // i am survivor.  send strong rejoin.
     // note request remote_auth_pins, xlocks
-    for (ceph::unordered_map<metareqid_t, MDRequest*>::iterator p = active_requests.begin();
+    for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin();
 	 p != active_requests.end();
 	 ++p) {
-      if ( p->second->is_slave())
+      MDRequestRef& mdr = p->second;
+      if (mdr->is_slave())
 	continue;
       // auth pins
-      for (set<MDSCacheObject*>::iterator q = p->second->remote_auth_pins.begin();
-	   q != p->second->remote_auth_pins.end();
+      for (set<MDSCacheObject*>::iterator q = mdr->remote_auth_pins.begin();
+	   q != mdr->remote_auth_pins.end();
 	   ++q) {
 	if (!(*q)->is_auth()) {
 	  int who = (*q)->authority().first;
 	  if (rejoins.count(who) == 0) continue;
 	  MMDSCacheRejoin *rejoin = rejoins[who];
 	  
-	  dout(15) << " " << *p->second << " authpin on " << **q << dendl;
+	  dout(15) << " " << *mdr << " authpin on " << **q << dendl;
 	  MDSCacheObjectInfo i;
 	  (*q)->set_object_info(i);
 	  if (i.ino)
-	    rejoin->add_inode_authpin(vinodeno_t(i.ino, i.snapid), p->second->reqid, p->second->attempt);
+	    rejoin->add_inode_authpin(vinodeno_t(i.ino, i.snapid), mdr->reqid, mdr->attempt);
 	  else
-	    rejoin->add_dentry_authpin(i.dirfrag, i.dname, i.snapid, p->second->reqid, p->second->attempt);
+	    rejoin->add_dentry_authpin(i.dirfrag, i.dname, i.snapid, mdr->reqid, mdr->attempt);
 
-	  if (p->second->has_more() && p->second->more()->is_remote_frozen_authpin &&
-	      p->second->more()->rename_inode == (*q))
+	  if (mdr->has_more() && mdr->more()->is_remote_frozen_authpin &&
+	      mdr->more()->rename_inode == (*q))
 	    rejoin->add_inode_frozen_authpin(vinodeno_t(i.ino, i.snapid),
-					     p->second->reqid, p->second->attempt);
+					     mdr->reqid, mdr->attempt);
 	}
       }
       // xlocks
-      for (set<SimpleLock*>::iterator q = p->second->xlocks.begin();
-	   q != p->second->xlocks.end();
+      for (set<SimpleLock*>::iterator q = mdr->xlocks.begin();
+	   q != mdr->xlocks.end();
 	   ++q) {
 	if (!(*q)->get_parent()->is_auth()) {
 	  int who = (*q)->get_parent()->authority().first;
 	  if (rejoins.count(who) == 0) continue;
 	  MMDSCacheRejoin *rejoin = rejoins[who];
 	  
-	  dout(15) << " " << *p->second << " xlock on " << **q << " " << *(*q)->get_parent() << dendl;
+	  dout(15) << " " << *mdr << " xlock on " << **q << " " << *(*q)->get_parent() << dendl;
 	  MDSCacheObjectInfo i;
 	  (*q)->get_parent()->set_object_info(i);
 	  if (i.ino)
 	    rejoin->add_inode_xlock(vinodeno_t(i.ino, i.snapid), (*q)->get_type(),
-				    p->second->reqid, p->second->attempt);
+				    mdr->reqid, mdr->attempt);
 	  else
 	    rejoin->add_dentry_xlock(i.dirfrag, i.dname, i.snapid,
-				     p->second->reqid, p->second->attempt);
+				     mdr->reqid, mdr->attempt);
 	}
       }
       // remote wrlocks
-      for (map<SimpleLock*, int>::iterator q = p->second->remote_wrlocks.begin();
-	   q != p->second->remote_wrlocks.end();
+      for (map<SimpleLock*, int>::iterator q = mdr->remote_wrlocks.begin();
+	   q != mdr->remote_wrlocks.end();
 	   ++q) {
 	int who = q->second;
 	if (rejoins.count(who) == 0) continue;
 	MMDSCacheRejoin *rejoin = rejoins[who];
 
-	dout(15) << " " << *p->second << " wrlock on " << q->second
+	dout(15) << " " << *mdr << " wrlock on " << q->second
 		 << " " << q->first->get_parent() << dendl;
 	MDSCacheObjectInfo i;
 	q->first->get_parent()->set_object_info(i);
 	assert(i.ino);
 	rejoin->add_inode_wrlock(vinodeno_t(i.ino, i.snapid), q->first->get_type(),
-				 p->second->reqid, p->second->attempt);
+				 mdr->reqid, mdr->attempt);
       }
     }
   }
@@ -4474,7 +4481,7 @@ void MDCache::handle_cache_rejoin_strong(MMDSCacheRejoin *strong)
 	  dout(10) << " dn authpin by " << *r << " on " << *dn << dendl;
 
 	  // get/create slave mdrequest
-	  MDRequest *mdr;
+	  MDRequestRef mdr;
 	  if (have_request(r->reqid))
 	    mdr = request_get(r->reqid);
 	  else
@@ -4488,7 +4495,7 @@ void MDCache::handle_cache_rejoin_strong(MMDSCacheRejoin *strong)
 	  strong->xlocked_dentries[p->first].count(q->first)) {
 	MMDSCacheRejoin::slave_reqid r = strong->xlocked_dentries[p->first][q->first];
 	dout(10) << " dn xlock by " << r << " on " << *dn << dendl;
-	MDRequest *mdr = request_get(r.reqid);  // should have this from auth_pin above.
+	MDRequestRef mdr = request_get(r.reqid);  // should have this from auth_pin above.
 	assert(mdr->is_auth_pinned(dn));
 	if (!mdr->xlocks.count(&dn->versionlock)) {
 	  assert(dn->versionlock.can_xlock_local());
@@ -4569,7 +4576,7 @@ void MDCache::handle_cache_rejoin_strong(MMDSCacheRejoin *strong)
 	dout(10) << " inode authpin by " << *r << " on " << *in << dendl;
 
 	// get/create slave mdrequest
-	MDRequest *mdr;
+	MDRequestRef mdr;
 	if (have_request(r->reqid))
 	  mdr = request_get(r->reqid);
 	else
@@ -4590,7 +4597,7 @@ void MDCache::handle_cache_rejoin_strong(MMDSCacheRejoin *strong)
 	   ++q) {
 	SimpleLock *lock = in->get_lock(q->first);
 	dout(10) << " inode xlock by " << q->second << " on " << *lock << " on " << *in << dendl;
-	MDRequest *mdr = request_get(q->second.reqid);  // should have this from auth_pin above.
+	MDRequestRef mdr = request_get(q->second.reqid);  // should have this from auth_pin above.
 	assert(mdr->is_auth_pinned(in));
 	if (!mdr->xlocks.count(&in->versionlock)) {
 	  assert(in->versionlock.can_xlock_local());
@@ -4622,7 +4629,7 @@ void MDCache::handle_cache_rejoin_strong(MMDSCacheRejoin *strong)
 	  r != q->second.end();
 	  ++r) {
 	dout(10) << " inode wrlock by " << *r << " on " << *lock << " on " << *in << dendl;
-	MDRequest *mdr = request_get(r->reqid);  // should have this from auth_pin above.
+	MDRequestRef mdr = request_get(r->reqid);  // should have this from auth_pin above.
 	if (in->is_auth())
 	  assert(mdr->is_auth_pinned(in));
 	lock->set_state(LOCK_MIX);
@@ -5471,6 +5478,8 @@ void MDCache::do_cap_import(Session *session, CInode *in, Capability *cap,
     if (cap->get_last_seq() == 0) // reconnected cap
       cap->inc_last_seq();
     cap->set_last_issue();
+    cap->set_last_issue_stamp(ceph_clock_now(g_ceph_context));
+    cap->clear_new();
     MClientCaps *reap = new MClientCaps(CEPH_CAP_OP_IMPORT,
 					in->ino(),
 					realm->inode->ino(),
@@ -5857,8 +5866,9 @@ void MDCache::reissue_all_caps()
 struct C_MDC_QueuedCow : public Context {
   MDCache *mdcache;
   CInode *in;
-  Mutation *mut;
-  C_MDC_QueuedCow(MDCache *mdc, CInode *i, Mutation *m) : mdcache(mdc), in(i), mut(m) {}
+  MutationRef mut;
+  C_MDC_QueuedCow(MDCache *mdc, CInode *i, MutationRef& m) :
+    mdcache(mdc), in(i), mut(m) {}
   void finish(int r) {
     mdcache->_queued_file_recover_cow(in, mut);
   }
@@ -5881,7 +5891,7 @@ void MDCache::queue_file_recover(CInode *in)
     inode_t *pi = in->project_inode();
     pi->version = in->pre_dirty();
 
-    Mutation *mut = new Mutation;
+    MutationRef mut(new MutationImpl);
     mut->ls = mds->mdlog->get_current_segment();
     EUpdate *le = new EUpdate(mds->mdlog, "queue_file_recover cow");
     mds->mdlog->start_entry(le);
@@ -5906,13 +5916,12 @@ void MDCache::queue_file_recover(CInode *in)
   _queue_file_recover(in);
 }
 
-void MDCache::_queued_file_recover_cow(CInode *in, Mutation *mut)
+void MDCache::_queued_file_recover_cow(CInode *in, MutationRef& mut)
 {
   in->pop_and_dirty_projected_inode(mut->ls);
   mut->apply();
-  mds->locker->drop_locks(mut);
+  mds->locker->drop_locks(mut.get());
   mut->cleanup();
-  delete mut;
 }
 
 void MDCache::_queue_file_recover(CInode *in)
@@ -6146,8 +6155,9 @@ void MDCache::_truncate_inode(CInode *in, LogSegment *ls)
 struct C_MDC_TruncateLogged : public Context {
   MDCache *mdc;
   CInode *in;
-  Mutation *mut;
-  C_MDC_TruncateLogged(MDCache *m, CInode *i, Mutation *mu) : mdc(m), in(i), mut(mu) {}
+  MutationRef mut;
+  C_MDC_TruncateLogged(MDCache *m, CInode *i, MutationRef& mu) :
+    mdc(m), in(i), mut(mu) {}
   void finish(int r) {
     mdc->truncate_inode_logged(in, mut);
   }
@@ -6167,7 +6177,7 @@ void MDCache::truncate_inode_finish(CInode *in, LogSegment *ls)
   pi->truncate_from = 0;
   pi->truncate_pending--;
 
-  Mutation *mut = new Mutation;
+  MutationRef mut(new MutationImpl);
   mut->ls = mds->mdlog->get_current_segment();
   mut->add_projected_inode(in);
 
@@ -6178,7 +6188,7 @@ void MDCache::truncate_inode_finish(CInode *in, LogSegment *ls)
   le->metablob.add_primary_dentry(dn, in, true);
   le->metablob.add_truncate_finish(in->ino(), ls->offset);
 
-  journal_dirty_inode(mut, &le->metablob, in);
+  journal_dirty_inode(mut.get(), &le->metablob, in);
   mds->mdlog->submit_entry(le, new C_MDC_TruncateLogged(this, in, mut));
 
   // flush immediately if there are readers/writers waiting
@@ -6186,13 +6196,12 @@ void MDCache::truncate_inode_finish(CInode *in, LogSegment *ls)
     mds->mdlog->flush();
 }
 
-void MDCache::truncate_inode_logged(CInode *in, Mutation *mut)
+void MDCache::truncate_inode_logged(CInode *in, MutationRef& mut)
 {
   dout(10) << "truncate_inode_logged " << *in << dendl;
   mut->apply();
-  mds->locker->drop_locks(mut);
+  mds->locker->drop_locks(mut.get());
   mut->cleanup();
-  delete mut;
 
   in->put(CInode::PIN_TRUNCATING);
   in->auth_unpin(this);
@@ -7509,7 +7518,7 @@ void MDCache::dispatch(Message *m)
   }
 }
 
-Context *MDCache::_get_waiter(MDRequest *mdr, Message *req, Context *fin)
+Context *MDCache::_get_waiter(MDRequestRef& mdr, Message *req, Context *fin)
 {
   if (mdr) {
     dout(20) << "_get_waiter retryrequest" << dendl;
@@ -7522,7 +7531,7 @@ Context *MDCache::_get_waiter(MDRequest *mdr, Message *req, Context *fin)
   }
 }
 
-int MDCache::path_traverse(MDRequest *mdr, Message *req, Context *fin,     // who
+int MDCache::path_traverse(MDRequestRef& mdr, Message *req, Context *fin,     // who
 			   const filepath& path,                   // what
                            vector<CDentry*> *pdnvec,         // result
 			   CInode **pin,
@@ -7969,7 +7978,7 @@ void MDCache::open_remote_dirfrag(CInode *diri, frag_t approxfg, Context *fin)
  * will return inode for primary, or link up/open up remote link's inode as necessary.
  * If it's not available right now, puts mdr on wait list and returns null.
  */
-CInode *MDCache::get_dentry_inode(CDentry *dn, MDRequest *mdr, bool projected)
+CInode *MDCache::get_dentry_inode(CDentry *dn, MDRequestRef& mdr, bool projected)
 {
   CDentry::linkage_t *dnl;
   if (projected)
@@ -8788,7 +8797,8 @@ void MDCache::handle_find_ino_reply(MMDSFindInoReply *m)
     if (!m->path.empty()) {
       // we got a path!
       vector<CDentry*> trace;
-      int r = path_traverse(NULL, m, NULL, m->path, &trace, NULL, MDS_TRAVERSE_DISCOVER);
+      MDRequestRef null_ref;
+      int r = path_traverse(null_ref, m, NULL, m->path, &trace, NULL, MDS_TRAVERSE_DISCOVER);
       if (r > 0)
 	return; 
       dout(0) << "handle_find_ino_reply failed with " << r << " on " << m->path 
@@ -8828,21 +8838,23 @@ void MDCache::kick_find_ino_peers(int who)
 int MDCache::get_num_client_requests()
 {
   int count = 0;
-  for (ceph::unordered_map<metareqid_t, MDRequest*>::iterator p = active_requests.begin();
+  for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin();
       p != active_requests.end();
       ++p) {
-    if (p->second->reqid.name.is_client() && !p->second->is_slave())
+    MDRequestRef& mdr = p->second;
+    if (mdr->reqid.name.is_client() && !mdr->is_slave())
       count++;
   }
   return count;
 }
 
 /* This function takes over the reference to the passed Message */
-MDRequest *MDCache::request_start(MClientRequest *req)
+MDRequestRef MDCache::request_start(MClientRequest *req)
 {
   // did we win a forward race against a slave?
   if (active_requests.count(req->get_reqid())) {
-    MDRequest *mdr = active_requests[req->get_reqid()];
+    MDRequestRef& mdr = active_requests[req->get_reqid()];
+    assert(mdr);
     if (mdr->is_slave()) {
       dout(10) << "request_start already had " << *mdr << ", waiting for finish" << dendl;
       mdr->more()->waiting_for_finish.push_back(new C_MDS_RetryMessage(mds, req));
@@ -8850,28 +8862,29 @@ MDRequest *MDCache::request_start(MClientRequest *req)
       dout(10) << "request_start already processing " << *mdr << ", dropping new msg" << dendl;
       req->put();
     }
-    return 0;
+    return MDRequestRef();
   }
 
   // register new client request
-  MDRequest *mdr = new MDRequest(req->get_reqid(), req->get_num_fwd(), req);
+  MDRequestRef mdr(new MDRequestImpl(req->get_reqid(),
+                                     req->get_num_fwd(), req));
   active_requests[req->get_reqid()] = mdr;
   dout(7) << "request_start " << *mdr << dendl;
   return mdr;
 }
 
-MDRequest *MDCache::request_start_slave(metareqid_t ri, __u32 attempt, int by)
+MDRequestRef MDCache::request_start_slave(metareqid_t ri, __u32 attempt, int by)
 {
-  MDRequest *mdr = new MDRequest(ri, attempt, by);
+  MDRequestRef mdr(new MDRequestImpl(ri, attempt, by));
   assert(active_requests.count(mdr->reqid) == 0);
   active_requests[mdr->reqid] = mdr;
   dout(7) << "request_start_slave " << *mdr << " by mds." << by << dendl;
   return mdr;
 }
 
-MDRequest *MDCache::request_start_internal(int op)
+MDRequestRef MDCache::request_start_internal(int op)
 {
-  MDRequest *mdr = new MDRequest;
+  MDRequestRef mdr(new MDRequestImpl);
   mdr->reqid.name = entity_name_t::MDS(mds->get_nodeid());
   mdr->reqid.tid = mds->issue_tid();
   mdr->internal_op = op;
@@ -8882,15 +8895,15 @@ MDRequest *MDCache::request_start_internal(int op)
   return mdr;
 }
 
-
-MDRequest *MDCache::request_get(metareqid_t rid)
+MDRequestRef MDCache::request_get(metareqid_t rid)
 {
-  assert(active_requests.count(rid));
-  dout(7) << "request_get " << rid << " " << *active_requests[rid] << dendl;
-  return active_requests[rid];
+  ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.find(rid);
+  assert(p != active_requests.end());
+  dout(7) << "request_get " << rid << " " << *p->second << dendl;
+  return p->second;
 }
 
-void MDCache::request_finish(MDRequest *mdr)
+void MDCache::request_finish(MDRequestRef& mdr)
 {
   dout(7) << "request_finish " << *mdr << dendl;
 
@@ -8906,7 +8919,7 @@ void MDCache::request_finish(MDRequest *mdr)
 }
 
 
-void MDCache::request_forward(MDRequest *mdr, int who, int port)
+void MDCache::request_forward(MDRequestRef& mdr, int who, int port)
 {
   if (mdr->client_request->get_source().is_client()) {
     dout(7) << "request_forward " << *mdr << " to mds." << who << " req "
@@ -8922,7 +8935,7 @@ void MDCache::request_forward(MDRequest *mdr, int who, int port)
 }
 
 
-void MDCache::dispatch_request(MDRequest *mdr)
+void MDCache::dispatch_request(MDRequestRef& mdr)
 {
   if (mdr->killed) {
     dout(10) << "request " << *mdr << " was killed" << dendl;
@@ -8947,7 +8960,7 @@ void MDCache::dispatch_request(MDRequest *mdr)
 }
 
 
-void MDCache::request_drop_foreign_locks(MDRequest *mdr)
+void MDCache::request_drop_foreign_locks(MDRequestRef& mdr)
 {
   if (!mdr->has_more())
     return;
@@ -8999,19 +9012,19 @@ void MDCache::request_drop_foreign_locks(MDRequest *mdr)
                                 * this function can get called more than once */
 }
 
-void MDCache::request_drop_non_rdlocks(MDRequest *mdr)
+void MDCache::request_drop_non_rdlocks(MDRequestRef& mdr)
 {
   request_drop_foreign_locks(mdr);
-  mds->locker->drop_non_rdlocks(mdr);
+  mds->locker->drop_non_rdlocks(mdr.get());
 }
 
-void MDCache::request_drop_locks(MDRequest *mdr)
+void MDCache::request_drop_locks(MDRequestRef& mdr)
 {
   request_drop_foreign_locks(mdr);
-  mds->locker->drop_locks(mdr);
+  mds->locker->drop_locks(mdr.get());
 }
 
-void MDCache::request_cleanup(MDRequest *mdr)
+void MDCache::request_cleanup(MDRequestRef& mdr)
 {
   dout(15) << "request_cleanup " << *mdr << dendl;
 
@@ -9045,7 +9058,6 @@ void MDCache::request_cleanup(MDRequest *mdr)
 
   // remove from map
   active_requests.erase(mdr->reqid);
-  mdr->put();
 
   // fail-safe!
   if (was_replay && active_requests.empty()) {
@@ -9057,7 +9069,7 @@ void MDCache::request_cleanup(MDRequest *mdr)
     log_stat();
 }
 
-void MDCache::request_kill(MDRequest *mdr)
+void MDCache::request_kill(MDRequestRef& mdr)
 {
   mdr->killed = true;
   if (!mdr->committing) {
@@ -9084,7 +9096,7 @@ public:
   }
 };
 
-void MDCache::anchor_create_prep_locks(MDRequest *mdr, CInode *in,
+void MDCache::anchor_create_prep_locks(MDRequestRef& mdr, CInode *in,
 				       set<SimpleLock*>& rdlocks, set<SimpleLock*>& xlocks)
 {
   dout(10) << "anchor_create_prep_locks " << *in << dendl;
@@ -9105,7 +9117,7 @@ void MDCache::anchor_create_prep_locks(MDRequest *mdr, CInode *in,
   }
 }
 
-void MDCache::anchor_create(MDRequest *mdr, CInode *in, Context *onfinish)
+void MDCache::anchor_create(MDRequestRef& mdr, CInode *in, Context *onfinish)
 {
   assert(in->is_auth());
   dout(10) << "anchor_create " << *in << dendl;
@@ -9185,9 +9197,9 @@ class C_MDC_AnchorLogged : public Context {
   MDCache *cache;
   CInode *in;
   version_t atid;
-  Mutation *mut;
+  MutationRef mut;
 public:
-  C_MDC_AnchorLogged(MDCache *c, CInode *i, version_t t, Mutation *m) : 
+  C_MDC_AnchorLogged(MDCache *c, CInode *i, version_t t, MutationRef& m) :
     cache(c), in(i), atid(t), mut(m) {}
   void finish(int r) {
     cache->_anchor_logged(in, atid, mut);
@@ -9211,19 +9223,19 @@ void MDCache::_anchor_prepared(CInode *in, version_t atid, bool add)
   }
   pi->version = in->pre_dirty();
 
-  Mutation *mut = new Mutation;
+  MutationRef mut(new MutationImpl);
   mut->ls = mds->mdlog->get_current_segment();
   EUpdate *le = new EUpdate(mds->mdlog, add ? "anchor_create":"anchor_destroy");
   mds->mdlog->start_entry(le);
   predirty_journal_parents(mut, &le->metablob, in, 0, PREDIRTY_PRIMARY);
-  journal_dirty_inode(mut, &le->metablob, in);
+  journal_dirty_inode(mut.get(), &le->metablob, in);
   le->metablob.add_table_transaction(TABLE_ANCHOR, atid);
   mds->mdlog->submit_entry(le, new C_MDC_AnchorLogged(this, in, atid, mut));
   mds->mdlog->flush();
 }
 
 
-void MDCache::_anchor_logged(CInode *in, version_t atid, Mutation *mut)
+void MDCache::_anchor_logged(CInode *in, version_t atid, MutationRef& mut)
 {
   dout(10) << "_anchor_logged on " << *in << dendl;
 
@@ -9249,9 +9261,8 @@ void MDCache::_anchor_logged(CInode *in, version_t atid, Mutation *mut)
   mds->anchorclient->commit(atid, mut->ls);
 
   // drop locks and finish
-  mds->locker->drop_locks(mut);
+  mds->locker->drop_locks(mut.get());
   mut->cleanup();
-  delete mut;
 
   // trigger waiters
   in->finish_waiting(CInode::WAIT_ANCHORED|CInode::WAIT_UNANCHORED, 0);
@@ -9263,17 +9274,18 @@ void MDCache::_anchor_logged(CInode *in, version_t atid, Mutation *mut)
 
 struct C_MDC_snaprealm_create_finish : public Context {
   MDCache *cache;
-  MDRequest *mdr;
-  Mutation *mut;
+  MDRequestRef mdr;
+  MutationRef mut;
   CInode *in;
-  C_MDC_snaprealm_create_finish(MDCache *c, MDRequest *m, Mutation *mu, CInode *i) : 
+  C_MDC_snaprealm_create_finish(MDCache *c, MDRequestRef& m,
+                                MutationRef& mu, CInode *i) :
     cache(c), mdr(m), mut(mu), in(i) {}
   void finish(int r) {
     cache->_snaprealm_create_finish(mdr, mut, in);
   }
 };
 
-void MDCache::snaprealm_create(MDRequest *mdr, CInode *in)
+void MDCache::snaprealm_create(MDRequestRef& mdr, CInode *in)
 {
   dout(10) << "snaprealm_create " << *in << dendl;
   assert(!in->snaprealm);
@@ -9290,7 +9302,7 @@ void MDCache::snaprealm_create(MDRequest *mdr, CInode *in)
     return;
   }
 
-  Mutation *mut = new Mutation;
+  MutationRef mut(new MutationImpl);
   mut->ls = mds->mdlog->get_current_segment();
   EUpdate *le = new EUpdate(mds->mdlog, "snaprealm_create");
   mds->mdlog->start_entry(le);
@@ -9376,21 +9388,19 @@ void MDCache::do_realm_invalidate_and_update_notify(CInode *in, int snapop, bool
     send_snaps(updates);
 }
 
-void MDCache::_snaprealm_create_finish(MDRequest *mdr, Mutation *mut, CInode *in)
+void MDCache::_snaprealm_create_finish(MDRequestRef& mdr, MutationRef& mut, CInode *in)
 {
   dout(10) << "_snaprealm_create_finish " << *in << dendl;
 
   // apply
   in->pop_and_dirty_projected_inode(mut->ls);
   mut->apply();
-  mds->locker->drop_locks(mut);
+  mds->locker->drop_locks(mut.get());
   mut->cleanup();
 
   // tell table we've committed
   mds->snapclient->commit(mdr->more()->stid, mut->ls);
 
-  delete mut;
-
   // create
   bufferlist::iterator p = mdr->more()->snapidbl.begin();
   snapid_t seq;
@@ -10769,7 +10779,8 @@ void MDCache::handle_dir_update(MDirUpdate *m)
       CInode *in;
       filepath path = m->get_path();
       dout(5) << "trying discover on dir_update for " << path << dendl;
-      int r = path_traverse(NULL, m, NULL, path, &trace, &in, MDS_TRAVERSE_DISCOVER);
+      MDRequestRef null_ref;
+      int r = path_traverse(null_ref, m, NULL, path, &trace, &in, MDS_TRAVERSE_DISCOVER);
       if (r > 0)
         return;
       assert(r == 0);
@@ -10876,7 +10887,7 @@ void MDCache::handle_dentry_link(MDentryLink *m)
 
 // UNLINK
 
-void MDCache::send_dentry_unlink(CDentry *dn, CDentry *straydn, MDRequest *mdr)
+void MDCache::send_dentry_unlink(CDentry *dn, CDentry *straydn, MDRequestRef& mdr)
 {
   dout(10) << "send_dentry_unlink " << *dn << dendl;
   // share unlink news with replicas
@@ -11456,9 +11467,9 @@ void MDCache::find_stale_fragment_freeze()
 
 class C_MDC_FragmentPrep : public Context {
   MDCache *mdcache;
-  MDRequest *mdr;
+  MDRequestRef mdr;
 public:
-  C_MDC_FragmentPrep(MDCache *m, MDRequest *r) : mdcache(m), mdr(r) {}
+  C_MDC_FragmentPrep(MDCache *m, MDRequestRef& r) : mdcache(m), mdr(r) {}
   virtual void finish(int r) {
     mdcache->_fragment_logged(mdr);
   }
@@ -11466,9 +11477,9 @@ public:
 
 class C_MDC_FragmentStore : public Context {
   MDCache *mdcache;
-  MDRequest *mdr;
+  MDRequestRef mdr;
 public:
-  C_MDC_FragmentStore(MDCache *m, MDRequest *r) : mdcache(m), mdr(r) {}
+  C_MDC_FragmentStore(MDCache *m, MDRequestRef& r) : mdcache(m), mdr(r) {}
   virtual void finish(int r) {
     mdcache->_fragment_stored(mdr);
   }
@@ -11517,12 +11528,12 @@ void MDCache::fragment_frozen(dirfrag_t basedirfrag, int r)
 
   info.has_frozen = true;
 
-  MDRequest *mdr = request_start_internal(CEPH_MDS_OP_FRAGMENTDIR);
+  MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_FRAGMENTDIR);
   mdr->more()->fragment_base = basedirfrag;
   dispatch_fragment_dir(mdr);
 }
 
-void MDCache::dispatch_fragment_dir(MDRequest *mdr)
+void MDCache::dispatch_fragment_dir(MDRequestRef& mdr)
 {
   dirfrag_t basedirfrag = mdr->more()->fragment_base;
   map<dirfrag_t,fragment_info_t>::iterator it = fragments.find(basedirfrag);
@@ -11595,7 +11606,7 @@ void MDCache::dispatch_fragment_dir(MDRequest *mdr)
     // journal dirfragtree
     inode_t *pi = diri->project_inode();
     pi->version = diri->pre_dirty();
-    journal_dirty_inode(mdr, &le->metablob, diri);
+    journal_dirty_inode(mdr.get(), &le->metablob, diri);
   } else {
     mds->locker->mark_updated_scatterlock(&diri->dirfragtreelock);
     mdr->ls->dirty_dirfrag_dirfragtree.push_back(&diri->item_dirty_dirfrag_dirfragtree);
@@ -11619,7 +11630,7 @@ void MDCache::dispatch_fragment_dir(MDRequest *mdr)
   mds->mdlog->flush();
 }
 
-void MDCache::_fragment_logged(MDRequest *mdr)
+void MDCache::_fragment_logged(MDRequestRef& mdr)
 {
   dirfrag_t basedirfrag = mdr->more()->fragment_base;
   map<dirfrag_t,fragment_info_t>::iterator it = fragments.find(basedirfrag);
@@ -11653,7 +11664,7 @@ void MDCache::_fragment_logged(MDRequest *mdr)
   gather.activate();
 }
 
-void MDCache::_fragment_stored(MDRequest *mdr)
+void MDCache::_fragment_stored(MDRequestRef& mdr)
 {
   dirfrag_t basedirfrag = mdr->more()->fragment_base;
   map<dirfrag_t,fragment_info_t>::iterator it = fragments.find(basedirfrag);
@@ -11690,7 +11701,7 @@ void MDCache::_fragment_stored(MDRequest *mdr)
   mds->mdlog->start_submit_entry(le, new C_MDC_FragmentCommit(this, basedirfrag,
 							      info.resultfrags));
 
-  mds->locker->drop_locks(mdr);
+  mds->locker->drop_locks(mdr.get());
 
   // unfreeze resulting frags
   for (list<CDir*>::iterator p = info.resultfrags.begin();
@@ -12198,15 +12209,12 @@ void MDCache::dump_cache(const char *fn)
 
 
 
-C_MDS_RetryRequest::C_MDS_RetryRequest(MDCache *c, MDRequest *r)
+C_MDS_RetryRequest::C_MDS_RetryRequest(MDCache *c, MDRequestRef& r)
   : cache(c), mdr(r)
-{
-  mdr->get();
-}
+{}
 
 void C_MDS_RetryRequest::finish(int r)
 {
   mdr->retry++;
   cache->dispatch_request(mdr);
-  mdr->put();
 }
diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h
index 422b94d..c536d32 100644
--- a/src/mds/MDCache.h
+++ b/src/mds/MDCache.h
@@ -66,8 +66,8 @@ class MMDSFragmentNotify;
 
 class ESubtreeMap;
 
-struct Mutation;
-struct MDRequest;
+struct MDRequestImpl;
+typedef ceph::shared_ptr<MDRequestImpl> MDRequestRef;
 struct MDSlaveUpdate;
 
 
@@ -204,7 +204,7 @@ public:
   void map_dirfrag_set(list<dirfrag_t>& dfs, set<CDir*>& result);
   void try_subtree_merge(CDir *root);
   void try_subtree_merge_at(CDir *root, bool do_eval=true);
-  void subtree_merge_writebehind_finish(CInode *in, Mutation *mut);
+  void subtree_merge_writebehind_finish(CInode *in, MutationRef& mut);
   void eval_subtree_root(CInode *diri);
   CDir *get_subtree_root(CDir *dir);
   CDir *get_projected_subtree_root(CDir *dir);
@@ -240,37 +240,38 @@ protected:
 
   // -- requests --
 protected:
-  ceph::unordered_map<metareqid_t, MDRequest*> active_requests; 
+  ceph::unordered_map<metareqid_t, MDRequestRef> active_requests;
 
 public:
   int get_num_client_requests();
 
-  MDRequest* request_start(MClientRequest *req);
-  MDRequest* request_start_slave(metareqid_t rid, __u32 attempt, int by);
-  MDRequest* request_start_internal(int op);
+  MDRequestRef request_start(MClientRequest *req);
+  MDRequestRef request_start_slave(metareqid_t rid, __u32 attempt, int by);
+  MDRequestRef request_start_internal(int op);
   bool have_request(metareqid_t rid) {
     return active_requests.count(rid);
   }
-  MDRequest* request_get(metareqid_t rid);
-  void request_pin_ref(MDRequest *r, CInode *ref, vector<CDentry*>& trace);
-  void request_finish(MDRequest *mdr);
-  void request_forward(MDRequest *mdr, int mds, int port=0);
-  void dispatch_request(MDRequest *mdr);
-  void request_drop_foreign_locks(MDRequest *mdr);
-  void request_drop_non_rdlocks(MDRequest *r);
-  void request_drop_locks(MDRequest *r);
-  void request_cleanup(MDRequest *r);
+  MDRequestRef request_get(metareqid_t rid);
+  void request_pin_ref(MDRequestRef& r, CInode *ref, vector<CDentry*>& trace);
+  void request_finish(MDRequestRef& mdr);
+  void request_forward(MDRequestRef& mdr, int mds, int port=0);
+  void dispatch_request(MDRequestRef& mdr);
+  void request_drop_foreign_locks(MDRequestRef& mdr);
+  void request_drop_non_rdlocks(MDRequestRef& r);
+  void request_drop_locks(MDRequestRef& r);
+  void request_cleanup(MDRequestRef& r);
   
-  void request_kill(MDRequest *r);  // called when session closes
+  void request_kill(MDRequestRef& r);  // called when session closes
 
   // journal/snap helpers
   CInode *pick_inode_snap(CInode *in, snapid_t follows);
   CInode *cow_inode(CInode *in, snapid_t last);
-  void journal_cow_dentry(Mutation *mut, EMetaBlob *metablob, CDentry *dn, snapid_t follows=CEPH_NOSNAP,
+  void journal_cow_dentry(MutationImpl *mut, EMetaBlob *metablob, CDentry *dn,
+                          snapid_t follows=CEPH_NOSNAP,
 			  CInode **pcow_inode=0, CDentry::linkage_t *dnl=0);
-  void journal_cow_inode(Mutation *mut, EMetaBlob *metablob, CInode *in, snapid_t follows=CEPH_NOSNAP,
+  void journal_cow_inode(MutationRef& mut, EMetaBlob *metablob, CInode *in, snapid_t follows=CEPH_NOSNAP,
 			  CInode **pcow_inode=0);
-  void journal_dirty_inode(Mutation *mut, EMetaBlob *metablob, CInode *in, snapid_t follows=CEPH_NOSNAP);
+  void journal_dirty_inode(MutationImpl *mut, EMetaBlob *metablob, CInode *in, snapid_t follows=CEPH_NOSNAP);
 
   void project_rstat_inode_to_frag(CInode *cur, CDir *parent, snapid_t first, int linkunlink);
   void _project_rstat_inode_to_frag(inode_t& inode, snapid_t ofirst, snapid_t last,
@@ -278,7 +279,7 @@ public:
   void project_rstat_frag_to_inode(nest_info_t& rstat, nest_info_t& accounted_rstat,
 				   snapid_t ofirst, snapid_t last, 
 				   CInode *pin, bool cow_head);
-  void predirty_journal_parents(Mutation *mut, EMetaBlob *blob,
+  void predirty_journal_parents(MutationRef mut, EMetaBlob *blob,
 				CInode *in, CDir *parent,
 				int flags, int linkunlink=0,
 				snapid_t follows=CEPH_NOSNAP);
@@ -538,7 +539,7 @@ public:
 
   void queue_file_recover(CInode *in);
   void unqueue_file_recover(CInode *in);
-  void _queued_file_recover_cow(CInode *in, Mutation *mut);
+  void _queued_file_recover_cow(CInode *in, MutationRef& mut);
   void _queue_file_recover(CInode *in);
   void identify_files_to_recover(vector<CInode*>& recover_q, vector<CInode*>& check_q);
   void start_files_to_recover(vector<CInode*>& recover_q, vector<CInode*>& check_q);
@@ -688,7 +689,7 @@ protected:
   void truncate_inode(CInode *in, LogSegment *ls);
   void _truncate_inode(CInode *in, LogSegment *ls);
   void truncate_inode_finish(CInode *in, LogSegment *ls);
-  void truncate_inode_logged(CInode *in, Mutation *mut);
+  void truncate_inode_logged(CInode *in, MutationRef& mut);
 
   void add_recovered_truncate(CInode *in, LogSegment *ls);
   void remove_recovered_truncate(CInode *in, LogSegment *ls);
@@ -725,12 +726,13 @@ public:
   void populate_mydir();
 
   void _create_system_file(CDir *dir, const char *name, CInode *in, Context *fin);
-  void _create_system_file_finish(Mutation *mut, CDentry *dn, version_t dpv, Context *fin);
+  void _create_system_file_finish(MutationRef& mut, CDentry *dn,
+                                  version_t dpv, Context *fin);
 
   void open_foreign_mdsdir(inodeno_t ino, Context *c);
   CDentry *get_or_create_stray_dentry(CInode *in);
 
-  Context *_get_waiter(MDRequest *mdr, Message *req, Context *fin);
+  Context *_get_waiter(MDRequestRef& mdr, Message *req, Context *fin);
 
   /**
    * Find the given dentry (and whether it exists or not), its ancestors,
@@ -766,7 +768,7 @@ public:
    * If it returns 2 the request has been forwarded, and again the requester
    * should unwind itself and back out.
    */
-  int path_traverse(MDRequest *mdr, Message *req, Context *fin, const filepath& path,
+  int path_traverse(MDRequestRef& mdr, Message *req, Context *fin, const filepath& path,
 		    vector<CDentry*> *pdnvec, CInode **pin, int onfail);
   bool path_is_mine(filepath& path);
   bool path_is_mine(string& p) {
@@ -777,7 +779,7 @@ public:
   CInode *cache_traverse(const filepath& path);
 
   void open_remote_dirfrag(CInode *diri, frag_t fg, Context *fin);
-  CInode *get_dentry_inode(CDentry *dn, MDRequest *mdr, bool projected=false);
+  CInode *get_dentry_inode(CDentry *dn, MDRequestRef& mdr, bool projected=false);
   void open_remote_ino(inodeno_t ino, Context *fin, bool want_xlocked=false,
 		       inodeno_t hadino=0, version_t hadv=0);
   void open_remote_ino_2(inodeno_t ino,
@@ -861,20 +863,20 @@ public:
 
   // -- anchors --
 public:
-  void anchor_create_prep_locks(MDRequest *mdr, CInode *in, set<SimpleLock*>& rdlocks,
+  void anchor_create_prep_locks(MDRequestRef& mdr, CInode *in, set<SimpleLock*>& rdlocks,
 				set<SimpleLock*>& xlocks);
-  void anchor_create(MDRequest *mdr, CInode *in, Context *onfinish);
+  void anchor_create(MDRequestRef& mdr, CInode *in, Context *onfinish);
   void anchor_destroy(CInode *in, Context *onfinish);
 protected:
   void _anchor_prepared(CInode *in, version_t atid, bool add);
-  void _anchor_logged(CInode *in, version_t atid, Mutation *mut);
+  void _anchor_logged(CInode *in, version_t atid, MutationRef& mut);
   friend class C_MDC_AnchorPrepared;
   friend class C_MDC_AnchorLogged;
 
   // -- snaprealms --
 public:
-  void snaprealm_create(MDRequest *mdr, CInode *in);
-  void _snaprealm_create_finish(MDRequest *mdr, Mutation *mut, CInode *in);
+  void snaprealm_create(MDRequestRef& mdr, CInode *in);
+  void _snaprealm_create_finish(MDRequestRef& mdr, MutationRef& mut, CInode *in);
 
   // -- stray --
 public:
@@ -946,7 +948,7 @@ public:
   // -- namespace --
 public:
   void send_dentry_link(CDentry *dn);
-  void send_dentry_unlink(CDentry *dn, CDentry *straydn, MDRequest *mdr);
+  void send_dentry_unlink(CDentry *dn, CDentry *straydn, MDRequestRef& mdr);
 protected:
   void handle_dentry_link(MDentryLink *m);
   void handle_dentry_unlink(MDentryUnlink *m);
@@ -969,7 +971,7 @@ private:
     int bits;
     list<CDir*> dirs;
     list<CDir*> resultfrags;
-    MDRequest *mdr;
+    MDRequestRef mdr;
     // for deadlock detection
     bool has_frozen;
     utime_t last_cum_auth_pins_change;
@@ -995,9 +997,9 @@ private:
   void fragment_mark_and_complete(list<CDir*>& dirs);
   void fragment_frozen(dirfrag_t basedirfrag, int r);
   void fragment_unmark_unfreeze_dirs(list<CDir*>& dirs);
-  void dispatch_fragment_dir(MDRequest *mdr);
-  void _fragment_logged(MDRequest *mdr);
-  void _fragment_stored(MDRequest *mdr);
+  void dispatch_fragment_dir(MDRequestRef& mdr);
+  void _fragment_logged(MDRequestRef& mdr);
+  void _fragment_stored(MDRequestRef& mdr);
   void _fragment_committed(dirfrag_t f, list<CDir*>& resultfrags);
   void _fragment_finish(dirfrag_t f, list<CDir*>& resultfrags);
 
@@ -1060,9 +1062,9 @@ public:
 
 class C_MDS_RetryRequest : public Context {
   MDCache *cache;
-  MDRequest *mdr;
+  MDRequestRef mdr;
  public:
-  C_MDS_RetryRequest(MDCache *c, MDRequest *r);
+  C_MDS_RetryRequest(MDCache *c, MDRequestRef& r);
   virtual void finish(int r);
 };
 
diff --git a/src/mds/MDLog.cc b/src/mds/MDLog.cc
index 991eaf5..c224695 100644
--- a/src/mds/MDLog.cc
+++ b/src/mds/MDLog.cc
@@ -333,9 +333,14 @@ void MDLog::trim(int m)
     if (stop < ceph_clock_now(g_ceph_context))
       break;
 
-    if ((int)expiring_segments.size() >= g_conf->mds_log_max_expiring)
+    int num_expiring_segments = (int)expiring_segments.size();
+    if (num_expiring_segments >= g_conf->mds_log_max_expiring)
       break;
 
+    int op_prio = CEPH_MSG_PRIO_LOW +
+		  (CEPH_MSG_PRIO_HIGH - CEPH_MSG_PRIO_LOW) *
+		  num_expiring_segments / g_conf->mds_log_max_expiring;
+
     // look at first segment
     LogSegment *ls = p->second;
     assert(ls);
@@ -351,7 +356,7 @@ void MDLog::trim(int m)
     } else if (expired_segments.count(ls)) {
       dout(5) << "trim already expired segment " << ls->offset << ", " << ls->num_events << " events" << dendl;
     } else {
-      try_expire(ls);
+      try_expire(ls, op_prio);
     }
   }
 
@@ -360,16 +365,16 @@ void MDLog::trim(int m)
 }
 
 
-void MDLog::try_expire(LogSegment *ls)
+void MDLog::try_expire(LogSegment *ls, int op_prio)
 {
   C_GatherBuilder gather_bld(g_ceph_context);
-  ls->try_to_expire(mds, gather_bld);
+  ls->try_to_expire(mds, gather_bld, op_prio);
   if (gather_bld.has_subs()) {
     assert(expiring_segments.count(ls) == 0);
     expiring_segments.insert(ls);
     expiring_events += ls->num_events;
     dout(5) << "try_expire expiring segment " << ls->offset << dendl;
-    gather_bld.set_finisher(new C_MaybeExpiredSegment(this, ls));
+    gather_bld.set_finisher(new C_MaybeExpiredSegment(this, ls, op_prio));
     gather_bld.activate();
   } else {
     dout(10) << "try_expire expired segment " << ls->offset << dendl;
@@ -380,13 +385,13 @@ void MDLog::try_expire(LogSegment *ls)
   logger->set(l_mdl_evexg, expiring_events);
 }
 
-void MDLog::_maybe_expired(LogSegment *ls) 
+void MDLog::_maybe_expired(LogSegment *ls, int op_prio)
 {
   dout(10) << "_maybe_expired segment " << ls->offset << " " << ls->num_events << " events" << dendl;
   assert(expiring_segments.count(ls));
   expiring_segments.erase(ls);
   expiring_events -= ls->num_events;
-  try_expire(ls);
+  try_expire(ls, op_prio);
 }
 
 void MDLog::_trim_expired_segments()
diff --git a/src/mds/MDLog.h b/src/mds/MDLog.h
index 82d51c3..6e8e980 100644
--- a/src/mds/MDLog.h
+++ b/src/mds/MDLog.h
@@ -219,15 +219,16 @@ private:
   class C_MaybeExpiredSegment : public Context {
     MDLog *mdlog;
     LogSegment *ls;
+    int op_prio;
   public:
-    C_MaybeExpiredSegment(MDLog *mdl, LogSegment *s) : mdlog(mdl), ls(s) {}
+    C_MaybeExpiredSegment(MDLog *mdl, LogSegment *s, int p) : mdlog(mdl), ls(s), op_prio(p) {}
     void finish(int res) {
-      mdlog->_maybe_expired(ls);
+      mdlog->_maybe_expired(ls, op_prio);
     }
   };
 
-  void try_expire(LogSegment *ls);
-  void _maybe_expired(LogSegment *ls);
+  void try_expire(LogSegment *ls, int op_prio);
+  void _maybe_expired(LogSegment *ls, int op_prio);
   void _expired(LogSegment *ls);
   void _trim_expired_segments();
 
diff --git a/src/mds/MDS.cc b/src/mds/MDS.cc
index 2473ff0..1d9015a 100644
--- a/src/mds/MDS.cc
+++ b/src/mds/MDS.cc
@@ -749,6 +749,9 @@ void MDS::handle_command(MMonCommand *m)
   else if (m->cmd[0] == "exit") {
     suicide();
   }
+  else if (m->cmd[0] == "respawn") {
+    respawn();
+  }
   else if (m->cmd[0] == "session" && m->cmd[1] == "kill") {
     Session *session = sessionmap.get_session(entity_name_t(CEPH_ENTITY_TYPE_CLIENT,
 							    strtol(m->cmd[2].c_str(), 0, 10)));
@@ -994,9 +997,10 @@ void MDS::handle_mds_map(MMDSMap *m)
     } else {
       // did i just recover?
       if ((is_active() || is_clientreplay()) &&
-          (oldstate == MDSMap::STATE_REJOIN ||
+          (oldstate == MDSMap::STATE_CREATING ||
+	   oldstate == MDSMap::STATE_REJOIN ||
 	   oldstate == MDSMap::STATE_RECONNECT))
-        recovery_done();
+        recovery_done(oldstate);
 
       if (is_active()) {
         active_start();
@@ -1199,12 +1203,10 @@ void MDS::boot_create()
     dout(10) << "boot_create creating fresh anchortable" << dendl;
     anchorserver->reset();
     anchorserver->save(fin.new_sub());
-    anchorserver->handle_mds_recovery(whoami);
 
     dout(10) << "boot_create creating fresh snaptable" << dendl;
     snapserver->reset();
     snapserver->save(fin.new_sub());
-    snapserver->handle_mds_recovery(whoami);
   }
 
   assert(g_conf->mds_kill_create_at != 1);
@@ -1564,7 +1566,7 @@ void MDS::active_start()
   finish_contexts(g_ceph_context, waiting_for_active);  // kick waiters
 }
 
-void MDS::recovery_done()
+void MDS::recovery_done(int oldstate)
 {
   dout(1) << "recovery_done -- successful recovery!" << dendl;
   assert(is_clientreplay() || is_active());
@@ -1579,6 +1581,9 @@ void MDS::recovery_done()
     snapserver->finish_recovery(active);
   }
 
+  if (oldstate == MDSMap::STATE_CREATING)
+    return;
+
   mdcache->start_recovered_truncates();
   mdcache->do_file_recover();
 
@@ -1694,13 +1699,30 @@ void MDS::respawn()
   }
   new_argv[orig_argc] = NULL;
 
-  char buf[PATH_MAX];
-  char *cwd = getcwd(buf, sizeof(buf));
-  assert(cwd);
-  dout(1) << " cwd " << cwd << dendl;
+  /* Determine the path to our executable, try to read
+   * linux-specific /proc/ path first */
+  char exe_path[PATH_MAX];
+  /* readlink() does not NUL-terminate its output: read at most
+   * sizeof - 1 bytes and terminate the string ourselves below */
+  ssize_t exe_path_bytes = readlink("/proc/self/exe", exe_path, sizeof(exe_path) - 1);
+  if (exe_path_bytes == -1) {
+    /* Print CWD for the user's interest */
+    char buf[PATH_MAX];
+    char *cwd = getcwd(buf, sizeof(buf));
+    assert(cwd);
+    dout(1) << " cwd " << cwd << dendl;
+
+    /* Fall back to a best-effort: just running in our CWD */
+    strncpy(exe_path, orig_argv[0], sizeof(exe_path) - 1);
+    exe_path[sizeof(exe_path) - 1] = '\0';  /* strncpy may not terminate */
+  } else {
+    exe_path[exe_path_bytes] = '\0';
+  }
+
+  dout(1) << " exe_path " << exe_path << dendl;
 
   unblock_all_signals(NULL);
-  execv(orig_argv[0], new_argv);
+  execv(exe_path, new_argv);
 
   dout(0) << "respawn execv " << orig_argv[0]
 	  << " failed with " << cpp_strerror(errno) << dendl;
diff --git a/src/mds/MDS.h b/src/mds/MDS.h
index fc1cfcd..1cd6096 100644
--- a/src/mds/MDS.h
+++ b/src/mds/MDS.h
@@ -35,7 +35,7 @@
 #include "SessionMap.h"
 
 
-#define CEPH_MDS_PROTOCOL    22 /* cluster internal */
+#define CEPH_MDS_PROTOCOL    23 /* cluster internal */
 
 
 enum {
@@ -389,7 +389,7 @@ class MDS : public Dispatcher {
   void rejoin_joint_start();
   void rejoin_start();
   void rejoin_done();
-  void recovery_done();
+  void recovery_done(int oldstate);
   void clientreplay_start();
   void clientreplay_done();
   void active_start();
diff --git a/src/mds/Migrator.cc b/src/mds/Migrator.cc
index 08cf06c..cde0f70 100644
--- a/src/mds/Migrator.cc
+++ b/src/mds/Migrator.cc
@@ -325,15 +325,15 @@ void Migrator::export_try_cancel(CDir *dir, bool notify_peer)
     mds->queue_waiters(it->second.waiting_for_finish);
     // drop locks
     if (state == EXPORT_LOCKING || state == EXPORT_DISCOVERING) {
-      MDRequest *mdr = dynamic_cast<MDRequest*>(it->second.mut);
+      MDRequestRef mdr = ceph::static_pointer_cast<MDRequestImpl,
+						   MutationImpl>(it->second.mut);
       assert(mdr);
       if (mdr->more()->waiting_on_slave.empty())
 	mds->mdcache->request_finish(mdr);
     } else if (it->second.mut) {
-      Mutation *mut = it->second.mut;
-      mds->locker->drop_locks(mut);
+      MutationRef& mut = it->second.mut;
+      mds->locker->drop_locks(mut.get());
       mut->cleanup();
-      delete mut;
     }
 
     export_state.erase(it);
@@ -745,7 +745,7 @@ void Migrator::export_dir(CDir *dir, int dest)
   dir->auth_pin(this);
   dir->state_set(CDir::STATE_EXPORTING);
 
-  MDRequest *mdr = mds->mdcache->request_start_internal(CEPH_MDS_OP_EXPORTDIR);
+  MDRequestRef mdr = mds->mdcache->request_start_internal(CEPH_MDS_OP_EXPORTDIR);
   mdr->more()->export_dir = dir;
 
   assert(export_state.count(dir) == 0);
@@ -758,7 +758,7 @@ void Migrator::export_dir(CDir *dir, int dest)
   dispatch_export_dir(mdr);
 }
 
-void Migrator::dispatch_export_dir(MDRequest *mdr)
+void Migrator::dispatch_export_dir(MDRequestRef& mdr)
 {
   dout(7) << "dispatch_export_dir " << *mdr << dendl;
   CDir *dir = mdr->more()->export_dir;
@@ -845,10 +845,11 @@ void Migrator::handle_export_discover_ack(MExportDirDiscoverAck *m)
   } else {
     assert(it->second.state == EXPORT_DISCOVERING);
     // release locks to avoid deadlock
-    MDRequest *mdr = dynamic_cast<MDRequest*>(it->second.mut);
+    MDRequestRef mdr = ceph::static_pointer_cast<MDRequestImpl,
+						 MutationImpl>(it->second.mut);
     assert(mdr);
     mds->mdcache->request_finish(mdr);
-    it->second.mut = NULL;
+    it->second.mut.reset();
     // freeze the subtree
     it->second.state = EXPORT_FREEZING;
     dir->auth_unpin(this);
@@ -921,7 +922,7 @@ void Migrator::export_frozen(CDir *dir)
     return;
   }
 
-  it->second.mut = new Mutation;
+  it->second.mut = MutationRef(new MutationImpl);
   if (diri->is_auth())
     it->second.mut->auth_pin(diri);
   mds->locker->rdlock_take_set(rdlocks, it->second.mut);
@@ -1800,11 +1801,10 @@ void Migrator::export_finish(CDir *dir)
   mds->queue_waiters(it->second.waiting_for_finish);
 
   // unpin path
-  Mutation *mut = it->second.mut;
+  MutationRef& mut = it->second.mut;
   if (mut) {
-    mds->locker->drop_locks(mut);
+    mds->locker->drop_locks(mut.get());
     mut->cleanup();
-    delete mut;
   }
 
   export_state.erase(it);
@@ -1871,7 +1871,8 @@ void Migrator::handle_export_discover(MExportDirDiscover *m)
     // must discover it!
     filepath fpath(m->get_path());
     vector<CDentry*> trace;
-    int r = cache->path_traverse(NULL, m, NULL, fpath, &trace, NULL, MDS_TRAVERSE_DISCOVER);
+    MDRequestRef null_ref;
+    int r = cache->path_traverse(null_ref, m, NULL, fpath, &trace, NULL, MDS_TRAVERSE_DISCOVER);
     if (r > 0) return;
     if (r < 0) {
       dout(7) << "handle_export_discover_2 failed to discover or not dir " << m->get_path() << ", NAK" << dendl;
@@ -2117,7 +2118,7 @@ void Migrator::handle_export_prep(MExportDirPrep *m)
   bool success = true;
   if (dir->get_inode()->filelock.can_wrlock(-1) &&
       dir->get_inode()->nestlock.can_wrlock(-1)) {
-    it->second.mut = new Mutation;
+    it->second.mut = MutationRef(new MutationImpl);
     // force some locks.  hacky.
     mds->locker->wrlock_force(&dir->inode->filelock, it->second.mut);
     mds->locker->wrlock_force(&dir->inode->nestlock, it->second.mut);
@@ -2472,9 +2473,8 @@ void Migrator::import_reverse_final(CDir *dir)
   // clean up
   map<dirfrag_t, import_state_t>::iterator it = import_state.find(dir->dirfrag());
   if (it->second.mut) {
-    mds->locker->drop_locks(it->second.mut);
+    mds->locker->drop_locks(it->second.mut.get());
     it->second.mut->cleanup();
-    delete it->second.mut;
   }
   import_state.erase(it);
 
@@ -2574,7 +2574,6 @@ void Migrator::import_finish(CDir *dir, bool notify, bool last)
 	assert(session);
 	Capability *cap = in->get_client_cap(q->first);
 	assert(cap);
-	cap->clear_new();
 	cap->merge(q->second, true);
 	mds->mdcache->do_cap_import(session, in, cap, q->second.cap_id, q->second.seq,
 				    q->second.mseq - 1, it->second.peer, CEPH_CAP_FLAG_AUTH);
@@ -2618,7 +2617,7 @@ void Migrator::import_finish(CDir *dir, bool notify, bool last)
   it->second.peer_exports.swap(peer_exports);
 
   // clear import state (we're done!)
-  Mutation *mut = it->second.mut;
+  MutationRef mut = it->second.mut;
   import_state.erase(it);
 
   mds->mdlog->start_submit_entry(new EImportFinish(dir, true));
@@ -2636,9 +2635,8 @@ void Migrator::import_finish(CDir *dir, bool notify, bool last)
   //audit();  // this fails, bc we munge up the subtree map during handle_import_map (resolve phase)
 
   if (mut) {
-    mds->locker->drop_locks(mut);
+    mds->locker->drop_locks(mut.get());
     mut->cleanup();
-    delete mut;
   }
 
   // re-eval imported caps
@@ -2994,10 +2992,10 @@ void Migrator::handle_export_caps(MExportCaps *ex)
   
   assert(in);
   assert(in->is_auth());
-  /*
-   * note: i may be frozen, but i won't have been encoded for export (yet)!
-   *  see export_go() vs export_go_synced().
-   */
+
+  // FIXME
+  if (in->is_frozen())
+    return;
 
   C_M_LoggedImportCaps *finish = new C_M_LoggedImportCaps(this, in, ex->get_source().num());
   finish->client_map = ex->client_map;
@@ -3027,6 +3025,8 @@ void Migrator::logged_import_caps(CInode *in,
 				  map<client_t,uint64_t>& sseqmap) 
 {
   dout(10) << "logged_import_caps on " << *in << dendl;
+  // see export_go() vs export_go_synced()
+  assert(in->is_auth());
 
   // force open client sessions and finish cap import
   mds->server->finish_force_open_sessions(client_map, sseqmap);
diff --git a/src/mds/Migrator.h b/src/mds/Migrator.h
index d3a2129..0819c82 100644
--- a/src/mds/Migrator.h
+++ b/src/mds/Migrator.h
@@ -48,8 +48,6 @@ class MExportCapsAck;
 
 class EImportStart;
 
-struct Mutation;
-
 class Migrator {
 private:
   MDS *mds;
@@ -91,12 +89,12 @@ protected:
     set<int> notify_ack_waiting;
     map<inodeno_t,map<client_t,Capability::Import> > peer_imported;
     list<Context*> waiting_for_finish;
-    Mutation *mut;
+    MutationRef mut;
     // for freeze tree deadlock detection
     utime_t last_cum_auth_pins_change;
     int last_cum_auth_pins;
     int num_remote_waiters; // number of remote authpin waiters
-    export_state_t() : state(0), peer(0), tid(0), mut(NULL), 
+    export_state_t() : state(0), peer(0), tid(0), mut(), 
 		       last_cum_auth_pins(0), num_remote_waiters(0) {}
   };
 
@@ -140,8 +138,8 @@ protected:
     list<ScatterLock*> updated_scatterlocks;
     map<client_t,entity_inst_t> client_map;
     map<CInode*, map<client_t,Capability::Export> > peer_exports;
-    Mutation *mut;
-    import_state_t() : state(0), peer(0), tid(0), mut(NULL) {}
+    MutationRef mut;
+    import_state_t() : state(0), peer(0), tid(0), mut() {}
   };
 
   map<dirfrag_t, import_state_t>  import_state;
@@ -228,7 +226,7 @@ public:
   // -- import/export --
   // exporter
  public:
-  void dispatch_export_dir(MDRequest *mdr);
+  void dispatch_export_dir(MDRequestRef& mdr);
   void export_dir(CDir *dir, int dest);
   void export_empty_import(CDir *dir);
 
diff --git a/src/mds/Mutation.cc b/src/mds/Mutation.cc
index 0bf5475..ffeb066 100644
--- a/src/mds/Mutation.cc
+++ b/src/mds/Mutation.cc
@@ -20,9 +20,9 @@
 #include "messages/MMDSSlaveRequest.h"
 
 
-// Mutation
+// MutationImpl
 
-void Mutation::pin(MDSCacheObject *o)
+void MutationImpl::pin(MDSCacheObject *o)
 {
   if (pins.count(o) == 0) {
     o->get(MDSCacheObject::PIN_REQUEST);
@@ -30,14 +30,14 @@ void Mutation::pin(MDSCacheObject *o)
   }      
 }
 
-void Mutation::unpin(MDSCacheObject *o)
+void MutationImpl::unpin(MDSCacheObject *o)
 {
   assert(pins.count(o));
   o->put(MDSCacheObject::PIN_REQUEST);
   pins.erase(o);
 }
 
-void Mutation::set_stickydirs(CInode *in)
+void MutationImpl::set_stickydirs(CInode *in)
 {
   if (stickydirs.count(in) == 0) {
     in->get_stickydirs();
@@ -45,7 +45,7 @@ void Mutation::set_stickydirs(CInode *in)
   }
 }
 
-void Mutation::drop_pins()
+void MutationImpl::drop_pins()
 {
   for (set<MDSCacheObject*>::iterator it = pins.begin();
        it != pins.end();
@@ -54,7 +54,7 @@ void Mutation::drop_pins()
   pins.clear();
 }
 
-void Mutation::start_locking(SimpleLock *lock, int target)
+void MutationImpl::start_locking(SimpleLock *lock, int target)
 {
   assert(locking == NULL);
   pin(lock->get_parent());
@@ -62,7 +62,7 @@ void Mutation::start_locking(SimpleLock *lock, int target)
   locking_target_mds = target;
 }
 
-void Mutation::finish_locking(SimpleLock *lock)
+void MutationImpl::finish_locking(SimpleLock *lock)
 {
   assert(locking == lock);
   locking = NULL;
@@ -71,12 +71,12 @@ void Mutation::finish_locking(SimpleLock *lock)
 
 
 // auth pins
-bool Mutation::is_auth_pinned(MDSCacheObject *object)
+bool MutationImpl::is_auth_pinned(MDSCacheObject *object)
 { 
   return auth_pins.count(object) || remote_auth_pins.count(object); 
 }
 
-void Mutation::auth_pin(MDSCacheObject *object)
+void MutationImpl::auth_pin(MDSCacheObject *object)
 {
   if (!is_auth_pinned(object)) {
     object->auth_pin(this);
@@ -84,14 +84,14 @@ void Mutation::auth_pin(MDSCacheObject *object)
   }
 }
 
-void Mutation::auth_unpin(MDSCacheObject *object)
+void MutationImpl::auth_unpin(MDSCacheObject *object)
 {
   assert(auth_pins.count(object));
   object->auth_unpin(this);
   auth_pins.erase(object);
 }
 
-void Mutation::drop_local_auth_pins()
+void MutationImpl::drop_local_auth_pins()
 {
   for (set<MDSCacheObject*>::iterator it = auth_pins.begin();
        it != auth_pins.end();
@@ -102,12 +102,12 @@ void Mutation::drop_local_auth_pins()
   auth_pins.clear();
 }
 
-void Mutation::add_projected_inode(CInode *in)
+void MutationImpl::add_projected_inode(CInode *in)
 {
   projected_inodes.push_back(in);
 }
 
-void Mutation::pop_and_dirty_projected_inodes()
+void MutationImpl::pop_and_dirty_projected_inodes()
 {
   while (!projected_inodes.empty()) {
     CInode *in = projected_inodes.front();
@@ -116,12 +116,12 @@ void Mutation::pop_and_dirty_projected_inodes()
   }
 }
 
-void Mutation::add_projected_fnode(CDir *dir)
+void MutationImpl::add_projected_fnode(CDir *dir)
 {
   projected_fnodes.push_back(dir);
 }
 
-void Mutation::pop_and_dirty_projected_fnodes()
+void MutationImpl::pop_and_dirty_projected_fnodes()
 {
   while (!projected_fnodes.empty()) {
     CDir *dir = projected_fnodes.front();
@@ -130,24 +130,24 @@ void Mutation::pop_and_dirty_projected_fnodes()
   }
 }
 
-void Mutation::add_updated_lock(ScatterLock *lock)
+void MutationImpl::add_updated_lock(ScatterLock *lock)
 {
   updated_locks.push_back(lock);
 }
 
-void Mutation::add_cow_inode(CInode *in)
+void MutationImpl::add_cow_inode(CInode *in)
 {
   pin(in);
   dirty_cow_inodes.push_back(in);
 }
 
-void Mutation::add_cow_dentry(CDentry *dn)
+void MutationImpl::add_cow_dentry(CDentry *dn)
 {
   pin(dn);
   dirty_cow_dentries.push_back(pair<CDentry*,version_t>(dn, dn->get_projected_version()));
 }
 
-void Mutation::apply()
+void MutationImpl::apply()
 {
   pop_and_dirty_projected_inodes();
   pop_and_dirty_projected_fnodes();
@@ -167,16 +167,16 @@ void Mutation::apply()
     (*p)->mark_dirty();
 }
 
-void Mutation::cleanup()
+void MutationImpl::cleanup()
 {
   drop_local_auth_pins();
   drop_pins();
 }
 
 
-// MDRequest
+// MDRequestImpl
 
-MDRequest::~MDRequest()
+MDRequestImpl::~MDRequestImpl()
 {
   if (client_request)
     client_request->put();
@@ -185,34 +185,34 @@ MDRequest::~MDRequest()
   delete _more;
 }
 
-MDRequest::More* MDRequest::more()
+MDRequestImpl::More* MDRequestImpl::more()
 { 
   if (!_more)
     _more = new More();
   return _more;
 }
 
-bool MDRequest::has_more()
+bool MDRequestImpl::has_more()
 {
   return _more;
 }
 
-bool MDRequest::are_slaves()
+bool MDRequestImpl::are_slaves()
 {
   return _more && !_more->slaves.empty();
 }
 
-bool MDRequest::slave_did_prepare()
+bool MDRequestImpl::slave_did_prepare()
 {
   return more()->slave_commit;
 }
 
-bool MDRequest::did_ino_allocation()
+bool MDRequestImpl::did_ino_allocation()
 {
   return alloc_ino || used_prealloc_ino || prealloc_inos.size();
 }      
 
-bool MDRequest::freeze_auth_pin(CInode *inode)
+bool MDRequestImpl::freeze_auth_pin(CInode *inode)
 {
   assert(!more()->rename_inode || more()->rename_inode == inode);
   more()->rename_inode = inode;
@@ -226,7 +226,7 @@ bool MDRequest::freeze_auth_pin(CInode *inode)
   return true;
 }
 
-void MDRequest::unfreeze_auth_pin(bool clear_inode)
+void MDRequestImpl::unfreeze_auth_pin(bool clear_inode)
 {
   assert(more()->is_freeze_authpin);
   CInode *inode = more()->rename_inode;
@@ -239,14 +239,13 @@ void MDRequest::unfreeze_auth_pin(bool clear_inode)
     more()->rename_inode = NULL;
 }
 
-void MDRequest::set_remote_frozen_auth_pin(CInode *inode)
+void MDRequestImpl::set_remote_frozen_auth_pin(CInode *inode)
 {
-  assert(!more()->rename_inode || more()->rename_inode == inode);
   more()->rename_inode = inode;
   more()->is_remote_frozen_authpin = true;
 }
 
-void MDRequest::set_ambiguous_auth(CInode *inode)
+void MDRequestImpl::set_ambiguous_auth(CInode *inode)
 {
   assert(!more()->rename_inode || more()->rename_inode == inode);
   assert(!more()->is_ambiguous_auth);
@@ -256,7 +255,7 @@ void MDRequest::set_ambiguous_auth(CInode *inode)
   more()->is_ambiguous_auth = true;
 }
 
-void MDRequest::clear_ambiguous_auth()
+void MDRequestImpl::clear_ambiguous_auth()
 {
   CInode *inode = more()->rename_inode;
   assert(inode && more()->is_ambiguous_auth);
@@ -264,7 +263,7 @@ void MDRequest::clear_ambiguous_auth()
   more()->is_ambiguous_auth = false;
 }
 
-bool MDRequest::can_auth_pin(MDSCacheObject *object)
+bool MDRequestImpl::can_auth_pin(MDSCacheObject *object)
 {
   return object->can_auth_pin() ||
          (is_auth_pinned(object) && has_more() &&
@@ -272,14 +271,14 @@ bool MDRequest::can_auth_pin(MDSCacheObject *object)
 	  more()->rename_inode == object);
 }
 
-void MDRequest::drop_local_auth_pins()
+void MDRequestImpl::drop_local_auth_pins()
 {
   if (has_more() && more()->is_freeze_authpin)
     unfreeze_auth_pin(true);
-  Mutation::drop_local_auth_pins();
+  MutationImpl::drop_local_auth_pins();
 }
 
-void MDRequest::print(ostream &out)
+void MDRequestImpl::print(ostream &out)
 {
   out << "request(" << reqid;
   //if (request) out << " " << *request;
diff --git a/src/mds/Mutation.h b/src/mds/Mutation.h
index 206b71a..dbfbe75 100644
--- a/src/mds/Mutation.h
+++ b/src/mds/Mutation.h
@@ -33,7 +33,7 @@ class ScatterLock;
 class MClientRequest;
 class MMDSSlaveRequest;
 
-struct Mutation {
+struct MutationImpl {
   metareqid_t reqid;
   __u32 attempt;      // which attempt for this request
   LogSegment *ls;  // the log segment i'm committing to
@@ -78,21 +78,21 @@ struct Mutation {
   list<CInode*> dirty_cow_inodes;
   list<pair<CDentry*,version_t> > dirty_cow_dentries;
 
-  Mutation()
+  MutationImpl()
     : attempt(0),
       ls(0),
       slave_to_mds(-1),
       locking(NULL),
       locking_target_mds(-1),
       done_locking(false), committing(false), aborted(false), killed(false) { }
-  Mutation(metareqid_t ri, __u32 att=0, int slave_to=-1)
+  MutationImpl(metareqid_t ri, __u32 att=0, int slave_to=-1)
     : reqid(ri), attempt(att),
       ls(0),
       slave_to_mds(slave_to), 
       locking(NULL),
       locking_target_mds(-1),
       done_locking(false), committing(false), aborted(false), killed(false) { }
-  virtual ~Mutation() {
+  virtual ~MutationImpl() {
     assert(locking == NULL);
     assert(pins.empty());
     assert(auth_pins.empty());
@@ -140,13 +140,13 @@ struct Mutation {
   }
 };
 
-inline ostream& operator<<(ostream& out, Mutation &mut)
+inline ostream& operator<<(ostream& out, MutationImpl &mut)
 {
   mut.print(out);
   return out;
 }
 
-
+typedef ceph::shared_ptr<MutationImpl> MutationRef;
 
 
 
@@ -155,10 +155,9 @@ inline ostream& operator<<(ostream& out, Mutation &mut)
  * mostly information about locks held, so that we can drop them all
  * the request is finished or forwarded.  see request_*().
  */
-struct MDRequest : public Mutation {
-  int ref;
+struct MDRequestImpl : public MutationImpl {
   Session *session;
-  elist<MDRequest*>::item item_session_request;  // if not on list, op is aborted.
+  elist<MDRequestImpl*>::item item_session_request;  // if not on list, op is aborted.
 
   // -- i am a client (master) request
   MClientRequest *client_request; // client request (if any)
@@ -251,8 +250,7 @@ struct MDRequest : public Mutation {
 
 
   // ---------------------------------------------------
-  MDRequest() : 
-    ref(1),
+  MDRequestImpl() :
     session(0), item_session_request(this),
     client_request(0), straydn(NULL), snapid(CEPH_NOSNAP), tracei(0), tracedn(0),
     alloc_ino(0), used_prealloc_ino(0), snap_caps(0), did_early_reply(false),
@@ -265,9 +263,8 @@ struct MDRequest : public Mutation {
     _more(0) {
     in[0] = in[1] = 0; 
   }
-  MDRequest(metareqid_t ri, __u32 attempt, MClientRequest *req) : 
-    Mutation(ri, attempt),
-    ref(1),
+  MDRequestImpl(metareqid_t ri, __u32 attempt, MClientRequest *req) :
+    MutationImpl(ri, attempt),
     session(0), item_session_request(this),
     client_request(req), straydn(NULL), snapid(CEPH_NOSNAP), tracei(0), tracedn(0),
     alloc_ino(0), used_prealloc_ino(0), snap_caps(0), did_early_reply(false),
@@ -280,9 +277,8 @@ struct MDRequest : public Mutation {
     _more(0) {
     in[0] = in[1] = 0; 
   }
-  MDRequest(metareqid_t ri, __u32 attempt, int by) : 
-    Mutation(ri, attempt, by),
-    ref(1),
+  MDRequestImpl(metareqid_t ri, __u32 attempt, int by) :
+    MutationImpl(ri, attempt, by),
     session(0), item_session_request(this),
     client_request(0), straydn(NULL), snapid(CEPH_NOSNAP), tracei(0), tracedn(0),
     alloc_ino(0), used_prealloc_ino(0), snap_caps(0), did_early_reply(false),
@@ -295,16 +291,7 @@ struct MDRequest : public Mutation {
     _more(0) {
     in[0] = in[1] = 0; 
   }
-  ~MDRequest();
-
-  MDRequest *get() {
-    ++ref;
-    return this;
-  }
-  void put() {
-    if (--ref == 0)
-      delete this;
-  }
+  ~MDRequestImpl();
   
   More* more();
   bool has_more();
@@ -322,6 +309,8 @@ struct MDRequest : public Mutation {
   void print(ostream &out);
 };
 
+typedef ceph::shared_ptr<MDRequestImpl> MDRequestRef;
+
 
 struct MDSlaveUpdate {
   int origop;
diff --git a/src/mds/Server.cc b/src/mds/Server.cc
index 984ad29..64004b2 100644
--- a/src/mds/Server.cc
+++ b/src/mds/Server.cc
@@ -53,6 +53,7 @@
 #include "events/ECommitted.h"
 
 #include "include/filepath.h"
+#include "common/errno.h"
 #include "common/Timer.h"
 #include "common/perf_counters.h"
 #include "include/compat.h"
@@ -544,10 +545,11 @@ void Server::journal_close_session(Session *session, int state)
   mdlog->flush();
 
   // clean up requests, too
-  elist<MDRequest*>::iterator p = session->requests.begin(member_offset(MDRequest,
-									item_session_request));
+  elist<MDRequestImpl*>::iterator p =
+    session->requests.begin(member_offset(MDRequestImpl,
+					  item_session_request));
   while (!p.end()) {
-    MDRequest *mdr = *p;
+    MDRequestRef mdr = mdcache->request_get((*p)->reqid);
     ++p;
     mdcache->request_kill(mdr);
   }
@@ -779,8 +781,7 @@ void Server::recall_client_state(float ratio)
 /*******
  * some generic stuff for finishing off requests
  */
-/* This function takes responsibility for the passed mdr*/
-void Server::journal_and_reply(MDRequest *mdr, CInode *in, CDentry *dn, LogEvent *le, Context *fin)
+void Server::journal_and_reply(MDRequestRef& mdr, CInode *in, CDentry *dn, LogEvent *le, Context *fin)
 {
   dout(10) << "journal_and_reply tracei " << in << " tracedn " << dn << dendl;
 
@@ -806,7 +807,7 @@ void Server::journal_and_reply(MDRequest *mdr, CInode *in, CDentry *dn, LogEvent
       mdlog->flush();
     }
   } else if (mdr->did_early_reply)
-    mds->locker->drop_rdlocks(mdr);
+    mds->locker->drop_rdlocks(mdr.get());
   else
     mdlog->flush();
 }
@@ -814,12 +815,12 @@ void Server::journal_and_reply(MDRequest *mdr, CInode *in, CDentry *dn, LogEvent
 /*
  * send generic response (just an error code), clean up mdr
  */
-void Server::reply_request(MDRequest *mdr, int r, CInode *tracei, CDentry *tracedn)
+void Server::reply_request(MDRequestRef& mdr, int r, CInode *tracei, CDentry *tracedn)
 {
   reply_request(mdr, new MClientReply(mdr->client_request, r), tracei, tracedn);
 }
 
-void Server::early_reply(MDRequest *mdr, CInode *tracei, CDentry *tracedn)
+void Server::early_reply(MDRequestRef& mdr, CInode *tracei, CDentry *tracedn)
 {
   if (!g_conf->mds_early_reply)
     return;
@@ -855,11 +856,10 @@ void Server::early_reply(MDRequest *mdr, CInode *tracei, CDentry *tracedn)
   //_rename_finish() does not send dentry link/unlink message to replicas.
   // so do not set xlocks on dentries "done", the xlocks prevent dentries
   // that have projected linkages from getting new replica.
-  mds->locker->set_xlocks_done(mdr, mdr->client_request->get_op() == CEPH_MDS_OP_RENAME);
+  mds->locker->set_xlocks_done(mdr.get(), mdr->client_request->get_op() == CEPH_MDS_OP_RENAME);
 
-  char buf[80];
   dout(10) << "early_reply " << reply->get_result() 
-	   << " (" << strerror_r(-reply->get_result(), buf, sizeof(buf))
+	   << " (" << cpp_strerror(reply->get_result())
 	   << ") " << *req << dendl;
 
   if (tracei || tracedn) {
@@ -889,14 +889,13 @@ void Server::early_reply(MDRequest *mdr, CInode *tracei, CDentry *tracedn)
  * include a trace to tracei
  * Clean up mdr
  */
-void Server::reply_request(MDRequest *mdr, MClientReply *reply, CInode *tracei, CDentry *tracedn) 
+void Server::reply_request(MDRequestRef& mdr, MClientReply *reply, CInode *tracei, CDentry *tracedn)
 {
-  assert(mdr);
+  assert(mdr.get());
   MClientRequest *req = mdr->client_request;
   
-  char buf[80];
   dout(10) << "reply_request " << reply->get_result() 
-	   << " (" << strerror_r(-reply->get_result(), buf, sizeof(buf))
+	   << " (" << cpp_strerror(reply->get_result())
 	   << ") " << *req << dendl;
 
   // note successful request in session map?
@@ -963,8 +962,6 @@ void Server::reply_request(MDRequest *mdr, MClientReply *reply, CInode *tracei,
   
   // clean up request
   mdcache->request_finish(mdr);
-  mdr = 0;
-  req = 0;
 
   // take a closer look at tracei, if it happens to be a remote link
   if (tracei && 
@@ -1010,7 +1007,7 @@ void Server::set_trace_dist(Session *session, MClientReply *reply,
 			    CInode *in, CDentry *dn,
 			    snapid_t snapid,
 			    int dentry_wanted,
-			    MDRequest *mdr)
+			    MDRequestRef& mdr)
 {
   // skip doing this for debugging purposes?
   if (g_conf->mds_inject_traceless_reply_probability &&
@@ -1151,8 +1148,8 @@ void Server::handle_client_request(MClientRequest *req)
   }
 
   // register + dispatch
-  MDRequest *mdr = mdcache->request_start(req);
-  if (mdr) {
+  MDRequestRef mdr = mdcache->request_start(req);
+  if (mdr.get()) {
     if (session) {
       mdr->session = session;
       session->requests.push_back(&mdr->item_session_request);
@@ -1170,13 +1167,12 @@ void Server::handle_client_request(MClientRequest *req)
     req->releases.clear();
   }
 
-  if (mdr)
+  if (mdr.get())
     dispatch_client_request(mdr);
   return;
 }
 
-/* This function takes responsibility for the passed mdr*/
-void Server::dispatch_client_request(MDRequest *mdr)
+void Server::dispatch_client_request(MDRequestRef& mdr)
 {
   MClientRequest *req = mdr->client_request;
 
@@ -1326,7 +1322,7 @@ void Server::handle_slave_request(MMDSSlaveRequest *m)
   }
 
   // am i a new slave?
-  MDRequest *mdr = NULL;
+  MDRequestRef mdr;
   if (mdcache->have_request(m->get_reqid())) {
     // existing?
     mdr = mdcache->request_get(m->get_reqid());
@@ -1345,14 +1341,14 @@ void Server::handle_slave_request(MMDSSlaveRequest *m)
       dout(10) << "local request " << *mdr << " attempt " << mdr->attempt << " < " << m->get_attempt()
 	       << ", closing out" << dendl;
       mdcache->request_finish(mdr);
-      mdr = NULL;
+      mdr.reset();
     } else if (mdr->slave_to_mds != from) {
       dout(10) << "local request " << *mdr << " not slave to mds." << from << dendl;
       m->put();
       return;
     }
   }
-  if (!mdr) {
+  if (!mdr.get()) {
     // new?
     if (m->get_op() == MMDSSlaveRequest::OP_FINISH) {
       dout(10) << "missing slave request for " << m->get_reqid() 
@@ -1403,8 +1399,8 @@ void Server::handle_slave_request_reply(MMDSSlaveRequest *m)
     return;
   }
 
-  MDRequest *mdr = mdcache->request_get(m->get_reqid());
-  if (!mdr) {
+  MDRequestRef mdr = mdcache->request_get(m->get_reqid());
+  if (!mdr.get()) {
     dout(10) << "handle_slave_request_reply ignoring reply from unknown reqid " << m->get_reqid() << dendl;
     m->put();
     return;
@@ -1483,7 +1479,7 @@ void Server::handle_slave_request_reply(MMDSSlaveRequest *m)
 }
 
 /* This function DOES put the mdr->slave_request before returning*/
-void Server::dispatch_slave_request(MDRequest *mdr)
+void Server::dispatch_slave_request(MDRequestRef& mdr)
 {
   dout(7) << "dispatch_slave_request " << *mdr << " " << *mdr->slave_request << dendl;
 
@@ -1554,10 +1550,10 @@ void Server::dispatch_slave_request(MDRequest *mdr)
       bool need_issue = false;
       switch (op) {
       case MMDSSlaveRequest::OP_UNXLOCK:
-	mds->locker->xlock_finish(lock, mdr, &need_issue);
+	mds->locker->xlock_finish(lock, mdr.get(), &need_issue);
 	break;
       case MMDSSlaveRequest::OP_UNWRLOCK:
-	mds->locker->wrlock_finish(lock, mdr, &need_issue);
+	mds->locker->wrlock_finish(lock, mdr.get(), &need_issue);
 	break;
       }
       if (need_issue)
@@ -1570,7 +1566,7 @@ void Server::dispatch_slave_request(MDRequest *mdr)
     break;
 
   case MMDSSlaveRequest::OP_DROPLOCKS:
-    mds->locker->drop_locks(mdr);
+    mds->locker->drop_locks(mdr.get());
     mdr->slave_request->put();
     mdr->slave_request = 0;
     break;
@@ -1606,7 +1602,7 @@ void Server::dispatch_slave_request(MDRequest *mdr)
 }
 
 /* This function DOES put the mdr->slave_request before returning*/
-void Server::handle_slave_auth_pin(MDRequest *mdr)
+void Server::handle_slave_auth_pin(MDRequestRef& mdr)
 {
   dout(10) << "handle_slave_auth_pin " << *mdr << dendl;
 
@@ -1735,7 +1731,7 @@ void Server::handle_slave_auth_pin(MDRequest *mdr)
 }
 
 /* This function DOES NOT put the passed ack before returning*/
-void Server::handle_slave_auth_pin_ack(MDRequest *mdr, MMDSSlaveRequest *ack)
+void Server::handle_slave_auth_pin_ack(MDRequestRef& mdr, MMDSSlaveRequest *ack)
 {
   dout(10) << "handle_slave_auth_pin_ack on " << *mdr << " " << *ack << dendl;
   int from = ack->get_source().num();
@@ -1796,7 +1792,7 @@ void Server::handle_slave_auth_pin_ack(MDRequest *mdr, MMDSSlaveRequest *ack)
  * verify that the dir exists and would own the dname.
  * do not check if the dentry exists.
  */
-CDir *Server::validate_dentry_dir(MDRequest *mdr, CInode *diri, const string& dname)
+CDir *Server::validate_dentry_dir(MDRequestRef& mdr, CInode *diri, const string& dname)
 {
   // make sure parent is a dir?
   if (!diri->is_dir()) {
@@ -1826,7 +1822,7 @@ CDir *Server::validate_dentry_dir(MDRequest *mdr, CInode *diri, const string& dn
  * prepare a null (or existing) dentry in given dir. 
  * wait for any dn lock.
  */
-CDentry* Server::prepare_null_dentry(MDRequest *mdr, CDir *dir, const string& dname, bool okexist)
+CDentry* Server::prepare_null_dentry(MDRequestRef& mdr, CDir *dir, const string& dname, bool okexist)
 {
   dout(10) << "prepare_null_dentry " << dname << " in " << *dir << dendl;
   assert(dir->is_auth());
@@ -1869,7 +1865,7 @@ CDentry* Server::prepare_null_dentry(MDRequest *mdr, CDir *dir, const string& dn
   return dn;
 }
 
-CDentry* Server::prepare_stray_dentry(MDRequest *mdr, CInode *in)
+CDentry* Server::prepare_stray_dentry(MDRequestRef& mdr, CInode *in)
 {
   CDentry *straydn = mdr->straydn;
   if (straydn) {
@@ -1892,7 +1888,7 @@ CDentry* Server::prepare_stray_dentry(MDRequest *mdr, CInode *in)
  *
  * create a new inode.  set c/m/atime.  hit dir pop.
  */
-CInode* Server::prepare_new_inode(MDRequest *mdr, CDir *dir, inodeno_t useino, unsigned mode,
+CInode* Server::prepare_new_inode(MDRequestRef& mdr, CDir *dir, inodeno_t useino, unsigned mode,
 				  ceph_file_layout *layout) 
 {
   CInode *in = new CInode(mdcache);
@@ -1987,7 +1983,7 @@ CInode* Server::prepare_new_inode(MDRequest *mdr, CDir *dir, inodeno_t useino, u
   return in;
 }
 
-void Server::journal_allocated_inos(MDRequest *mdr, EMetaBlob *blob)
+void Server::journal_allocated_inos(MDRequestRef& mdr, EMetaBlob *blob)
 {
   dout(20) << "journal_allocated_inos sessionmapv " << mds->sessionmap.projected
 	   << " inotablev " << mds->inotable->get_projected_version()
@@ -2000,7 +1996,7 @@ void Server::journal_allocated_inos(MDRequest *mdr, EMetaBlob *blob)
 		      mds->inotable->get_projected_version());
 }
 
-void Server::apply_allocated_inos(MDRequest *mdr)
+void Server::apply_allocated_inos(MDRequestRef& mdr)
 {
   Session *session = mdr->session;
   dout(10) << "apply_allocated_inos " << mdr->alloc_ino
@@ -2026,7 +2022,7 @@ void Server::apply_allocated_inos(MDRequest *mdr)
 
 
 
-CDir *Server::traverse_to_auth_dir(MDRequest *mdr, vector<CDentry*> &trace, filepath refpath)
+CDir *Server::traverse_to_auth_dir(MDRequestRef& mdr, vector<CDentry*> &trace, filepath refpath)
 {
   // figure parent dir vs dname
   if (refpath.depth() == 0) {
@@ -2059,23 +2055,20 @@ CDir *Server::traverse_to_auth_dir(MDRequest *mdr, vector<CDentry*> &trace, file
 
 class C_MDS_TryFindInode : public Context {
   Server *server;
-  MDRequest *mdr;
+  MDRequestRef mdr;
 public:
-  C_MDS_TryFindInode(Server *s, MDRequest *r) : server(s), mdr(r) {
-    mdr->get();
-  }
+  C_MDS_TryFindInode(Server *s, MDRequestRef& r) : server(s), mdr(r) {}
   virtual void finish(int r) {
     if (r == -ESTALE) // :( find_ino_peers failed
       server->reply_request(mdr, r);
     else
       server->dispatch_client_request(mdr);
-    mdr->put();
   }
 };
 
 /* If this returns null, the request has been handled
  * as appropriate: forwarded on, or the client's been replied to */
-CInode* Server::rdlock_path_pin_ref(MDRequest *mdr, int n,
+CInode* Server::rdlock_path_pin_ref(MDRequestRef& mdr, int n,
 				    set<SimpleLock*> &rdlocks,
 				    bool want_auth,
 				    bool no_want_auth, /* for readdir, who doesn't want auth _even_if_ it's
@@ -2144,7 +2137,7 @@ CInode* Server::rdlock_path_pin_ref(MDRequest *mdr, int n,
        * a single MDS request; otherwise we'd be in
        * rdlock_path_xlock_dentry.
        */
-      mds->locker->drop_locks(mdr, NULL);
+      mds->locker->drop_locks(mdr.get(), NULL);
       mdr->drop_local_auth_pins();
       return 0;
     }
@@ -2171,7 +2164,7 @@ CInode* Server::rdlock_path_pin_ref(MDRequest *mdr, int n,
  * create null dentry in place (or use existing if okexist).
  * get rdlocks on traversed dentries, xlock on new dentry.
  */
-CDentry* Server::rdlock_path_xlock_dentry(MDRequest *mdr, int n,
+CDentry* Server::rdlock_path_xlock_dentry(MDRequestRef& mdr, int n,
 					  set<SimpleLock*>& rdlocks, set<SimpleLock*>& wrlocks, set<SimpleLock*>& xlocks,
 					  bool okexist, bool mustexist, bool alwaysxlock,
 					  ceph_file_layout **layout)
@@ -2275,7 +2268,7 @@ CDentry* Server::rdlock_path_xlock_dentry(MDRequest *mdr, int n,
  * @param mdr request
  * @returns the pointer, or NULL if it had to be delayed (but mdr is taken care of)
  */
-CDir* Server::try_open_auth_dirfrag(CInode *diri, frag_t fg, MDRequest *mdr)
+CDir* Server::try_open_auth_dirfrag(CInode *diri, frag_t fg, MDRequestRef& mdr)
 {
   CDir *dir = diri->get_dirfrag(fg);
 
@@ -2315,7 +2308,7 @@ CDir* Server::try_open_auth_dirfrag(CInode *diri, frag_t fg, MDRequest *mdr)
 // ===============================================================================
 // STAT
 
-void Server::handle_client_getattr(MDRequest *mdr, bool is_lookup)
+void Server::handle_client_getattr(MDRequestRef& mdr, bool is_lookup)
 {
   MClientRequest *req = mdr->client_request;
   set<SimpleLock*> rdlocks, wrlocks, xlocks;
@@ -2369,8 +2362,8 @@ void Server::handle_client_getattr(MDRequest *mdr, bool is_lookup)
 
 struct C_MDS_LookupIno2 : public Context {
   Server *server;
-  MDRequest *mdr;
-  C_MDS_LookupIno2(Server *s, MDRequest *r) : server(s), mdr(r) {}
+  MDRequestRef mdr;
+  C_MDS_LookupIno2(Server *s, MDRequestRef& r) : server(s), mdr(r) {}
   void finish(int r) {
     server->_lookup_ino_2(mdr, r);
   }
@@ -2380,7 +2373,8 @@ struct C_MDS_LookupIno2 : public Context {
 /*
  * filepath:  ino
  */
-void Server::handle_client_lookup_ino(MDRequest *mdr, bool want_parent, bool want_dentry)
+void Server::handle_client_lookup_ino(MDRequestRef& mdr,
+				      bool want_parent, bool want_dentry)
 {
   MClientRequest *req = mdr->client_request;
 
@@ -2431,10 +2425,10 @@ void Server::handle_client_lookup_ino(MDRequest *mdr, bool want_parent, bool wan
   }
 }
 
-void Server::_lookup_ino_2(MDRequest *mdr, int r)
+void Server::_lookup_ino_2(MDRequestRef& mdr, int r)
 {
   inodeno_t ino = mdr->client_request->get_filepath().get_ino();
-  dout(10) << "_lookup_ino_2 " << mdr << " ino " << ino << " r=" << r << dendl;
+  dout(10) << "_lookup_ino_2 " << mdr.get() << " ino " << ino << " r=" << r << dendl;
   if (r >= 0) {
     if (r == mds->get_nodeid())
       dispatch_client_request(mdr);
@@ -2451,7 +2445,7 @@ void Server::_lookup_ino_2(MDRequest *mdr, int r)
 
 
 /* This function takes responsibility for the passed mdr*/
-void Server::handle_client_open(MDRequest *mdr)
+void Server::handle_client_open(MDRequestRef& mdr)
 {
   MClientRequest *req = mdr->client_request;
 
@@ -2609,12 +2603,12 @@ void Server::handle_client_open(MDRequest *mdr)
 
 class C_MDS_openc_finish : public Context {
   MDS *mds;
-  MDRequest *mdr;
+  MDRequestRef mdr;
   CDentry *dn;
   CInode *newi;
   snapid_t follows;
 public:
-  C_MDS_openc_finish(MDS *m, MDRequest *r, CDentry *d, CInode *ni, snapid_t f) :
+  C_MDS_openc_finish(MDS *m, MDRequestRef& r, CDentry *d, CInode *ni, snapid_t f) :
     mds(m), mdr(r), dn(d), newi(ni), follows(f) {}
   void finish(int r) {
     assert(r == 0);
@@ -2643,7 +2637,7 @@ public:
 };
 
 /* This function takes responsibility for the passed mdr*/
-void Server::handle_client_openc(MDRequest *mdr)
+void Server::handle_client_openc(MDRequestRef& mdr)
 {
   MClientRequest *req = mdr->client_request;
   client_t client = mdr->get_client();
@@ -2802,7 +2796,7 @@ void Server::handle_client_openc(MDRequest *mdr)
 
 
 
-void Server::handle_client_readdir(MDRequest *mdr)
+void Server::handle_client_readdir(MDRequestRef& mdr)
 {
   MClientRequest *req = mdr->client_request;
   client_t client = req->get_source().num();
@@ -2847,7 +2841,7 @@ void Server::handle_client_readdir(MDRequest *mdr)
   if (!dir->is_complete()) {
     if (dir->is_frozen()) {
       dout(7) << "dir is frozen " << *dir << dendl;
-      mds->locker->drop_locks(mdr);
+      mds->locker->drop_locks(mdr.get());
       mdr->drop_local_auth_pins();
       dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
       return;
@@ -2952,7 +2946,7 @@ void Server::handle_client_readdir(MDRequest *mdr)
 	  break;
 	}
 
-	mds->locker->drop_locks(mdr);
+	mds->locker->drop_locks(mdr.get());
 	mdr->drop_local_auth_pins();
 	mdcache->open_remote_dentry(dn, dnp, new C_MDS_RetryRequest(mdcache, mdr));
 	return;
@@ -3030,11 +3024,11 @@ void Server::handle_client_readdir(MDRequest *mdr)
  */
 class C_MDS_inode_update_finish : public Context {
   MDS *mds;
-  MDRequest *mdr;
+  MDRequestRef mdr;
   CInode *in;
   bool truncating_smaller, changed_ranges;
 public:
-  C_MDS_inode_update_finish(MDS *m, MDRequest *r, CInode *i,
+  C_MDS_inode_update_finish(MDS *m, MDRequestRef& r, CInode *i,
 			    bool sm=false, bool cr=false) :
     mds(m), mdr(r), in(i), truncating_smaller(sm), changed_ranges(cr) { }
   void finish(int r) {
@@ -3059,7 +3053,7 @@ public:
   }
 };
 
-void Server::handle_client_file_setlock(MDRequest *mdr)
+void Server::handle_client_file_setlock(MDRequestRef& mdr)
 {
   MClientRequest *req = mdr->client_request;
   set<SimpleLock*> rdlocks, wrlocks, xlocks;
@@ -3140,7 +3134,7 @@ void Server::handle_client_file_setlock(MDRequest *mdr)
 	dout(10) << " added to waiting list" << dendl;
 	assert(lock_state->is_waiting(set_lock));
 	mdr->more()->flock_was_waiting = true;
-	mds->locker->drop_locks(mdr);
+	mds->locker->drop_locks(mdr.get());
 	mdr->drop_local_auth_pins();
 	cur->add_waiter(CInode::WAIT_FLOCK, new C_MDS_RetryRequest(mdcache, mdr));
       }
@@ -3150,7 +3144,7 @@ void Server::handle_client_file_setlock(MDRequest *mdr)
   dout(10) << " state after lock change: " << *lock_state << dendl;
 }
 
-void Server::handle_client_file_readlock(MDRequest *mdr)
+void Server::handle_client_file_readlock(MDRequestRef& mdr)
 {
   MClientRequest *req = mdr->client_request;
   set<SimpleLock*> rdlocks, wrlocks, xlocks;
@@ -3204,7 +3198,7 @@ void Server::handle_client_file_readlock(MDRequest *mdr)
   reply_request(mdr, reply);
 }
 
-void Server::handle_client_setattr(MDRequest *mdr)
+void Server::handle_client_setattr(MDRequestRef& mdr)
 {
   MClientRequest *req = mdr->client_request;
   set<SimpleLock*> rdlocks, wrlocks, xlocks;
@@ -3299,7 +3293,7 @@ void Server::handle_client_setattr(MDRequest *mdr)
   // log + wait
   le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
   mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY, false);
-  mdcache->journal_dirty_inode(mdr, &le->metablob, cur);
+  mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
   
   journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(mds, mdr, cur,
 								   truncating_smaller, changed_ranges));
@@ -3310,7 +3304,7 @@ void Server::handle_client_setattr(MDRequest *mdr)
 }
 
 /* Takes responsibility for mdr */
-void Server::do_open_truncate(MDRequest *mdr, int cmode)
+void Server::do_open_truncate(MDRequestRef& mdr, int cmode)
 {
   CInode *in = mdr->in[0];
   client_t client = mdr->get_client();
@@ -3318,6 +3312,9 @@ void Server::do_open_truncate(MDRequest *mdr, int cmode)
 
   dout(10) << "do_open_truncate " << *in << dendl;
 
+  SnapRealm *realm = in->find_snaprealm();
+  mds->locker->issue_new_caps(in, cmode, mdr->session, realm, mdr->client_request->is_replay());
+
   mdr->ls = mdlog->get_current_segment();
   EUpdate *le = new EUpdate(mdlog, "open_truncate");
   mdlog->start_entry(le);
@@ -3344,12 +3341,8 @@ void Server::do_open_truncate(MDRequest *mdr, int cmode)
   le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
 
   mdcache->predirty_journal_parents(mdr, &le->metablob, in, 0, PREDIRTY_PRIMARY, false);
-  mdcache->journal_dirty_inode(mdr, &le->metablob, in);
+  mdcache->journal_dirty_inode(mdr.get(), &le->metablob, in);
   
-  // do the open
-  SnapRealm *realm = in->find_snaprealm();
-  mds->locker->issue_new_caps(in, cmode, mdr->session, realm, mdr->client_request->is_replay());
-
   // make sure ino gets into the journal
   le->metablob.add_opened_ino(in->ino());
   LogSegment *ls = mds->mdlog->get_current_segment();
@@ -3369,7 +3362,7 @@ void Server::do_open_truncate(MDRequest *mdr, int cmode)
 
 
 /* This function cleans up the passed mdr */
-void Server::handle_client_setlayout(MDRequest *mdr)
+void Server::handle_client_setlayout(MDRequestRef& mdr)
 {
   MClientRequest *req = mdr->client_request;
   set<SimpleLock*> rdlocks, wrlocks, xlocks;
@@ -3443,12 +3436,12 @@ void Server::handle_client_setlayout(MDRequest *mdr)
   mdlog->start_entry(le);
   le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
   mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY, false);
-  mdcache->journal_dirty_inode(mdr, &le->metablob, cur);
+  mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
   
   journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(mds, mdr, cur));
 }
 
-void Server::handle_client_setdirlayout(MDRequest *mdr)
+void Server::handle_client_setdirlayout(MDRequestRef& mdr)
 {
   MClientRequest *req = mdr->client_request;
   set<SimpleLock*> rdlocks, wrlocks, xlocks;
@@ -3519,7 +3512,7 @@ void Server::handle_client_setdirlayout(MDRequest *mdr)
   mdlog->start_entry(le);
   le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
   mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY, false);
-  mdcache->journal_dirty_inode(mdr, &le->metablob, cur);
+  mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
 
   journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(mds, mdr, cur));
 }
@@ -3607,7 +3600,7 @@ int Server::parse_layout_vxattr(string name, string value, ceph_file_layout *lay
   return 0;
 }
 
-void Server::handle_set_vxattr(MDRequest *mdr, CInode *cur,
+void Server::handle_set_vxattr(MDRequestRef& mdr, CInode *cur,
 			       ceph_file_layout *dir_layout,
 			       set<SimpleLock*> rdlocks,
 			       set<SimpleLock*> wrlocks,
@@ -3708,7 +3701,7 @@ void Server::handle_set_vxattr(MDRequest *mdr, CInode *cur,
     mdlog->start_entry(le);
     le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
     mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY, false);
-    mdcache->journal_dirty_inode(mdr, &le->metablob, cur);
+    mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
 
     journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(mds, mdr, cur));
     return;
@@ -3718,7 +3711,7 @@ void Server::handle_set_vxattr(MDRequest *mdr, CInode *cur,
   reply_request(mdr, -EINVAL);
 }
 
-void Server::handle_remove_vxattr(MDRequest *mdr, CInode *cur,
+void Server::handle_remove_vxattr(MDRequestRef& mdr, CInode *cur,
 				  set<SimpleLock*> rdlocks,
 				  set<SimpleLock*> wrlocks,
 				  set<SimpleLock*> xlocks)
@@ -3755,7 +3748,7 @@ void Server::handle_remove_vxattr(MDRequest *mdr, CInode *cur,
     mdlog->start_entry(le);
     le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
     mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY, false);
-    mdcache->journal_dirty_inode(mdr, &le->metablob, cur);
+    mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
 
     journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(mds, mdr, cur));
     return;
@@ -3766,11 +3759,11 @@ void Server::handle_remove_vxattr(MDRequest *mdr, CInode *cur,
 
 class C_MDS_inode_xattr_update_finish : public Context {
   MDS *mds;
-  MDRequest *mdr;
+  MDRequestRef mdr;
   CInode *in;
 public:
 
-  C_MDS_inode_xattr_update_finish(MDS *m, MDRequest *r, CInode *i) :
+  C_MDS_inode_xattr_update_finish(MDS *m, MDRequestRef& r, CInode *i) :
     mds(m), mdr(r), in(i) { }
   void finish(int r) {
     assert(r == 0);
@@ -3786,7 +3779,7 @@ public:
   }
 };
 
-void Server::handle_client_setxattr(MDRequest *mdr)
+void Server::handle_client_setxattr(MDRequestRef& mdr)
 {
   MClientRequest *req = mdr->client_request;
   string name(req->get_path2());
@@ -3852,12 +3845,12 @@ void Server::handle_client_setxattr(MDRequest *mdr)
   mdlog->start_entry(le);
   le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
   mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY, false);
-  mdcache->journal_dirty_inode(mdr, &le->metablob, cur);
+  mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
 
   journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(mds, mdr, cur));
 }
 
-void Server::handle_client_removexattr(MDRequest *mdr)
+void Server::handle_client_removexattr(MDRequestRef& mdr)
 {
   MClientRequest *req = mdr->client_request;
   string name(req->get_path2());
@@ -3908,7 +3901,7 @@ void Server::handle_client_removexattr(MDRequest *mdr)
   mdlog->start_entry(le);
   le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
   mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY, false);
-  mdcache->journal_dirty_inode(mdr, &le->metablob, cur);
+  mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
 
   journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(mds, mdr, cur));
 }
@@ -3924,12 +3917,12 @@ void Server::handle_client_removexattr(MDRequest *mdr)
 
 class C_MDS_mknod_finish : public Context {
   MDS *mds;
-  MDRequest *mdr;
+  MDRequestRef mdr;
   CDentry *dn;
   CInode *newi;
   snapid_t follows;
 public:
-  C_MDS_mknod_finish(MDS *m, MDRequest *r, CDentry *d, CInode *ni, snapid_t f) :
+  C_MDS_mknod_finish(MDS *m, MDRequestRef& r, CDentry *d, CInode *ni, snapid_t f) :
     mds(m), mdr(r), dn(d), newi(ni), follows(f) {}
   void finish(int r) {
     assert(r == 0);
@@ -3971,7 +3964,7 @@ public:
 };
 
 
-void Server::handle_client_mknod(MDRequest *mdr)
+void Server::handle_client_mknod(MDRequestRef& mdr)
 {
   MClientRequest *req = mdr->client_request;
   client_t client = mdr->get_client();
@@ -4065,7 +4058,7 @@ void Server::handle_client_mknod(MDRequest *mdr)
 
 // MKDIR
 /* This function takes responsibility for the passed mdr*/
-void Server::handle_client_mkdir(MDRequest *mdr)
+void Server::handle_client_mkdir(MDRequestRef& mdr)
 {
   MClientRequest *req = mdr->client_request;
   set<SimpleLock*> rdlocks, wrlocks, xlocks;
@@ -4143,7 +4136,7 @@ void Server::handle_client_mkdir(MDRequest *mdr)
 
 // SYMLINK
 
-void Server::handle_client_symlink(MDRequest *mdr)
+void Server::handle_client_symlink(MDRequestRef& mdr)
 {
   MClientRequest *req = mdr->client_request;
   set<SimpleLock*> rdlocks, wrlocks, xlocks;
@@ -4197,7 +4190,7 @@ void Server::handle_client_symlink(MDRequest *mdr)
 
 // LINK
 
-void Server::handle_client_link(MDRequest *mdr)
+void Server::handle_client_link(MDRequestRef& mdr)
 {
   MClientRequest *req = mdr->client_request;
 
@@ -4270,13 +4263,13 @@ void Server::handle_client_link(MDRequest *mdr)
 
 class C_MDS_link_local_finish : public Context {
   MDS *mds;
-  MDRequest *mdr;
+  MDRequestRef mdr;
   CDentry *dn;
   CInode *targeti;
   version_t dnpv;
   version_t tipv;
 public:
-  C_MDS_link_local_finish(MDS *m, MDRequest *r, CDentry *d, CInode *ti, 
+  C_MDS_link_local_finish(MDS *m, MDRequestRef& r, CDentry *d, CInode *ti,
 			  version_t dnpv_, version_t tipv_) :
     mds(m), mdr(r), dn(d), targeti(ti),
     dnpv(dnpv_), tipv(tipv_) { }
@@ -4287,7 +4280,7 @@ public:
 };
 
 
-void Server::_link_local(MDRequest *mdr, CDentry *dn, CInode *targeti)
+void Server::_link_local(MDRequestRef& mdr, CDentry *dn, CInode *targeti)
 {
   dout(10) << "_link_local " << *dn << " to " << *targeti << dendl;
 
@@ -4314,7 +4307,7 @@ void Server::_link_local(MDRequest *mdr, CDentry *dn, CInode *targeti)
   mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, dn->get_dir(), PREDIRTY_DIR, 1);      // new dn
   mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, 0, PREDIRTY_PRIMARY);           // targeti
   le->metablob.add_remote_dentry(dn, true, targeti->ino(), targeti->d_type());  // new remote
-  mdcache->journal_dirty_inode(mdr, &le->metablob, targeti);
+  mdcache->journal_dirty_inode(mdr.get(), &le->metablob, targeti);
 
   // do this after predirty_*, to avoid funky extra dnl arg
   dn->push_projected_linkage(targeti->ino(), targeti->d_type());
@@ -4322,7 +4315,7 @@ void Server::_link_local(MDRequest *mdr, CDentry *dn, CInode *targeti)
   journal_and_reply(mdr, targeti, dn, le, new C_MDS_link_local_finish(mds, mdr, dn, targeti, dnpv, tipv));
 }
 
-void Server::_link_local_finish(MDRequest *mdr, CDentry *dn, CInode *targeti,
+void Server::_link_local_finish(MDRequestRef& mdr, CDentry *dn, CInode *targeti,
 				version_t dnpv, version_t tipv)
 {
   dout(10) << "_link_local_finish " << *dn << " to " << *targeti << dendl;
@@ -4352,13 +4345,13 @@ void Server::_link_local_finish(MDRequest *mdr, CDentry *dn, CInode *targeti,
 
 class C_MDS_link_remote_finish : public Context {
   MDS *mds;
-  MDRequest *mdr;
+  MDRequestRef mdr;
   bool inc;
   CDentry *dn;
   CInode *targeti;
   version_t dpv;
 public:
-  C_MDS_link_remote_finish(MDS *m, MDRequest *r, bool i, CDentry *d, CInode *ti) :
+  C_MDS_link_remote_finish(MDS *m, MDRequestRef& r, bool i, CDentry *d, CInode *ti) :
     mds(m), mdr(r), inc(i), dn(d), targeti(ti),
     dpv(d->get_projected_version()) {}
   void finish(int r) {
@@ -4367,7 +4360,7 @@ public:
   }
 };
 
-void Server::_link_remote(MDRequest *mdr, bool inc, CDentry *dn, CInode *targeti)
+void Server::_link_remote(MDRequestRef& mdr, bool inc, CDentry *dn, CInode *targeti)
 {
   dout(10) << "_link_remote " 
 	   << (inc ? "link ":"unlink ")
@@ -4422,7 +4415,7 @@ void Server::_link_remote(MDRequest *mdr, bool inc, CDentry *dn, CInode *targeti
   } else {
     dn->pre_dirty();
     mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, dn->get_dir(), PREDIRTY_DIR, -1);
-    mdcache->journal_cow_dentry(mdr, &le->metablob, dn);
+    mdcache->journal_cow_dentry(mdr.get(), &le->metablob, dn);
     le->metablob.add_null_dentry(dn, true);
   }
 
@@ -4432,7 +4425,7 @@ void Server::_link_remote(MDRequest *mdr, bool inc, CDentry *dn, CInode *targeti
   journal_and_reply(mdr, targeti, dn, le, new C_MDS_link_remote_finish(mds, mdr, inc, dn, targeti));
 }
 
-void Server::_link_remote_finish(MDRequest *mdr, bool inc,
+void Server::_link_remote_finish(MDRequestRef& mdr, bool inc,
 				 CDentry *dn, CInode *targeti,
 				 version_t dpv)
 {
@@ -4459,8 +4452,10 @@ void Server::_link_remote_finish(MDRequest *mdr, bool inc,
 
   if (inc)
     mds->mdcache->send_dentry_link(dn);
-  else
-    mds->mdcache->send_dentry_unlink(dn, NULL, NULL);
+  else {
+    MDRequestRef null_ref;
+    mds->mdcache->send_dentry_unlink(dn, NULL, null_ref);
+  }
   
   // commit anchor update?
   if (mdr->more()->dst_reanchor_atid) 
@@ -4484,10 +4479,10 @@ void Server::_link_remote_finish(MDRequest *mdr, bool inc,
 
 class C_MDS_SlaveLinkPrep : public Context {
   Server *server;
-  MDRequest *mdr;
+  MDRequestRef mdr;
   CInode *targeti;
 public:
-  C_MDS_SlaveLinkPrep(Server *s, MDRequest *r, CInode *t) :
+  C_MDS_SlaveLinkPrep(Server *s, MDRequestRef& r, CInode *t) :
     server(s), mdr(r), targeti(t) { }
   void finish(int r) {
     assert(r == 0);
@@ -4497,10 +4492,10 @@ public:
 
 class C_MDS_SlaveLinkCommit : public Context {
   Server *server;
-  MDRequest *mdr;
+  MDRequestRef mdr;
   CInode *targeti;
 public:
-  C_MDS_SlaveLinkCommit(Server *s, MDRequest *r, CInode *t) :
+  C_MDS_SlaveLinkCommit(Server *s, MDRequestRef& r, CInode *t) :
     server(s), mdr(r), targeti(t) { }
   void finish(int r) {
     server->_commit_slave_link(mdr, r, targeti);
@@ -4508,7 +4503,7 @@ public:
 };
 
 /* This function DOES put the mdr->slave_request before returning*/
-void Server::handle_slave_link_prep(MDRequest *mdr)
+void Server::handle_slave_link_prep(MDRequestRef& mdr)
 {
   dout(10) << "handle_slave_link_prep " << *mdr 
 	   << " on " << mdr->slave_request->get_object_info() 
@@ -4583,7 +4578,7 @@ void Server::handle_slave_link_prep(MDRequest *mdr)
 
   // commit case
   mdcache->predirty_journal_parents(mdr, &le->commit, dnl->get_inode(), 0, PREDIRTY_SHALLOW|PREDIRTY_PRIMARY, 0);
-  mdcache->journal_dirty_inode(mdr, &le->commit, targeti);
+  mdcache->journal_dirty_inode(mdr.get(), &le->commit, targeti);
 
   // set up commit waiter
   mdr->more()->slave_commit = new C_MDS_SlaveLinkCommit(this, mdr, targeti);
@@ -4592,7 +4587,7 @@ void Server::handle_slave_link_prep(MDRequest *mdr)
   mdlog->flush();
 }
 
-void Server::_logged_slave_link(MDRequest *mdr, CInode *targeti) 
+void Server::_logged_slave_link(MDRequestRef& mdr, CInode *targeti)
 {
   dout(10) << "_logged_slave_link " << *mdr
 	   << " " << *targeti << dendl;
@@ -4624,14 +4619,14 @@ void Server::_logged_slave_link(MDRequest *mdr, CInode *targeti)
 
 struct C_MDS_CommittedSlave : public Context {
   Server *server;
-  MDRequest *mdr;
-  C_MDS_CommittedSlave(Server *s, MDRequest *m) : server(s), mdr(m) {}
+  MDRequestRef mdr;
+  C_MDS_CommittedSlave(Server *s, MDRequestRef& m) : server(s), mdr(m) {}
   void finish(int r) {
     server->_committed_slave(mdr);
   }
 };
 
-void Server::_commit_slave_link(MDRequest *mdr, int r, CInode *targeti)
+void Server::_commit_slave_link(MDRequestRef& mdr, int r, CInode *targeti)
 {  
   dout(10) << "_commit_slave_link " << *mdr
 	   << " r=" << r
@@ -4653,7 +4648,7 @@ void Server::_commit_slave_link(MDRequest *mdr, int r, CInode *targeti)
   }
 }
 
-void Server::_committed_slave(MDRequest *mdr)
+void Server::_committed_slave(MDRequestRef& mdr)
 {
   dout(10) << "_committed_slave " << *mdr << dendl;
 
@@ -4667,15 +4662,15 @@ void Server::_committed_slave(MDRequest *mdr)
 
 struct C_MDS_LoggedLinkRollback : public Context {
   Server *server;
-  Mutation *mut;
-  MDRequest *mdr;
-  C_MDS_LoggedLinkRollback(Server *s, Mutation *m, MDRequest *r) : server(s), mut(m), mdr(r) {}
+  MutationRef mut;
+  MDRequestRef mdr;
+  C_MDS_LoggedLinkRollback(Server *s, MutationRef& m, MDRequestRef& r) : server(s), mut(m), mdr(r) {}
   void finish(int r) {
     server->_link_rollback_finish(mut, mdr);
   }
 };
 
-void Server::do_link_rollback(bufferlist &rbl, int master, MDRequest *mdr)
+void Server::do_link_rollback(bufferlist &rbl, int master, MDRequestRef& mdr)
 {
   link_rollback rollback;
   bufferlist::iterator p = rbl.begin();
@@ -4691,7 +4686,7 @@ void Server::do_link_rollback(bufferlist &rbl, int master, MDRequest *mdr)
   mds->mdcache->add_rollback(rollback.reqid, master); // need to finish this update before resolve finishes
   assert(mdr || mds->is_resolve());
 
-  Mutation *mut = new Mutation(rollback.reqid);
+  MutationRef mut(new MutationImpl(rollback.reqid));
   mut->ls = mds->mdlog->get_current_segment();
 
   CInode *in = mds->mdcache->get_inode(rollback.ino);
@@ -4735,7 +4730,7 @@ void Server::do_link_rollback(bufferlist &rbl, int master, MDRequest *mdr)
   mdlog->flush();
 }
 
-void Server::_link_rollback_finish(Mutation *mut, MDRequest *mdr)
+void Server::_link_rollback_finish(MutationRef& mut, MDRequestRef& mdr)
 {
   dout(10) << "_link_rollback_finish" << dendl;
 
@@ -4748,12 +4743,11 @@ void Server::_link_rollback_finish(Mutation *mut, MDRequest *mdr)
   mds->mdcache->finish_rollback(mut->reqid);
 
   mut->cleanup();
-  delete mut;
 }
 
 
 /* This function DOES NOT put the passed message before returning*/
-void Server::handle_slave_link_prep_ack(MDRequest *mdr, MMDSSlaveRequest *m)
+void Server::handle_slave_link_prep_ack(MDRequestRef& mdr, MMDSSlaveRequest *m)
 {
   dout(10) << "handle_slave_link_prep_ack " << *mdr 
 	   << " " << *m << dendl;
@@ -4783,7 +4777,7 @@ void Server::handle_slave_link_prep_ack(MDRequest *mdr, MMDSSlaveRequest *m)
 
 // UNLINK
 
-void Server::handle_client_unlink(MDRequest *mdr)
+void Server::handle_client_unlink(MDRequestRef& mdr)
 {
   MClientRequest *req = mdr->client_request;
   client_t client = mdr->get_client();
@@ -4961,12 +4955,12 @@ void Server::handle_client_unlink(MDRequest *mdr)
 
 class C_MDS_unlink_local_finish : public Context {
   MDS *mds;
-  MDRequest *mdr;
+  MDRequestRef mdr;
   CDentry *dn;
   CDentry *straydn;
   version_t dnpv;  // deleted dentry
 public:
-  C_MDS_unlink_local_finish(MDS *m, MDRequest *r, CDentry *d, CDentry *sd) :
+  C_MDS_unlink_local_finish(MDS *m, MDRequestRef& r, CDentry *d, CDentry *sd) :
     mds(m), mdr(r), dn(d), straydn(sd),
     dnpv(d->get_projected_version()) {}
   void finish(int r) {
@@ -4975,7 +4969,7 @@ public:
   }
 };
 
-void Server::_unlink_local(MDRequest *mdr, CDentry *dn, CDentry *straydn)
+void Server::_unlink_local(MDRequestRef& mdr, CDentry *dn, CDentry *straydn)
 {
   dout(10) << "_unlink_local " << *dn << dendl;
 
@@ -5032,10 +5026,10 @@ void Server::_unlink_local(MDRequest *mdr, CDentry *dn, CDentry *straydn)
     // remote link.  update remote inode.
     mdcache->predirty_journal_parents(mdr, &le->metablob, in, dn->get_dir(), PREDIRTY_DIR, -1);
     mdcache->predirty_journal_parents(mdr, &le->metablob, in, 0, PREDIRTY_PRIMARY);
-    mdcache->journal_dirty_inode(mdr, &le->metablob, in);
+    mdcache->journal_dirty_inode(mdr.get(), &le->metablob, in);
   }
 
-  mdcache->journal_cow_dentry(mdr, &le->metablob, dn);
+  mdcache->journal_cow_dentry(mdr.get(), &le->metablob, dn);
   le->metablob.add_null_dentry(dn, true);
 
   if (in->is_dir()) {
@@ -5056,7 +5050,7 @@ void Server::_unlink_local(MDRequest *mdr, CDentry *dn, CDentry *straydn)
   journal_and_reply(mdr, 0, dn, le, new C_MDS_unlink_local_finish(mds, mdr, dn, straydn));
 }
 
-void Server::_unlink_local_finish(MDRequest *mdr, 
+void Server::_unlink_local_finish(MDRequestRef& mdr,
 				  CDentry *dn, CDentry *straydn,
 				  version_t dnpv) 
 {
@@ -5112,7 +5106,7 @@ void Server::_unlink_local_finish(MDRequest *mdr,
   dn->get_dir()->try_remove_unlinked_dn(dn);
 }
 
-bool Server::_rmdir_prepare_witness(MDRequest *mdr, int who, CDentry *dn, CDentry *straydn)
+bool Server::_rmdir_prepare_witness(MDRequestRef& mdr, int who, CDentry *dn, CDentry *straydn)
 {
   if (!mds->mdsmap->is_clientreplay_or_active_or_stopping(who)) {
     dout(10) << "_rmdir_prepare_witness mds." << who << " is not active" << dendl;
@@ -5139,9 +5133,9 @@ bool Server::_rmdir_prepare_witness(MDRequest *mdr, int who, CDentry *dn, CDentr
 
 struct C_MDS_SlaveRmdirPrep : public Context {
   Server *server;
-  MDRequest *mdr;
+  MDRequestRef mdr;
   CDentry *dn, *straydn;
-  C_MDS_SlaveRmdirPrep(Server *s, MDRequest *r, CDentry *d, CDentry *st)
+  C_MDS_SlaveRmdirPrep(Server *s, MDRequestRef& r, CDentry *d, CDentry *st)
     : server(s), mdr(r), dn(d), straydn(st) {}
   void finish(int r) {
     server->_logged_slave_rmdir(mdr, dn, straydn);
@@ -5150,15 +5144,15 @@ struct C_MDS_SlaveRmdirPrep : public Context {
 
 struct C_MDS_SlaveRmdirCommit : public Context {
   Server *server;
-  MDRequest *mdr;
-  C_MDS_SlaveRmdirCommit(Server *s, MDRequest *r)
+  MDRequestRef mdr;
+  C_MDS_SlaveRmdirCommit(Server *s, MDRequestRef& r)
     : server(s), mdr(r) { }
   void finish(int r) {
     server->_commit_slave_rmdir(mdr, r);
   }
 };
 
-void Server::handle_slave_rmdir_prep(MDRequest *mdr)
+void Server::handle_slave_rmdir_prep(MDRequestRef& mdr)
 {
   dout(10) << "handle_slave_rmdir_prep " << *mdr 
 	   << " " << mdr->slave_request->srcdnpath 
@@ -5214,7 +5208,7 @@ void Server::handle_slave_rmdir_prep(MDRequest *mdr)
   mdlog->flush();
 }
 
-void Server::_logged_slave_rmdir(MDRequest *mdr, CDentry *dn, CDentry *straydn)
+void Server::_logged_slave_rmdir(MDRequestRef& mdr, CDentry *dn, CDentry *straydn)
 {
   dout(10) << "_logged_slave_rmdir " << *mdr << " on " << *dn << dendl;
 
@@ -5241,7 +5235,7 @@ void Server::_logged_slave_rmdir(MDRequest *mdr, CDentry *dn, CDentry *straydn)
   }
 }
 
-void Server::handle_slave_rmdir_prep_ack(MDRequest *mdr, MMDSSlaveRequest *ack)
+void Server::handle_slave_rmdir_prep_ack(MDRequestRef& mdr, MMDSSlaveRequest *ack)
 {
   dout(10) << "handle_slave_rmdir_prep_ack " << *mdr 
 	   << " " << *ack << dendl;
@@ -5261,7 +5255,7 @@ void Server::handle_slave_rmdir_prep_ack(MDRequest *mdr, MMDSSlaveRequest *ack)
     dout(10) << "still waiting on slaves " << mdr->more()->waiting_on_slave << dendl;
 }
 
-void Server::_commit_slave_rmdir(MDRequest *mdr, int r)
+void Server::_commit_slave_rmdir(MDRequestRef& mdr, int r)
 {
   dout(10) << "_commit_slave_rmdir " << *mdr << " r=" << r << dendl;
   
@@ -5282,18 +5276,18 @@ void Server::_commit_slave_rmdir(MDRequest *mdr, int r)
 
 struct C_MDS_LoggedRmdirRollback : public Context {
   Server *server;
-  MDRequest *mdr;
+  MDRequestRef mdr;
   metareqid_t reqid;
   CDentry *dn;
   CDentry *straydn;
-  C_MDS_LoggedRmdirRollback(Server *s, MDRequest *m, metareqid_t mr, CDentry *d, CDentry *st)
+  C_MDS_LoggedRmdirRollback(Server *s, MDRequestRef& m, metareqid_t mr, CDentry *d, CDentry *st)
     : server(s), mdr(m), reqid(mr), dn(d), straydn(st) {}
   void finish(int r) {
     server->_rmdir_rollback_finish(mdr, reqid, dn, straydn);
   }
 };
 
-void Server::do_rmdir_rollback(bufferlist &rbl, int master, MDRequest *mdr)
+void Server::do_rmdir_rollback(bufferlist &rbl, int master, MDRequestRef& mdr)
 {
   // unlink the other rollback methods, the rmdir rollback is only
   // needed to record the subtree changes in the journal for inode
@@ -5342,7 +5336,7 @@ void Server::do_rmdir_rollback(bufferlist &rbl, int master, MDRequest *mdr)
   mdlog->flush();
 }
 
-void Server::_rmdir_rollback_finish(MDRequest *mdr, metareqid_t reqid, CDentry *dn, CDentry *straydn)
+void Server::_rmdir_rollback_finish(MDRequestRef& mdr, metareqid_t reqid, CDentry *dn, CDentry *straydn)
 {
   dout(10) << "_rmdir_rollback_finish " << reqid << dendl;
 
@@ -5371,7 +5365,7 @@ void Server::_rmdir_rollback_finish(MDRequest *mdr, metareqid_t reqid, CDentry *
  * the unlocked varient this is a fastpath check.  we can't really be
  * sure until we rdlock the filelock.
  */
-bool Server::_dir_is_nonempty_unlocked(MDRequest *mdr, CInode *in)
+bool Server::_dir_is_nonempty_unlocked(MDRequestRef& mdr, CInode *in)
 {
   dout(10) << "dir_is_nonempty_unlocked " << *in << dendl;
   assert(in->is_auth());
@@ -5396,7 +5390,7 @@ bool Server::_dir_is_nonempty_unlocked(MDRequest *mdr, CInode *in)
   return false;
 }
 
-bool Server::_dir_is_nonempty(MDRequest *mdr, CInode *in)
+bool Server::_dir_is_nonempty(MDRequestRef& mdr, CInode *in)
 {
   dout(10) << "dir_is_nonempty " << *in << dendl;
   assert(in->is_auth());
@@ -5431,12 +5425,12 @@ bool Server::_dir_is_nonempty(MDRequest *mdr, CInode *in)
 
 class C_MDS_rename_finish : public Context {
   MDS *mds;
-  MDRequest *mdr;
+  MDRequestRef mdr;
   CDentry *srcdn;
   CDentry *destdn;
   CDentry *straydn;
 public:
-  C_MDS_rename_finish(MDS *m, MDRequest *r,
+  C_MDS_rename_finish(MDS *m, MDRequestRef& r,
 		      CDentry *sdn, CDentry *ddn, CDentry *stdn) :
     mds(m), mdr(r),
     srcdn(sdn), destdn(ddn), straydn(stdn) { }
@@ -5461,7 +5455,7 @@ public:
  *
  * This function takes responsibility for the passed mdr.
  */
-void Server::handle_client_rename(MDRequest *mdr)
+void Server::handle_client_rename(MDRequestRef& mdr)
 {
   MClientRequest *req = mdr->client_request;
   dout(7) << "handle_client_rename " << *req << dendl;
@@ -5894,7 +5888,7 @@ void Server::handle_client_rename(MDRequest *mdr)
 }
 
 
-void Server::_rename_finish(MDRequest *mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn)
+void Server::_rename_finish(MDRequestRef& mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn)
 {
   dout(10) << "_rename_finish " << *mdr << dendl;
 
@@ -5945,7 +5939,7 @@ void Server::_rename_finish(MDRequest *mdr, CDentry *srcdn, CDentry *destdn, CDe
 
 // helpers
 
-bool Server::_rename_prepare_witness(MDRequest *mdr, int who, set<int> &witnesse,
+bool Server::_rename_prepare_witness(MDRequestRef& mdr, int who, set<int> &witnesse,
 				     CDentry *srcdn, CDentry *destdn, CDentry *straydn)
 {
   if (!mds->mdsmap->is_clientreplay_or_active_or_stopping(who)) {
@@ -5975,7 +5969,7 @@ bool Server::_rename_prepare_witness(MDRequest *mdr, int who, set<int> &witnesse
   return true;
 }
 
-version_t Server::_rename_prepare_import(MDRequest *mdr, CDentry *srcdn, bufferlist *client_map_bl)
+version_t Server::_rename_prepare_import(MDRequestRef& mdr, CDentry *srcdn, bufferlist *client_map_bl)
 {
   version_t oldpv = mdr->more()->inode_import_v;
 
@@ -6045,7 +6039,7 @@ bool Server::_need_force_journal(CInode *diri, bool empty)
   return force_journal;
 }
 
-void Server::_rename_prepare(MDRequest *mdr,
+void Server::_rename_prepare(MDRequestRef& mdr,
 			     EMetaBlob *metablob, bufferlist *client_map_bl,
 			     CDentry *srcdn, CDentry *destdn, CDentry *straydn)
 {
@@ -6239,7 +6233,7 @@ void Server::_rename_prepare(MDRequest *mdr,
       if (oldin->is_auth()) {
 	// auth for targeti
 	metablob->add_dir_context(oldin->get_projected_parent_dir());
-	mdcache->journal_cow_dentry(mdr, metablob, oldin->get_projected_parent_dn(),
+	mdcache->journal_cow_dentry(mdr.get(), metablob, oldin->get_projected_parent_dn(),
 				    CEPH_NOSNAP, 0, destdnl);
 	metablob->add_primary_dentry(oldin->get_projected_parent_dn(), oldin, true);
       }
@@ -6250,7 +6244,7 @@ void Server::_rename_prepare(MDRequest *mdr,
   if (srcdnl->is_remote()) {
     if (!linkmerge) {
       if (destdn->is_auth() && !destdnl->is_null())
-	mdcache->journal_cow_dentry(mdr, metablob, destdn, CEPH_NOSNAP, 0, destdnl);
+	mdcache->journal_cow_dentry(mdr.get(), metablob, destdn, CEPH_NOSNAP, 0, destdnl);
       else
 	destdn->first = MAX(destdn->first, next_dest_snap);
 
@@ -6258,12 +6252,12 @@ void Server::_rename_prepare(MDRequest *mdr,
         metablob->add_remote_dentry(destdn, true, srcdnl->get_remote_ino(), srcdnl->get_remote_d_type());
       if (srci->get_projected_parent_dn()->is_auth()) { // it's remote
 	metablob->add_dir_context(srci->get_projected_parent_dir());
-        mdcache->journal_cow_dentry(mdr, metablob, srci->get_projected_parent_dn(), CEPH_NOSNAP, 0, srcdnl);
+        mdcache->journal_cow_dentry(mdr.get(), metablob, srci->get_projected_parent_dn(), CEPH_NOSNAP, 0, srcdnl);
 	metablob->add_primary_dentry(srci->get_projected_parent_dn(), srci, true);
       }
     } else {
       if (destdn->is_auth() && !destdnl->is_null())
-	mdcache->journal_cow_dentry(mdr, metablob, destdn, CEPH_NOSNAP, 0, destdnl);
+	mdcache->journal_cow_dentry(mdr.get(), metablob, destdn, CEPH_NOSNAP, 0, destdnl);
       else
 	destdn->first = MAX(destdn->first, next_dest_snap);
 
@@ -6277,7 +6271,7 @@ void Server::_rename_prepare(MDRequest *mdr,
       srci->project_past_snaprealm_parent(dest_realm);
     
     if (destdn->is_auth() && !destdnl->is_null())
-      mdcache->journal_cow_dentry(mdr, metablob, destdn, CEPH_NOSNAP, 0, destdnl);
+      mdcache->journal_cow_dentry(mdr.get(), metablob, destdn, CEPH_NOSNAP, 0, destdnl);
     else
       destdn->first = MAX(destdn->first, next_dest_snap);
 
@@ -6303,7 +6297,7 @@ void Server::_rename_prepare(MDRequest *mdr,
   // src
   if (srcdn->is_auth()) {
     dout(10) << " journaling srcdn " << *srcdn << dendl;
-    mdcache->journal_cow_dentry(mdr, metablob, srcdn, CEPH_NOSNAP, 0, srcdnl);
+    mdcache->journal_cow_dentry(mdr.get(), metablob, srcdn, CEPH_NOSNAP, 0, srcdnl);
     // also journal the inode in case we need do slave rename rollback. It is Ok to add
     // both primary and NULL dentries. Because during journal replay, null dentry is
     // processed after primary dentry.
@@ -6331,7 +6325,7 @@ void Server::_rename_prepare(MDRequest *mdr,
 }
 
 
-void Server::_rename_apply(MDRequest *mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn)
+void Server::_rename_apply(MDRequestRef& mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn)
 {
   dout(10) << "_rename_apply " << *mdr << " " << *srcdn << " " << *destdn << dendl;
   dout(10) << " pvs " << mdr->more()->pvmap << dendl;
@@ -6430,7 +6424,7 @@ void Server::_rename_apply(MDRequest *mdr, CDentry *srcdn, CDentry *destdn, CDen
 	  ++i)
 	if ((*i)->get_parent() == destdnl->get_inode() &&
 	    !(*i)->is_locallock())
-	  mds->locker->xlock_import(*i, mdr);
+	  mds->locker->xlock_import(*i);
       
       // hack: fix auth bit
       in->state_set(CInode::STATE_AUTH);
@@ -6476,10 +6470,10 @@ void Server::_rename_apply(MDRequest *mdr, CDentry *srcdn, CDentry *destdn, CDen
 
 class C_MDS_SlaveRenamePrep : public Context {
   Server *server;
-  MDRequest *mdr;
+  MDRequestRef mdr;
   CDentry *srcdn, *destdn, *straydn;
 public:
-  C_MDS_SlaveRenamePrep(Server *s, MDRequest *m, CDentry *sr, CDentry *de, CDentry *st) :
+  C_MDS_SlaveRenamePrep(Server *s, MDRequestRef& m, CDentry *sr, CDentry *de, CDentry *st) :
     server(s), mdr(m), srcdn(sr), destdn(de), straydn(st) {}
   void finish(int r) {
     server->_logged_slave_rename(mdr, srcdn, destdn, straydn);
@@ -6488,10 +6482,10 @@ public:
 
 class C_MDS_SlaveRenameCommit : public Context {
   Server *server;
-  MDRequest *mdr;
+  MDRequestRef mdr;
   CDentry *srcdn, *destdn, *straydn;
 public:
-  C_MDS_SlaveRenameCommit(Server *s, MDRequest *m, CDentry *sr, CDentry *de, CDentry *st) :
+  C_MDS_SlaveRenameCommit(Server *s, MDRequestRef& m, CDentry *sr, CDentry *de, CDentry *st) :
     server(s), mdr(m), srcdn(sr), destdn(de), straydn(st) {}
   void finish(int r) {
     server->_commit_slave_rename(mdr, r, srcdn, destdn, straydn);
@@ -6500,20 +6494,17 @@ public:
 
 class C_MDS_SlaveRenameSessionsFlushed : public Context {
   Server *server;
-  MDRequest *mdr;
+  MDRequestRef mdr;
 public:
-  C_MDS_SlaveRenameSessionsFlushed(Server *s, MDRequest *r) :
-    server(s), mdr(r) {
-      mdr->get();
-    }
+  C_MDS_SlaveRenameSessionsFlushed(Server *s, MDRequestRef& r) :
+    server(s), mdr(r) {}
   void finish(int r) {
     server->_slave_rename_sessions_flushed(mdr);
-    mdr->put();
   }
 };
 
 /* This function DOES put the mdr->slave_request before returning*/
-void Server::handle_slave_rename_prep(MDRequest *mdr)
+void Server::handle_slave_rename_prep(MDRequestRef& mdr)
 {
   dout(10) << "handle_slave_rename_prep " << *mdr 
 	   << " " << mdr->slave_request->srcdnpath 
@@ -6707,7 +6698,7 @@ void Server::handle_slave_rename_prep(MDRequest *mdr)
   mdlog->flush();
 }
 
-void Server::_logged_slave_rename(MDRequest *mdr, 
+void Server::_logged_slave_rename(MDRequestRef& mdr,
 				  CDentry *srcdn, CDentry *destdn, CDentry *straydn)
 {
   dout(10) << "_logged_slave_rename " << *mdr << dendl;
@@ -6779,7 +6770,7 @@ void Server::_logged_slave_rename(MDRequest *mdr,
   }
 }
 
-void Server::_commit_slave_rename(MDRequest *mdr, int r,
+void Server::_commit_slave_rename(MDRequestRef& mdr, int r,
 				  CDentry *srcdn, CDentry *destdn, CDentry *straydn)
 {
   dout(10) << "_commit_slave_rename " << *mdr << " r=" << r << dendl;
@@ -6809,7 +6800,7 @@ void Server::_commit_slave_rename(MDRequest *mdr, int r,
 	// we only care about xlocks on the exported inode
 	if (lock->get_parent() == in &&
 	    !lock->is_locallock())
-	  mds->locker->xlock_export(lock, mdr);
+	  mds->locker->xlock_export(lock, mdr.get());
       }
 
       map<client_t,Capability::Import> peer_imported;
@@ -6870,7 +6861,7 @@ void Server::_commit_slave_rename(MDRequest *mdr, int r,
   }
 }
 
-void _rollback_repair_dir(Mutation *mut, CDir *dir, rename_rollback::drec &r, utime_t ctime, 
+void _rollback_repair_dir(MutationRef& mut, CDir *dir, rename_rollback::drec &r, utime_t ctime,
 			  bool isdir, int linkunlink, nest_info_t &rstat)
 {
   fnode_t *pf;
@@ -6903,14 +6894,14 @@ void _rollback_repair_dir(Mutation *mut, CDir *dir, rename_rollback::drec &r, ut
 
 struct C_MDS_LoggedRenameRollback : public Context {
   Server *server;
-  Mutation *mut;
-  MDRequest *mdr;
+  MutationRef mut;
+  MDRequestRef mdr;
   CDentry *srcdn;
   version_t srcdnpv;
   CDentry *destdn;
   CDentry *straydn;
   bool finish_mdr;
-  C_MDS_LoggedRenameRollback(Server *s, Mutation *m, MDRequest *r,
+  C_MDS_LoggedRenameRollback(Server *s, MutationRef& m, MDRequestRef& r,
 			     CDentry *sd, version_t pv, CDentry *dd,
 			    CDentry *st, bool f) :
     server(s), mut(m), mdr(r), srcdn(sd), srcdnpv(pv), destdn(dd),
@@ -6921,7 +6912,7 @@ struct C_MDS_LoggedRenameRollback : public Context {
   }
 };
 
-void Server::do_rename_rollback(bufferlist &rbl, int master, MDRequest *mdr,
+void Server::do_rename_rollback(bufferlist &rbl, int master, MDRequestRef& mdr,
 				bool finish_mdr)
 {
   rename_rollback rollback;
@@ -6932,7 +6923,7 @@ void Server::do_rename_rollback(bufferlist &rbl, int master, MDRequest *mdr,
   // need to finish this update before sending resolve to claim the subtree
   mds->mdcache->add_rollback(rollback.reqid, master);
 
-  Mutation *mut = new Mutation(rollback.reqid);
+  MutationRef mut(new MutationImpl(rollback.reqid));
   mut->ls = mds->mdlog->get_current_segment();
 
   CDentry *srcdn = NULL;
@@ -7124,7 +7115,7 @@ void Server::do_rename_rollback(bufferlist &rbl, int master, MDRequest *mdr,
   
   if (target && target->is_dir()) {
     assert(destdn);
-    mdcache->project_subtree_rename(in, straydir, destdir);
+    mdcache->project_subtree_rename(target, straydir, destdir);
   }
 
   if (in && in->is_dir()) {
@@ -7137,7 +7128,7 @@ void Server::do_rename_rollback(bufferlist &rbl, int master, MDRequest *mdr,
   mdlog->flush();
 }
 
-void Server::_rename_rollback_finish(Mutation *mut, MDRequest *mdr, CDentry *srcdn,
+void Server::_rename_rollback_finish(MutationRef& mut, MDRequestRef& mdr, CDentry *srcdn,
 				     version_t srcdnpv, CDentry *destdn,
 				     CDentry *straydn, bool finish_mdr)
 {
@@ -7204,11 +7195,10 @@ void Server::_rename_rollback_finish(Mutation *mut, MDRequest *mdr, CDentry *src
   mds->mdcache->finish_rollback(mut->reqid);
 
   mut->cleanup();
-  delete mut;
 }
 
 /* This function DOES put the passed message before returning*/
-void Server::handle_slave_rename_prep_ack(MDRequest *mdr, MMDSSlaveRequest *ack)
+void Server::handle_slave_rename_prep_ack(MDRequestRef& mdr, MMDSSlaveRequest *ack)
 {
   dout(10) << "handle_slave_rename_prep_ack " << *mdr 
 	   << " witnessed by " << ack->get_source()
@@ -7245,7 +7235,7 @@ void Server::handle_slave_rename_prep_ack(MDRequest *mdr, MMDSSlaveRequest *ack)
     dout(10) << "still waiting on slaves " << mdr->more()->waiting_on_slave << dendl;
 }
 
-void Server::handle_slave_rename_notify_ack(MDRequest *mdr, MMDSSlaveRequest *ack)
+void Server::handle_slave_rename_notify_ack(MDRequestRef& mdr, MMDSSlaveRequest *ack)
 {
   dout(10) << "handle_slave_rename_notify_ack " << *mdr << " from mds."
 	   << ack->get_source() << dendl;
@@ -7264,7 +7254,7 @@ void Server::handle_slave_rename_notify_ack(MDRequest *mdr, MMDSSlaveRequest *ac
   }
 }
 
-void Server::_slave_rename_sessions_flushed(MDRequest *mdr)
+void Server::_slave_rename_sessions_flushed(MDRequestRef& mdr)
 {
   dout(10) << "_slave_rename_sessions_flushed " << *mdr << dendl;
 
@@ -7282,7 +7272,7 @@ void Server::_slave_rename_sessions_flushed(MDRequest *mdr)
 
 // snaps
 /* This function takes responsibility for the passed mdr*/
-void Server::handle_client_lssnap(MDRequest *mdr)
+void Server::handle_client_lssnap(MDRequestRef& mdr)
 {
   MClientRequest *req = mdr->client_request;
 
@@ -7347,10 +7337,10 @@ void Server::handle_client_lssnap(MDRequest *mdr)
 
 struct C_MDS_mksnap_finish : public Context {
   MDS *mds;
-  MDRequest *mdr;
+  MDRequestRef mdr;
   CInode *diri;
   SnapInfo info;
-  C_MDS_mksnap_finish(MDS *m, MDRequest *r, CInode *di, SnapInfo &i) :
+  C_MDS_mksnap_finish(MDS *m, MDRequestRef& r, CInode *di, SnapInfo &i) :
     mds(m), mdr(r), diri(di), info(i) {}
   void finish(int r) {
     mds->server->_mksnap_finish(mdr, diri, info);
@@ -7358,7 +7348,7 @@ struct C_MDS_mksnap_finish : public Context {
 };
 
 /* This function takes responsibility for the passed mdr*/
-void Server::handle_client_mksnap(MDRequest *mdr)
+void Server::handle_client_mksnap(MDRequestRef& mdr)
 {
   if (!mds->mdsmap->allows_snaps()) {
     // you can't make snapshots until you set an option right now
@@ -7466,14 +7456,14 @@ void Server::handle_client_mksnap(MDRequest *mdr)
   le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
   le->metablob.add_table_transaction(TABLE_SNAP, stid);
   mdcache->predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY, false);
-  mdcache->journal_dirty_inode(mdr, &le->metablob, diri);
+  mdcache->journal_dirty_inode(mdr.get(), &le->metablob, diri);
 
   // journal the snaprealm changes
   mdlog->submit_entry(le, new C_MDS_mksnap_finish(mds, mdr, diri, info));
   mdlog->flush();
 }
 
-void Server::_mksnap_finish(MDRequest *mdr, CInode *diri, SnapInfo &info)
+void Server::_mksnap_finish(MDRequestRef& mdr, CInode *diri, SnapInfo &info)
 {
   dout(10) << "_mksnap_finish " << *mdr << " " << info << dendl;
 
@@ -7502,10 +7492,10 @@ void Server::_mksnap_finish(MDRequest *mdr, CInode *diri, SnapInfo &info)
 
 struct C_MDS_rmsnap_finish : public Context {
   MDS *mds;
-  MDRequest *mdr;
+  MDRequestRef mdr;
   CInode *diri;
   snapid_t snapid;
-  C_MDS_rmsnap_finish(MDS *m, MDRequest *r, CInode *di, snapid_t sn) :
+  C_MDS_rmsnap_finish(MDS *m, MDRequestRef& r, CInode *di, snapid_t sn) :
     mds(m), mdr(r), diri(di), snapid(sn) {}
   void finish(int r) {
     mds->server->_rmsnap_finish(mdr, diri, snapid);
@@ -7513,7 +7503,7 @@ struct C_MDS_rmsnap_finish : public Context {
 };
 
 /* This function takes responsibility for the passed mdr*/
-void Server::handle_client_rmsnap(MDRequest *mdr)
+void Server::handle_client_rmsnap(MDRequestRef& mdr)
 {
   MClientRequest *req = mdr->client_request;
 
@@ -7585,13 +7575,13 @@ void Server::handle_client_rmsnap(MDRequest *mdr)
   le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
   le->metablob.add_table_transaction(TABLE_SNAP, stid);
   mdcache->predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY, false);
-  mdcache->journal_dirty_inode(mdr, &le->metablob, diri);
+  mdcache->journal_dirty_inode(mdr.get(), &le->metablob, diri);
 
   mdlog->submit_entry(le, new C_MDS_rmsnap_finish(mds, mdr, diri, snapid));
   mdlog->flush();
 }
 
-void Server::_rmsnap_finish(MDRequest *mdr, CInode *diri, snapid_t snapid)
+void Server::_rmsnap_finish(MDRequestRef& mdr, CInode *diri, snapid_t snapid)
 {
   dout(10) << "_rmsnap_finish " << *mdr << " " << snapid << dendl;
   snapid_t stid = mdr->more()->stid;
diff --git a/src/mds/Server.h b/src/mds/Server.h
index 6ae9f59..95a5ae4 100644
--- a/src/mds/Server.h
+++ b/src/mds/Server.h
@@ -19,13 +19,16 @@
 
 class PerfCounters;
 class LogEvent;
-struct MDRequest;
-struct Mutation;
 class EMetaBlob;
 class EUpdate;
 class MMDSSlaveRequest;
 struct SnapInfo;
 
+struct MutationImpl;
+struct MDRequestImpl;
+typedef ceph::shared_ptr<MutationImpl> MutationRef;
+typedef ceph::shared_ptr<MDRequestImpl> MDRequestRef;
+
 enum {
   l_mdss_first = 1000,
   l_mdss_hcreq,
@@ -98,16 +101,16 @@ public:
   // -- requests --
   void handle_client_request(MClientRequest *m);
 
-  void journal_and_reply(MDRequest *mdr, CInode *tracei, CDentry *tracedn, 
+  void journal_and_reply(MDRequestRef& mdr, CInode *tracei, CDentry *tracedn,
 			 LogEvent *le, Context *fin);
-  void dispatch_client_request(MDRequest *mdr);
-  void early_reply(MDRequest *mdr, CInode *tracei, CDentry *tracedn);
-  void reply_request(MDRequest *mdr, int r = 0, CInode *tracei = 0, CDentry *tracedn = 0);
-  void reply_request(MDRequest *mdr, MClientReply *reply, CInode *tracei = 0, CDentry *tracedn = 0);
+  void dispatch_client_request(MDRequestRef& mdr);
+  void early_reply(MDRequestRef& mdr, CInode *tracei, CDentry *tracedn);
+  void reply_request(MDRequestRef& mdr, int r = 0, CInode *tracei = 0, CDentry *tracedn = 0);
+  void reply_request(MDRequestRef& mdr, MClientReply *reply, CInode *tracei = 0, CDentry *tracedn = 0);
   void set_trace_dist(Session *session, MClientReply *reply, CInode *in, CDentry *dn,
 		      snapid_t snapid,
 		      int num_dentries_wanted,
-		      MDRequest *mdr);
+		      MDRequestRef& mdr);
 
   void encode_empty_dirstat(bufferlist& bl);
   void encode_infinite_lease(bufferlist& bl);
@@ -115,136 +118,140 @@ public:
 
   void handle_slave_request(MMDSSlaveRequest *m);
   void handle_slave_request_reply(MMDSSlaveRequest *m);
-  void dispatch_slave_request(MDRequest *mdr);
-  void handle_slave_auth_pin(MDRequest *mdr);
-  void handle_slave_auth_pin_ack(MDRequest *mdr, MMDSSlaveRequest *ack);
+  void dispatch_slave_request(MDRequestRef& mdr);
+  void handle_slave_auth_pin(MDRequestRef& mdr);
+  void handle_slave_auth_pin_ack(MDRequestRef& mdr, MMDSSlaveRequest *ack);
 
   // some helpers
-  CDir *validate_dentry_dir(MDRequest *mdr, CInode *diri, const string& dname);
-  CDir *traverse_to_auth_dir(MDRequest *mdr, vector<CDentry*> &trace, filepath refpath);
-  CDentry *prepare_null_dentry(MDRequest *mdr, CDir *dir, const string& dname, bool okexist=false);
-  CDentry *prepare_stray_dentry(MDRequest *mdr, CInode *in);
-  CInode* prepare_new_inode(MDRequest *mdr, CDir *dir, inodeno_t useino, unsigned mode,
+  CDir *validate_dentry_dir(MDRequestRef& mdr, CInode *diri, const string& dname);
+  CDir *traverse_to_auth_dir(MDRequestRef& mdr, vector<CDentry*> &trace, filepath refpath);
+  CDentry *prepare_null_dentry(MDRequestRef& mdr, CDir *dir, const string& dname, bool okexist=false);
+  CDentry *prepare_stray_dentry(MDRequestRef& mdr, CInode *in);
+  CInode* prepare_new_inode(MDRequestRef& mdr, CDir *dir, inodeno_t useino, unsigned mode,
 			    ceph_file_layout *layout=NULL);
-  void journal_allocated_inos(MDRequest *mdr, EMetaBlob *blob);
-  void apply_allocated_inos(MDRequest *mdr);
+  void journal_allocated_inos(MDRequestRef& mdr, EMetaBlob *blob);
+  void apply_allocated_inos(MDRequestRef& mdr);
 
-  CInode* rdlock_path_pin_ref(MDRequest *mdr, int n, set<SimpleLock*>& rdlocks, bool want_auth,
+  CInode* rdlock_path_pin_ref(MDRequestRef& mdr, int n, set<SimpleLock*>& rdlocks, bool want_auth,
 			      bool no_want_auth=false,
 			      ceph_file_layout **layout=NULL,
 			      bool no_lookup=false);
-  CDentry* rdlock_path_xlock_dentry(MDRequest *mdr, int n, set<SimpleLock*>& rdlocks, set<SimpleLock*>& wrlocks, 
-				    set<SimpleLock*>& xlocks, bool okexist, bool mustexist, bool alwaysxlock,
+  CDentry* rdlock_path_xlock_dentry(MDRequestRef& mdr, int n,
+                                    set<SimpleLock*>& rdlocks,
+                                    set<SimpleLock*>& wrlocks,
+				    set<SimpleLock*>& xlocks, bool okexist,
+				    bool mustexist, bool alwaysxlock,
 				    ceph_file_layout **layout=NULL);
 
-  CDir* try_open_auth_dirfrag(CInode *diri, frag_t fg, MDRequest *mdr);
+  CDir* try_open_auth_dirfrag(CInode *diri, frag_t fg, MDRequestRef& mdr);
 
 
   // requests on existing inodes.
-  void handle_client_getattr(MDRequest *mdr, bool is_lookup);
-  void handle_client_lookup_ino(MDRequest *mdr, bool want_parent, bool want_dentry);
-  void _lookup_ino_2(MDRequest *mdr, int r);
-  void handle_client_readdir(MDRequest *mdr);
-  void handle_client_file_setlock(MDRequest *mdr);
-  void handle_client_file_readlock(MDRequest *mdr);
-
-  void handle_client_setattr(MDRequest *mdr);
-  void handle_client_setlayout(MDRequest *mdr);
-  void handle_client_setdirlayout(MDRequest *mdr);
+  void handle_client_getattr(MDRequestRef& mdr, bool is_lookup);
+  void handle_client_lookup_ino(MDRequestRef& mdr,
+				bool want_parent, bool want_dentry);
+  void _lookup_ino_2(MDRequestRef& mdr, int r);
+  void handle_client_readdir(MDRequestRef& mdr);
+  void handle_client_file_setlock(MDRequestRef& mdr);
+  void handle_client_file_readlock(MDRequestRef& mdr);
+
+  void handle_client_setattr(MDRequestRef& mdr);
+  void handle_client_setlayout(MDRequestRef& mdr);
+  void handle_client_setdirlayout(MDRequestRef& mdr);
 
   int parse_layout_vxattr(string name, string value, ceph_file_layout *layout);
-  void handle_set_vxattr(MDRequest *mdr, CInode *cur,
+  void handle_set_vxattr(MDRequestRef& mdr, CInode *cur,
 			 ceph_file_layout *dir_layout,
 			 set<SimpleLock*> rdlocks,
 			 set<SimpleLock*> wrlocks,
 			 set<SimpleLock*> xlocks);
-  void handle_remove_vxattr(MDRequest *mdr, CInode *cur,
+  void handle_remove_vxattr(MDRequestRef& mdr, CInode *cur,
 			    set<SimpleLock*> rdlocks,
 			    set<SimpleLock*> wrlocks,
 			    set<SimpleLock*> xlocks);
-  void handle_client_setxattr(MDRequest *mdr);
-  void handle_client_removexattr(MDRequest *mdr);
+  void handle_client_setxattr(MDRequestRef& mdr);
+  void handle_client_removexattr(MDRequestRef& mdr);
 
-  void handle_client_fsync(MDRequest *mdr);
+  void handle_client_fsync(MDRequestRef& mdr);
 
   // open
-  void handle_client_open(MDRequest *mdr);
-  void handle_client_openc(MDRequest *mdr);  // O_CREAT variant.
-  void do_open_truncate(MDRequest *mdr, int cmode);  // O_TRUNC variant.
+  void handle_client_open(MDRequestRef& mdr);
+  void handle_client_openc(MDRequestRef& mdr);  // O_CREAT variant.
+  void do_open_truncate(MDRequestRef& mdr, int cmode);  // O_TRUNC variant.
 
   // namespace changes
-  void handle_client_mknod(MDRequest *mdr);
-  void handle_client_mkdir(MDRequest *mdr);
-  void handle_client_symlink(MDRequest *mdr);
+  void handle_client_mknod(MDRequestRef& mdr);
+  void handle_client_mkdir(MDRequestRef& mdr);
+  void handle_client_symlink(MDRequestRef& mdr);
 
   // link
-  void handle_client_link(MDRequest *mdr);
-  void _link_local(MDRequest *mdr, CDentry *dn, CInode *targeti);
-  void _link_local_finish(MDRequest *mdr,
+  void handle_client_link(MDRequestRef& mdr);
+  void _link_local(MDRequestRef& mdr, CDentry *dn, CInode *targeti);
+  void _link_local_finish(MDRequestRef& mdr,
 			  CDentry *dn, CInode *targeti,
 			  version_t, version_t);
 
-  void _link_remote(MDRequest *mdr, bool inc, CDentry *dn, CInode *targeti);
-  void _link_remote_finish(MDRequest *mdr, bool inc, CDentry *dn, CInode *targeti,
+  void _link_remote(MDRequestRef& mdr, bool inc, CDentry *dn, CInode *targeti);
+  void _link_remote_finish(MDRequestRef& mdr, bool inc, CDentry *dn, CInode *targeti,
 			   version_t);
 
-  void handle_slave_link_prep(MDRequest *mdr);
-  void _logged_slave_link(MDRequest *mdr, CInode *targeti);
-  void _commit_slave_link(MDRequest *mdr, int r, CInode *targeti);
-  void _committed_slave(MDRequest *mdr);  // use for rename, too
-  void handle_slave_link_prep_ack(MDRequest *mdr, MMDSSlaveRequest *m);
-  void do_link_rollback(bufferlist &rbl, int master, MDRequest *mdr);
-  void _link_rollback_finish(Mutation *mut, MDRequest *mdr);
+  void handle_slave_link_prep(MDRequestRef& mdr);
+  void _logged_slave_link(MDRequestRef& mdr, CInode *targeti);
+  void _commit_slave_link(MDRequestRef& mdr, int r, CInode *targeti);
+  void _committed_slave(MDRequestRef& mdr);  // use for rename, too
+  void handle_slave_link_prep_ack(MDRequestRef& mdr, MMDSSlaveRequest *m);
+  void do_link_rollback(bufferlist &rbl, int master, MDRequestRef& mdr);
+  void _link_rollback_finish(MutationRef& mut, MDRequestRef& mdr);
 
   // unlink
-  void handle_client_unlink(MDRequest *mdr);
-  bool _dir_is_nonempty_unlocked(MDRequest *mdr, CInode *rmdiri);
-  bool _dir_is_nonempty(MDRequest *mdr, CInode *rmdiri);
-  void _unlink_local(MDRequest *mdr, CDentry *dn, CDentry *straydn);
-  void _unlink_local_finish(MDRequest *mdr, 
+  void handle_client_unlink(MDRequestRef& mdr);
+  bool _dir_is_nonempty_unlocked(MDRequestRef& mdr, CInode *rmdiri);
+  bool _dir_is_nonempty(MDRequestRef& mdr, CInode *rmdiri);
+  void _unlink_local(MDRequestRef& mdr, CDentry *dn, CDentry *straydn);
+  void _unlink_local_finish(MDRequestRef& mdr,
 			    CDentry *dn, CDentry *straydn,
 			    version_t);
-  bool _rmdir_prepare_witness(MDRequest *mdr, int who, CDentry *dn, CDentry *straydn);
-  void handle_slave_rmdir_prep(MDRequest *mdr);
-  void _logged_slave_rmdir(MDRequest *mdr, CDentry *srcdn, CDentry *straydn);
-  void _commit_slave_rmdir(MDRequest *mdr, int r);
-  void handle_slave_rmdir_prep_ack(MDRequest *mdr, MMDSSlaveRequest *ack);
-  void do_rmdir_rollback(bufferlist &rbl, int master, MDRequest *mdr);
-  void _rmdir_rollback_finish(MDRequest *mdr, metareqid_t reqid, CDentry *dn, CDentry *straydn);
+  bool _rmdir_prepare_witness(MDRequestRef& mdr, int who, CDentry *dn, CDentry *straydn);
+  void handle_slave_rmdir_prep(MDRequestRef& mdr);
+  void _logged_slave_rmdir(MDRequestRef& mdr, CDentry *srcdn, CDentry *straydn);
+  void _commit_slave_rmdir(MDRequestRef& mdr, int r);
+  void handle_slave_rmdir_prep_ack(MDRequestRef& mdr, MMDSSlaveRequest *ack);
+  void do_rmdir_rollback(bufferlist &rbl, int master, MDRequestRef& mdr);
+  void _rmdir_rollback_finish(MDRequestRef& mdr, metareqid_t reqid, CDentry *dn, CDentry *straydn);
 
   // rename
-  void handle_client_rename(MDRequest *mdr);
-  void _rename_finish(MDRequest *mdr,
+  void handle_client_rename(MDRequestRef& mdr);
+  void _rename_finish(MDRequestRef& mdr,
 		      CDentry *srcdn, CDentry *destdn, CDentry *straydn);
 
-  void handle_client_lssnap(MDRequest *mdr);
-  void handle_client_mksnap(MDRequest *mdr);
-  void _mksnap_finish(MDRequest *mdr, CInode *diri, SnapInfo &info);
-  void handle_client_rmsnap(MDRequest *mdr);
-  void _rmsnap_finish(MDRequest *mdr, CInode *diri, snapid_t snapid);
+  void handle_client_lssnap(MDRequestRef& mdr);
+  void handle_client_mksnap(MDRequestRef& mdr);
+  void _mksnap_finish(MDRequestRef& mdr, CInode *diri, SnapInfo &info);
+  void handle_client_rmsnap(MDRequestRef& mdr);
+  void _rmsnap_finish(MDRequestRef& mdr, CInode *diri, snapid_t snapid);
 
   // helpers
-  bool _rename_prepare_witness(MDRequest *mdr, int who, set<int> &witnesse,
+  bool _rename_prepare_witness(MDRequestRef& mdr, int who, set<int> &witnesse,
 			       CDentry *srcdn, CDentry *destdn, CDentry *straydn);
-  version_t _rename_prepare_import(MDRequest *mdr, CDentry *srcdn, bufferlist *client_map_bl);
+  version_t _rename_prepare_import(MDRequestRef& mdr, CDentry *srcdn, bufferlist *client_map_bl);
   bool _need_force_journal(CInode *diri, bool empty);
-  void _rename_prepare(MDRequest *mdr,
+  void _rename_prepare(MDRequestRef& mdr,
 		       EMetaBlob *metablob, bufferlist *client_map_bl,
 		       CDentry *srcdn, CDentry *destdn, CDentry *straydn);
   /* set not_journaling=true if you're going to discard the results --
    * this bypasses the asserts to make sure we're journaling the right
    * things on the right nodes */
-  void _rename_apply(MDRequest *mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn); 
+  void _rename_apply(MDRequestRef& mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn);
 
   // slaving
-  void handle_slave_rename_prep(MDRequest *mdr);
-  void handle_slave_rename_prep_ack(MDRequest *mdr, MMDSSlaveRequest *m);
-  void handle_slave_rename_notify_ack(MDRequest *mdr, MMDSSlaveRequest *m);
-  void _slave_rename_sessions_flushed(MDRequest *mdr);
-  void _logged_slave_rename(MDRequest *mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn);
-  void _commit_slave_rename(MDRequest *mdr, int r, CDentry *srcdn, CDentry *destdn, CDentry *straydn);
-  void do_rename_rollback(bufferlist &rbl, int master, MDRequest *mdr, bool finish_mdr=false);
-  void _rename_rollback_finish(Mutation *mut, MDRequest *mdr, CDentry *srcdn, version_t srcdnpv,
+  void handle_slave_rename_prep(MDRequestRef& mdr);
+  void handle_slave_rename_prep_ack(MDRequestRef& mdr, MMDSSlaveRequest *m);
+  void handle_slave_rename_notify_ack(MDRequestRef& mdr, MMDSSlaveRequest *m);
+  void _slave_rename_sessions_flushed(MDRequestRef& mdr);
+  void _logged_slave_rename(MDRequestRef& mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn);
+  void _commit_slave_rename(MDRequestRef& mdr, int r, CDentry *srcdn, CDentry *destdn, CDentry *straydn);
+  void do_rename_rollback(bufferlist &rbl, int master, MDRequestRef& mdr, bool finish_mdr=false);
+  void _rename_rollback_finish(MutationRef& mut, MDRequestRef& mdr, CDentry *srcdn, version_t srcdnpv,
 			       CDentry *destdn, CDentry *staydn, bool finish_mdr);
 
 };
diff --git a/src/mds/SessionMap.h b/src/mds/SessionMap.h
index e06a7b7..ac7fd46 100644
--- a/src/mds/SessionMap.h
+++ b/src/mds/SessionMap.h
@@ -27,7 +27,7 @@ using std::set;
 #include "mdstypes.h"
 
 class CInode;
-struct MDRequest;
+struct MDRequestImpl;
 
 #include "CInode.h"
 #include "Capability.h"
@@ -88,7 +88,7 @@ public:
 
   list<Message*> preopen_out_queue;  ///< messages for client, queued before they connect
 
-  elist<MDRequest*> requests;
+  elist<MDRequestImpl*> requests;
 
   interval_set<inodeno_t> pending_prealloc_inos; // journaling prealloc, will be added to prealloc_inos
 
diff --git a/src/mds/SimpleLock.h b/src/mds/SimpleLock.h
index 4b9b7f5..cbaa996 100644
--- a/src/mds/SimpleLock.h
+++ b/src/mds/SimpleLock.h
@@ -38,7 +38,9 @@ inline const char *get_lock_type_name(int t) {
   }
 }
 
-struct Mutation;
+#include "include/memory.h"
+struct MutationImpl;
+typedef ceph::shared_ptr<MutationImpl> MutationRef;
 
 extern "C" {
 #include "locks.h"
@@ -166,7 +168,7 @@ private:
 
     // local state
     int num_wrlock, num_xlock;
-    Mutation *xlock_by;
+    MutationRef xlock_by;
     client_t xlock_by_client;
     client_t excl_client;
 
@@ -175,14 +177,14 @@ private:
 	gather_set.empty() &&
 	num_wrlock == 0 &&
 	num_xlock == 0 &&
-	xlock_by == NULL &&
+	xlock_by.get() == NULL &&
 	xlock_by_client == -1 &&
 	excl_client == -1;
     }
 
     unstable_bits_t() : num_wrlock(0),
 			num_xlock(0),
-			xlock_by(NULL),
+			xlock_by(),
 			xlock_by_client(-1),
 			excl_client(-1) {}
   };
@@ -476,7 +478,7 @@ public:
   }
 
   // xlock
-  void get_xlock(Mutation *who, client_t client) { 
+  void get_xlock(MutationRef who, client_t client) { 
     assert(get_xlock_by() == 0);
     assert(state == LOCK_XLOCK || is_locallock() ||
 	   state == LOCK_LOCK /* if we are a slave */);
@@ -491,7 +493,7 @@ public:
 	   state == LOCK_LOCK /* if we are a slave */);
     if (!is_locallock())
       state = LOCK_XLOCKDONE;
-    more()->xlock_by = 0;
+    more()->xlock_by.reset();
   }
   void put_xlock() {
     assert(state == LOCK_XLOCK || state == LOCK_XLOCKDONE || is_locallock() ||
@@ -499,7 +501,7 @@ public:
     --more()->num_xlock;
     parent->put(MDSCacheObject::PIN_LOCK);
     if (more()->num_xlock == 0) {
-      more()->xlock_by = 0;
+      more()->xlock_by.reset();
       more()->xlock_by_client = -1;
       try_clear_more();
     }
@@ -516,11 +518,8 @@ public:
   bool is_xlocked_by_client(client_t c) const {
     return have_more() ? more()->xlock_by_client == c : false;
   }
-  Mutation *get_xlock_by() {
-    return have_more() ? more()->xlock_by : NULL;
-  }
-  const Mutation *get_xlock_by() const {
-    return have_more() ? more()->xlock_by : NULL;
+  MutationRef get_xlock_by() const {
+    return have_more() ? more()->xlock_by : MutationRef();
   }
   
   // lease
diff --git a/src/mds/journal.cc b/src/mds/journal.cc
index 2bf24c3..2b8bef0 100644
--- a/src/mds/journal.cc
+++ b/src/mds/journal.cc
@@ -62,7 +62,7 @@
 // -----------------------
 // LogSegment
 
-void LogSegment::try_to_expire(MDS *mds, C_GatherBuilder &gather_bld)
+void LogSegment::try_to_expire(MDS *mds, C_GatherBuilder &gather_bld, int op_prio)
 {
   set<CDir*> commit;
 
@@ -103,7 +103,7 @@ void LogSegment::try_to_expire(MDS *mds, C_GatherBuilder &gather_bld)
       assert(dir->is_auth());
       if (dir->can_auth_pin()) {
 	dout(15) << "try_to_expire committing " << *dir << dendl;
-	dir->commit(0, gather_bld.new_sub());
+	dir->commit(0, gather_bld.new_sub(), false, op_prio);
       } else {
 	dout(15) << "try_to_expire waiting for unfreeze on " << *dir << dendl;
 	dir->add_waiter(CDir::WAIT_UNFREEZE, gather_bld.new_sub());
diff --git a/src/messages/MAuthReply.h b/src/messages/MAuthReply.h
index 76359a5..5fea5a5 100644
--- a/src/messages/MAuthReply.h
+++ b/src/messages/MAuthReply.h
@@ -16,6 +16,7 @@
 #define CEPH_MAUTHREPLY_H
 
 #include "msg/Message.h"
+#include "common/errno.h"
 
 struct MAuthReply : public Message {
   __u32 protocol;
@@ -38,8 +39,7 @@ private:
 public:
   const char *get_type_name() const { return "auth_reply"; }
   void print(ostream& o) const {
-    char buf[80];
-    o << "auth_reply(proto " << protocol << " " << result << " " << strerror_r(-result, buf, sizeof(buf));
+    o << "auth_reply(proto " << protocol << " " << result << " " << cpp_strerror(result);
     if (result_msg.length())
       o << ": " << result_msg;
     o << ")";
diff --git a/src/messages/MClientReply.h b/src/messages/MClientReply.h
index e9abb85..6507fa6 100644
--- a/src/messages/MClientReply.h
+++ b/src/messages/MClientReply.h
@@ -21,6 +21,7 @@
 
 #include "msg/Message.h"
 #include "include/ceph_features.h"
+#include "common/errno.h"
 
 #include <vector>
 using namespace std;
@@ -229,8 +230,7 @@ public:
     o << "client_reply(???:" << get_tid();
     o << " = " << get_result();
     if (get_result() <= 0) {
-      char buf[80];
-      o << " " << strerror_r(-get_result(), buf, sizeof(buf));
+      o << " " << cpp_strerror(get_result());
     }
     if (head.op & CEPH_MDS_OP_WRITE) {
       if (head.safe)
diff --git a/src/messages/MMonProbe.h b/src/messages/MMonProbe.h
index 4d01d56..2f8a60b 100644
--- a/src/messages/MMonProbe.h
+++ b/src/messages/MMonProbe.h
@@ -22,7 +22,7 @@
 
 class MMonProbe : public Message {
 public:
-  static const int HEAD_VERSION = 5;
+  static const int HEAD_VERSION = 6;
   static const int COMPAT_VERSION = 5;
 
   enum {
@@ -31,6 +31,7 @@ public:
     OP_SLURP = 3,
     OP_SLURP_LATEST = 4,
     OP_DATA = 5,
+    OP_MISSING_FEATURES = 6,
   };
 
   static const char *get_opname(int o) {
@@ -40,6 +41,7 @@ public:
     case OP_SLURP: return "slurp";
     case OP_SLURP_LATEST: return "slurp_latest";
     case OP_DATA: return "data";
+    case OP_MISSING_FEATURES: return "missing_features";
     default: assert(0); return 0;
     }
   }
@@ -52,6 +54,7 @@ public:
   version_t paxos_first_version;
   version_t paxos_last_version;
   bool has_ever_joined;
+  uint64_t required_features;
 
   MMonProbe()
     : Message(MSG_MON_PROBE, HEAD_VERSION, COMPAT_VERSION) {}
@@ -62,7 +65,8 @@ public:
       name(n),
       paxos_first_version(0),
       paxos_last_version(0),
-      has_ever_joined(hej) {}
+      has_ever_joined(hej),
+      required_features(0) {}
 private:
   ~MMonProbe() {}
 
@@ -80,6 +84,8 @@ public:
     }
     if (!has_ever_joined)
       out << " new";
+    if (required_features)
+      out << " required_features " << required_features;
     out << ")";
   }
   
@@ -100,6 +106,7 @@ public:
     ::encode(has_ever_joined, payload);
     ::encode(paxos_first_version, payload);
     ::encode(paxos_last_version, payload);
+    ::encode(required_features, payload);
   }
   void decode_payload() {
     bufferlist::iterator p = payload.begin();
@@ -111,6 +118,10 @@ public:
     ::decode(has_ever_joined, p);
     ::decode(paxos_first_version, p);
     ::decode(paxos_last_version, p);
+    if (header.version >= 6)
+      ::decode(required_features, p);
+    else
+      required_features = 0;
   }
 };
 
diff --git a/src/messages/MOSDOpReply.h b/src/messages/MOSDOpReply.h
index 9739ae1..91c50e7 100644
--- a/src/messages/MOSDOpReply.h
+++ b/src/messages/MOSDOpReply.h
@@ -20,6 +20,7 @@
 
 #include "MOSDOp.h"
 #include "os/ObjectStore.h"
+#include "common/errno.h"
 
 /*
  * OSD op reply
@@ -262,8 +263,7 @@ public:
       out << " ack";
     out << " = " << get_result();
     if (get_result() < 0) {
-      char buf[80];
-      out << " (" << strerror_r(-get_result(), buf, sizeof(buf)) << ")";
+      out << " (" << cpp_strerror(get_result()) << ")";
     }
     if (is_redirect_reply()) {
       out << " redirect: { " << redirect << " }";
diff --git a/src/messages/MOSDSubOp.h b/src/messages/MOSDSubOp.h
index 7c9c504..6a38186 100644
--- a/src/messages/MOSDSubOp.h
+++ b/src/messages/MOSDSubOp.h
@@ -25,7 +25,7 @@
 
 class MOSDSubOp : public Message {
 
-  static const int HEAD_VERSION = 9;
+  static const int HEAD_VERSION = 10;
   static const int COMPAT_VERSION = 1;
 
 public:
@@ -90,6 +90,9 @@ public:
   hobject_t new_temp_oid;      ///< new temp object that we must now start tracking
   hobject_t discard_temp_oid;  ///< previously used temp object that we can now stop tracking
 
+  /// non-empty if this transaction involves a hit_set history update
+  boost::optional<pg_hit_set_history_t> updated_hit_set_history;
+
   int get_cost() const {
     if (ops.size() == 1 && ops[0].op.op == CEPH_OSD_OP_PULL)
       return ops[0].op.extent.length;
@@ -169,6 +172,9 @@ public:
 	ghobject_t::NO_SHARD);
       pgid.shard = ghobject_t::NO_SHARD;
     }
+    if (header.version >= 10) {
+      ::decode(updated_hit_set_history, p);
+    }
   }
 
   virtual void encode_payload(uint64_t features) {
@@ -217,6 +223,7 @@ public:
     ::encode(discard_temp_oid, payload);
     ::encode(from, payload);
     ::encode(pgid.shard, payload);
+    ::encode(updated_hit_set_history, payload);
   }
 
   MOSDSubOp()
@@ -258,6 +265,8 @@ public:
     out << " v " << version
 	<< " snapset=" << snapset << " snapc=" << snapc;    
     if (!data_subset.empty()) out << " subset " << data_subset;
+    if (updated_hit_set_history)
+      out << ", has_updated_hit_set_history";
     out << ")";
   }
 };
diff --git a/src/mon/ConfigKeyService.cc b/src/mon/ConfigKeyService.cc
index 901f9bf..d2e204d 100644
--- a/src/mon/ConfigKeyService.cc
+++ b/src/mon/ConfigKeyService.cc
@@ -23,6 +23,7 @@
 
 #include "common/config.h"
 #include "common/cmdparse.h"
+#include "common/errno.h"
 
 #define dout_subsys ceph_subsys_mon
 #undef dout_prefix
diff --git a/src/mon/DataHealthService.cc b/src/mon/DataHealthService.cc
index 9b707cd..8dda929 100644
--- a/src/mon/DataHealthService.cc
+++ b/src/mon/DataHealthService.cc
@@ -38,6 +38,7 @@
 #include "include/Context.h"
 #include "include/assert.h"
 #include "common/Formatter.h"
+#include "common/errno.h"
 
 #include "mon/Monitor.h"
 #include "mon/QuorumService.h"
diff --git a/src/mon/Elector.cc b/src/mon/Elector.cc
index 3572c8b..2ee10f7 100644
--- a/src/mon/Elector.cc
+++ b/src/mon/Elector.cc
@@ -75,7 +75,6 @@ void Elector::start()
 
   acked_me.clear();
   classic_mons.clear();
-  required_features = mon->apply_compatset_features_to_quorum_requirements();
   init();
   
   // start by trying to elect me
@@ -214,6 +213,7 @@ void Elector::handle_propose(MMonElection *m)
   int from = m->get_source().num();
 
   assert(m->epoch % 2 == 1); // election
+  uint64_t required_features = mon->get_required_features();
   if ((required_features ^ m->get_connection()->get_features()) &
       required_features) {
     dout(5) << " ignoring propose from mon" << from
@@ -278,6 +278,13 @@ void Elector::handle_ack(MMonElection *m)
     return;
   }
   assert(m->epoch == epoch);
+  uint64_t required_features = mon->get_required_features();
+  if ((required_features ^ m->get_connection()->get_features()) &
+      required_features) {
+    dout(5) << " ignoring ack from mon" << from
+	    << " without required features" << dendl;
+    return;
+  }
   
   if (electing_me) {
     // thanks
@@ -349,7 +356,7 @@ void Elector::nak_old_peer(MMonElection *m)
   uint64_t supported_features = m->get_connection()->get_features();
 
   if (supported_features & CEPH_FEATURE_OSDMAP_ENC) {
-    uint64_t required_features = mon->apply_compatset_features_to_quorum_requirements();
+    uint64_t required_features = mon->get_required_features();
     dout(10) << "sending nak to peer " << m->get_source()
 	     << " that only supports " << supported_features
 	     << " of the required " << required_features << dendl;
diff --git a/src/mon/Elector.h b/src/mon/Elector.h
index b88e830..007fc11 100644
--- a/src/mon/Elector.h
+++ b/src/mon/Elector.h
@@ -115,8 +115,6 @@ class Elector {
    */
   map<int, uint64_t> acked_me;
   set<int> classic_mons;
-  /// features which a monitor must hold for us to defer to them
-  uint64_t required_features;
   /**
    * @}
    */
@@ -354,7 +352,6 @@ class Elector {
 			epoch(0),
 			participating(true),
 			electing_me(false),
-			required_features(0),
 			leader_acked(-1) { }
 
   /**
diff --git a/src/mon/MonClient.cc b/src/mon/MonClient.cc
index fcb622b..f30be1b 100644
--- a/src/mon/MonClient.cc
+++ b/src/mon/MonClient.cc
@@ -504,6 +504,7 @@ void MonClient::handle_auth(MAuthReply *m)
   if (ret == -EAGAIN) {
     MAuth *ma = new MAuth;
     ma->protocol = auth->get_protocol();
+    auth->prepare_build_request();
     ret = auth->build_request(ma->auth_payload);
     _send_mon_message(ma, true);
     return;
@@ -718,9 +719,6 @@ void MonClient::tick()
     }
   }
 
-  if (auth)
-    auth->tick();
-
   schedule_tick();
 }
 
@@ -780,6 +778,7 @@ int MonClient::_check_auth_tickets()
       ldout(cct, 10) << "_check_auth_tickets getting new tickets!" << dendl;
       MAuth *m = new MAuth;
       m->protocol = auth->get_protocol();
+      auth->prepare_build_request();
       auth->build_request(m->auth_payload);
       _send_mon_message(m);
     }
diff --git a/src/mon/MonCommands.h b/src/mon/MonCommands.h
index a8e138a..9fb83e2 100644
--- a/src/mon/MonCommands.h
+++ b/src/mon/MonCommands.h
@@ -163,31 +163,31 @@ COMMAND("auth print_key name=entity,type=CephString", "display requested key", \
 	"auth", "rx", "cli,rest")
 COMMAND("auth list", "list authentication state", "auth", "rx", "cli,rest")
 COMMAND("auth import", "auth import: read keyring file from -i <file>", \
-	"auth", "rw", "cli,rest")
+	"auth", "rwx", "cli,rest")
 COMMAND("auth add " \
 	"name=entity,type=CephString " \
 	"name=caps,type=CephString,n=N,req=false", \
 	"add auth info for <entity> from input file, or random key if no input given, and/or any caps specified in the command",
-	"auth", "rw", "cli,rest")
+	"auth", "rwx", "cli,rest")
 COMMAND("auth get-or-create-key " \
 	"name=entity,type=CephString " \
 	"name=caps,type=CephString,n=N,req=false", \
 	"get, or add, key for <name> from system/caps pairs specified in the command.  If key already exists, any given caps must match the existing caps for that key.", \
-	"auth", "rw", "cli,rest")
+	"auth", "rwx", "cli,rest")
 COMMAND("auth get-or-create " \
 	"name=entity,type=CephString " \
 	"name=caps,type=CephString,n=N,req=false", \
 	"add auth info for <entity> from input file, or random key if no input given, and/or any caps specified in the command", \
-	"auth", "rw", "cli,rest")
+	"auth", "rwx", "cli,rest")
 COMMAND("auth caps " \
 	"name=entity,type=CephString " \
 	"name=caps,type=CephString,n=N", \
 	"update caps for <name> from caps specified in the command", \
-	"auth", "rw", "cli,rest")
+	"auth", "rwx", "cli,rest")
 COMMAND("auth del " \
 	"name=entity,type=CephString", \
 	"delete all caps for <name>", \
-	"auth", "rw", "cli,rest")
+	"auth", "rwx", "cli,rest")
 
 /*
  * Monitor commands (Monitor.cc)
@@ -557,7 +557,8 @@ COMMAND("osd pool get " \
 COMMAND("osd pool set " \
 	"name=pool,type=CephPoolname " \
 	"name=var,type=CephChoices,strings=size|min_size|crash_replay_interval|pg_num|pgp_num|crush_ruleset|hashpspool|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|debug_fake_ec_pool|target_max_bytes|target_max_objects|cache_target_dirty_ratio|cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age " \
-	"name=val,type=CephString", \
+	"name=val,type=CephString " \
+	"name=force,type=CephChoices,strings=--yes-i-really-mean-it,req=false", \
 	"set pool parameter <var> to <val>", "osd", "rw", "cli,rest")
 // 'val' is a CephString because it can include a unit.  Perhaps
 // there should be a Python type for validation/conversion of strings
diff --git a/src/mon/Monitor.cc b/src/mon/Monitor.cc
index 7e29e14..0982327 100644
--- a/src/mon/Monitor.cc
+++ b/src/mon/Monitor.cc
@@ -156,6 +156,7 @@ Monitor::Monitor(CephContext* cct_, string nm, MonitorDBStore *s,
   state(STATE_PROBING),
   
   elector(this),
+  required_features(0),
   leader(0),
   quorum_features(0),
   scrub_version(0),
@@ -197,6 +198,15 @@ Monitor::Monitor(CephContext* cct_, string nm, MonitorDBStore *s,
   assert(r);
 
   exited_quorum = ceph_clock_now(g_ceph_context);
+
+  // assume our commands until we have an election.  this only means
+  // we won't reply with EINVAL before the election; any command that
+  // actually matters will wait until we have quorum etc and then
+  // retry (and revalidate).
+  const MonCommand *cmds;
+  int cmdsize;
+  get_locally_supported_monitor_commands(&cmds, &cmdsize);
+  set_leader_supported_commands(cmds, cmdsize);
 }
 
 PaxosService *Monitor::get_paxos_service_by_name(const string& name)
@@ -364,6 +374,9 @@ void Monitor::read_features()
 {
   read_features_off_disk(store, &features);
   dout(10) << "features " << features << dendl;
+
+  apply_compatset_features_to_quorum_requirements();
+  dout(10) << "required_features " << required_features << dendl;
 }
 
 void Monitor::write_features(MonitorDBStore::Transaction &t)
@@ -447,6 +460,16 @@ int Monitor::preinit()
       dout(10) << " monmap is " << *monmap << dendl;
       dout(10) << " extra probe peers " << extra_probe_peers << dendl;
     }
+  } else if (!monmap->contains(name)) {
+    derr << "not in monmap and have been in a quorum before; "
+         << "must have been removed" << dendl;
+    if (g_conf->mon_force_quorum_join) {
+      dout(0) << "we should have died but "
+              << "'mon_force_quorum_join' is set -- allowing boot" << dendl;
+    } else {
+      derr << "commit suicide!" << dendl;
+      return -ENOENT;
+    }
   }
 
   {
@@ -1085,6 +1108,16 @@ void Monitor::handle_sync_get_cookie(MMonSync *m)
 
   assert(g_conf->mon_sync_provider_kill_at != 1);
 
+  // make sure they can understand us.
+  if ((required_features ^ m->get_connection()->get_features()) &
+      required_features) {
+    dout(5) << " ignoring peer mon." << m->get_source().num()
+	    << " has features " << std::hex
+	    << m->get_connection()->get_features()
+	    << " but we require " << required_features << std::dec << dendl;
+    return;
+  }
+
   // make up a unique cookie.  include election epoch (which persists
   // across restarts for the whole cluster) and a counter for this
   // process instance.  there is no need to be unique *across*
@@ -1341,6 +1374,13 @@ void Monitor::handle_probe(MMonProbe *m)
     handle_probe_reply(m);
     break;
 
+  case MMonProbe::OP_MISSING_FEATURES:  // peer refused our probe: we lack features it requires
+    derr << __func__ << " missing features, have " << CEPH_FEATURES_ALL
+	 << ", required " << m->required_features
+	 << ", missing " << (m->required_features & ~CEPH_FEATURES_ALL)
+	 << dendl;
+    break;
+
   default:
     m->put();
   }
@@ -1351,8 +1391,24 @@ void Monitor::handle_probe(MMonProbe *m)
  */
 void Monitor::handle_probe_probe(MMonProbe *m)
 {
-  dout(10) << "handle_probe_probe " << m->get_source_inst() << *m << dendl;
-  MMonProbe *r = new MMonProbe(monmap->fsid, MMonProbe::OP_REPLY, name, has_ever_joined);
+  dout(10) << "handle_probe_probe " << m->get_source_inst() << *m
+	   << " features " << m->get_connection()->get_features() << dendl;
+  uint64_t missing = required_features & ~m->get_connection()->get_features();
+  if (missing) {
+    dout(1) << " peer " << m->get_source_addr() << " missing features "
+	    << missing << dendl;
+    if (m->get_connection()->has_feature(CEPH_FEATURE_OSD_PRIMARY_AFFINITY)) {
+      MMonProbe *r = new MMonProbe(monmap->fsid, MMonProbe::OP_MISSING_FEATURES,
+				   name, has_ever_joined);
+      r->required_features = required_features;  // set on the reply we send; m is the incoming probe and is put below
+      messenger->send_message(r, m->get_connection());
+    }
+    m->put();
+    return;
+  }
+
+  MMonProbe *r = new MMonProbe(monmap->fsid, MMonProbe::OP_REPLY,
+			       name, has_ever_joined);
   r->name = name;
   r->quorum = quorum;
   monmap->encode(r->monmap_bl, m->get_connection()->get_features());
@@ -1362,9 +1418,11 @@ void Monitor::handle_probe_probe(MMonProbe *m)
 
   // did we discover a peer here?
   if (!monmap->contains(m->get_source_addr())) {
-    dout(1) << " adding peer " << m->get_source_addr() << " to list of hints" << dendl;
+    dout(1) << " adding peer " << m->get_source_addr()
+	    << " to list of hints" << dendl;
     extra_probe_peers.insert(m->get_source_addr());
   }
+
   m->put();
 }
 
@@ -1661,24 +1719,27 @@ void Monitor::apply_quorum_to_compatset_features()
 
   if (new_features.compare(features) != 0) {
     CompatSet diff = features.unsupported(new_features);
-    dout(1) << "Enabling new quorum features: " << diff << dendl;
+    dout(1) << __func__ << " enabling new quorum features: " << diff << dendl;
     features = new_features;
+
     MonitorDBStore::Transaction t;
     write_features(t);
     store->apply_transaction(t);
+
+    apply_compatset_features_to_quorum_requirements();
   }
 }
 
-uint64_t Monitor::apply_compatset_features_to_quorum_requirements()
+void Monitor::apply_compatset_features_to_quorum_requirements()
 {
-  uint64_t required_features = 0;
+  required_features = 0;
   if (features.incompat.contains(CEPH_MON_FEATURE_INCOMPAT_OSD_ERASURE_CODES)) {
     required_features |= CEPH_FEATURE_OSD_ERASURE_CODES;
   }
   if (features.incompat.contains(CEPH_MON_FEATURE_INCOMPAT_OSDMAP_ENC)) {
     required_features |= CEPH_FEATURE_OSDMAP_ENC;
   }
-  return required_features;
+  dout(10) << __func__ << " required_features " << required_features << dendl;
 }
 
 void Monitor::sync_force(Formatter *f, ostream& ss)
@@ -3485,6 +3546,7 @@ void Monitor::handle_subscribe(MMonSubscribe *m)
 void Monitor::handle_get_version(MMonGetVersion *m)
 {
   dout(10) << "handle_get_version " << *m << dendl;
+  PaxosService *svc = NULL;
 
   MonSession *s = static_cast<MonSession *>(m->get_connection()->get_priv());
   if (!s) {
@@ -3496,28 +3558,35 @@ void Monitor::handle_get_version(MMonGetVersion *m)
   if (!is_leader() && !is_peon()) {
     dout(10) << " waiting for quorum" << dendl;
     waitfor_quorum.push_back(new C_RetryMessage(this, m));
-    return;
+    goto out;
   }
 
-  MMonGetVersionReply *reply = new MMonGetVersionReply();
-  reply->handle = m->handle;
   if (m->what == "mdsmap") {
-    reply->version = mdsmon()->mdsmap.get_epoch();
-    reply->oldest_version = mdsmon()->get_first_committed();
+    svc = mdsmon();
   } else if (m->what == "osdmap") {
-    reply->version = osdmon()->osdmap.get_epoch();
-    reply->oldest_version = osdmon()->get_first_committed();
+    svc = osdmon();
   } else if (m->what == "monmap") {
-    reply->version = monmap->get_epoch();
-    reply->oldest_version = monmon()->get_first_committed();
+    svc = monmon();
   } else {
     derr << "invalid map type " << m->what << dendl;
   }
 
-  messenger->send_message(reply, m->get_source_inst());
+  if (svc) {
+    if (!svc->is_readable()) {
+      svc->wait_for_readable(new C_RetryMessage(this, m));
+      goto out;
+    }
+    MMonGetVersionReply *reply = new MMonGetVersionReply();
+    reply->handle = m->handle;
+    reply->version = svc->get_last_committed();
+    reply->oldest_version = svc->get_first_committed();
+    messenger->send_message(reply, m->get_source_inst());
+  }
 
-  s->put();
   m->put();
+
+ out:
+  s->put();
 }
 
 bool Monitor::ms_handle_reset(Connection *con)
diff --git a/src/mon/Monitor.h b/src/mon/Monitor.h
index 8e04634..59292ec 100644
--- a/src/mon/Monitor.h
+++ b/src/mon/Monitor.h
@@ -195,6 +195,9 @@ private:
   Paxos *paxos;
   Elector elector;
   friend class Elector;
+
+  /// features we require of peers (based on on-disk compatset)
+  uint64_t required_features;
   
   int leader;            // current leader (to best of knowledge)
   set<int> quorum;       // current active set of monitors (if !starting)
@@ -525,8 +528,11 @@ public:
   uint64_t get_quorum_features() const {
     return quorum_features;
   }
+  uint64_t get_required_features() const {
+    return required_features;  // the compatset-derived requirement, distinct from quorum_features
+  }
   void apply_quorum_to_compatset_features();
-  uint64_t apply_compatset_features_to_quorum_requirements();
+  void apply_compatset_features_to_quorum_requirements();
 
 private:
   void _reset();   ///< called from bootstrap, start_, or join_election
diff --git a/src/mon/MonitorStore.cc b/src/mon/MonitorStore.cc
index fc44e25..db21a94 100644
--- a/src/mon/MonitorStore.cc
+++ b/src/mon/MonitorStore.cc
@@ -246,7 +246,7 @@ bool MonitorStore::exists_bl_ss(const char *a, const char *b)
   struct stat st;
   int r = ::stat(fn, &st);
   //char buf[80];
-  //dout(15) << "exists_bl stat " << fn << " r=" << r << " errno " << errno << " " << strerror_r(errno, buf, sizeof(buf)) << dendl;
+  //dout(15) << "exists_bl stat " << fn << " r=" << r << " " << cpp_strerror(errno) << dendl;
   if (r) {
     assert (errno == ENOENT);
   }
@@ -282,11 +282,10 @@ int MonitorStore::get_bl_ss(bufferlist& bl, const char *a, const char *b)
   
   int fd = ::open(fn, O_RDONLY);
   if (fd < 0) {
-    char buf[80];
     if (b) {
-      dout(15) << "get_bl " << a << "/" << b << " " << strerror_r(errno, buf, sizeof(buf)) << dendl;
+      dout(15) << "get_bl " << a << "/" << b << " " << cpp_strerror(errno) << dendl;
     } else {
-      dout(15) << "get_bl " << a << " " << strerror_r(errno, buf, sizeof(buf)) << dendl;
+      dout(15) << "get_bl " << a << " " << cpp_strerror(errno) << dendl;
     }
     return -errno;
   }
@@ -304,10 +303,8 @@ int MonitorStore::get_bl_ss(bufferlist& bl, const char *a, const char *b)
   while (off < len) {
     dout(20) << "reading at off " << off << " of " << len << dendl;
     int r = ::read(fd, bp.c_str()+off, len-off);
-    if (r < 0) {
-      char buf[80];
-      dout(0) << "errno on read " << strerror_r(errno, buf, sizeof(buf)) << dendl;
-    }
+    if (r < 0)
+      dout(0) << "errno on read " << cpp_strerror(errno) << dendl;
     assert(r>0);
     off += r;
   }
diff --git a/src/mon/MonmapMonitor.cc b/src/mon/MonmapMonitor.cc
index 6b4abd5..5940724 100644
--- a/src/mon/MonmapMonitor.cc
+++ b/src/mon/MonmapMonitor.cc
@@ -21,6 +21,7 @@
 
 #include "common/Timer.h"
 #include "common/ceph_argparse.h"
+#include "common/errno.h"
 #include "mon/MDSMonitor.h"
 #include "mon/OSDMonitor.h"
 #include "mon/PGMonitor.h"
diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc
index 4de2b42..fa3d9cf 100644
--- a/src/mon/OSDMonitor.cc
+++ b/src/mon/OSDMonitor.cc
@@ -1207,6 +1207,12 @@ bool OSDMonitor::preprocess_boot(MOSDBoot *m)
     goto ignore;
   }
 
+  if (osdmap.exists(from) &&
+      osdmap.get_info(from).up_from > m->version) {
+    dout(7) << "prepare_boot msg from before last up_from, ignoring" << dendl;
+    goto ignore;
+  }
+
   // noup?
   if (!can_mark_up(from)) {
     dout(7) << "preprocess_boot ignoring boot from " << m->get_orig_source_inst() << dendl;
@@ -2056,6 +2062,29 @@ void OSDMonitor::get_health(list<pair<health_status_t,string> >& summary,
       }
     }
 
+    // Warn if 'mon_osd_down_out_interval' is set to zero.
+    // Having this option set to zero on the leader acts much like the
+    // 'noout' flag.  It's hard to figure out what's going wrong with clusters
+    // without the 'noout' flag set but acting like that just the same, so
+    // we report a HEALTH_WARN in case this option is set to zero.
+    // This is an ugly hack to get the warning out, but until we find a way
+    // to spread global options throughout the mon cluster and have all mons
+    // using a base set of the same options, we need to work around this sort
+    // of things.
+    // There's also the obvious drawback that if this is set on a single
+    // monitor on a 3-monitor cluster, this warning will only be shown every
+    // third monitor connection.
+    if (g_conf->mon_warn_on_osd_down_out_interval_zero &&
+        g_conf->mon_osd_down_out_interval == 0) {
+      ostringstream ss;
+      ss << "mon." << mon->name << " has mon_osd_down_out_interval set to 0";
+      summary.push_back(make_pair(HEALTH_WARN, ss.str()));
+      if (detail) {
+        ss << "; this has the same effect as the 'noout' flag";
+        detail->push_back(make_pair(HEALTH_WARN, ss.str()));
+      }
+    }
+
     get_pools_health(summary, detail);
   }
 }
@@ -3318,6 +3347,13 @@ int OSDMonitor::prepare_command_pool_set(map<string,cmd_vartype> &cmdmap,
 	return -EEXIST;
       return 0;
     }
+    string force;
+    cmd_getval(g_ceph_context,cmdmap, "force", force);
+    if (p.cache_mode != pg_pool_t::CACHEMODE_NONE &&
+	force != "--yes-i-really-mean-it") {
+      ss << "splits in cache pools must be followed by scrubs and leave sufficient free space to avoid overfilling.  use --yes-i-really-mean-it to force.";
+      return -EPERM;
+    }
     int expected_osds = MIN(p.get_pg_num(), osdmap.get_num_osds());
     int64_t new_pgs = n - p.get_pg_num();
     int64_t pgs_per_osd = new_pgs / expected_osds;
@@ -3333,7 +3369,7 @@ int OSDMonitor::prepare_command_pool_set(map<string,cmd_vartype> &cmdmap,
 	++i) {
       if (i->m_pool == static_cast<uint64_t>(pool)) {
 	ss << "currently creating pgs, wait";
-	return -EAGAIN;
+	return -EBUSY;
       }
     }
     p.set_pg_num(n);
@@ -3355,7 +3391,7 @@ int OSDMonitor::prepare_command_pool_set(map<string,cmd_vartype> &cmdmap,
 	++i) {
       if (i->m_pool == static_cast<uint64_t>(pool)) {
 	ss << "currently creating pgs, wait";
-	return -EAGAIN;
+	return -EBUSY;
       }
     }
     p.set_pgp_num(n);
@@ -3614,8 +3650,7 @@ bool OSDMonitor::prepare_command_impl(MMonCommand *m,
 				       CRUSH_HASH_DEFAULT, type, 0, NULL,
 				       NULL, &bucketno);
     if (err < 0) {
-      char buf[128];
-      ss << "add_bucket error: '" << strerror_r(-err, buf, sizeof(buf)) << "'";
+      ss << "add_bucket error: '" << cpp_strerror(err) << "'";
       goto reply;
     }
     err = newcrush.set_item_name(bucketno, name);
diff --git a/src/mon/Paxos.h b/src/mon/Paxos.h
index 764c4fe..b9e43a1 100644
--- a/src/mon/Paxos.h
+++ b/src/mon/Paxos.h
@@ -658,7 +658,7 @@ private:
    * onto the original Paxos' Prepare phase. Basically, we'll generate a
    * Proposal Number, taking @p oldpn into consideration, and we will send
    * it to a quorum, along with our first and last committed versions. By
-   * sending these informations in a message to the quorum, we expect to
+   * sending this information in a message to the quorum, we expect to
    * obtain acceptances from a majority, allowing us to commit, or be
    * informed of a higher Proposal Number known by one or more of the Peons
    * in the quorum.
@@ -676,7 +676,7 @@ private:
    * accordingly.
    *
    * Once a Peon receives a collect message from the Leader it will reply
-   * with its first and last committed versions, as well as informations so
+   * with its first and last committed versions, as well as information so
    * the Leader may know if his Proposal Number was, or was not, accepted by
    * the Peon. The Peon will accept the Leader's Proposal Number iif it is
    * higher than the Peon's currently accepted Proposal Number. The Peon may
diff --git a/src/msg/Accepter.cc b/src/msg/Accepter.cc
index d747ccb..718d478 100644
--- a/src/msg/Accepter.cc
+++ b/src/msg/Accepter.cc
@@ -58,9 +58,8 @@ int Accepter::bind(const entity_addr_t &bind_addr, const set<int>& avoid_ports)
   /* socket creation */
   listen_sd = ::socket(family, SOCK_STREAM, 0);
   if (listen_sd < 0) {
-    char buf[80];
     lderr(msgr->cct) << "accepter.bind unable to create socket: "
-		     << strerror_r(errno, buf, sizeof(buf)) << dendl;
+		     << cpp_strerror(errno) << dendl;
     return -errno;
   }
 
@@ -84,9 +83,8 @@ int Accepter::bind(const entity_addr_t &bind_addr, const set<int>& avoid_ports)
 
     rc = ::bind(listen_sd, (struct sockaddr *) &listen_addr.ss_addr(), listen_addr.addr_size());
     if (rc < 0) {
-      char buf[80];
       lderr(msgr->cct) << "accepter.bind unable to bind to " << listen_addr.ss_addr()
-		       << ": " << strerror_r(errno, buf, sizeof(buf)) << dendl;
+		       << ": " << cpp_strerror(errno) << dendl;
       return -errno;
     }
   } else {
@@ -100,11 +98,10 @@ int Accepter::bind(const entity_addr_t &bind_addr, const set<int>& avoid_ports)
 	break;
     }
     if (rc < 0) {
-      char buf[80];
       lderr(msgr->cct) << "accepter.bind unable to bind to " << listen_addr.ss_addr()
 		       << " on any port in range " << msgr->cct->_conf->ms_bind_port_min
 		       << "-" << msgr->cct->_conf->ms_bind_port_max
-		       << ": " << strerror_r(errno, buf, sizeof(buf))
+		       << ": " << cpp_strerror(errno)
 		       << dendl;
       return -errno;
     }
@@ -191,8 +188,6 @@ void *Accepter::entry()
   
   int errors = 0;
 
-  char buf[80];
-
   struct pollfd pfd;
   pfd.fd = listen_sd;
   pfd.events = POLLIN | POLLERR | POLLNVAL | POLLHUP;
@@ -220,7 +215,7 @@ void *Accepter::entry()
       msgr->add_accept_pipe(sd);
     } else {
       ldout(msgr->cct,0) << "accepter no incoming connection?  sd = " << sd
-	      << " errno " << errno << " " << strerror_r(errno, buf, sizeof(buf)) << dendl;
+	      << " errno " << errno << " " << cpp_strerror(errno) << dendl;
       if (++errors > 4)
 	break;
     }
diff --git a/src/msg/Pipe.cc b/src/msg/Pipe.cc
index 3d86789..0693c09 100644
--- a/src/msg/Pipe.cc
+++ b/src/msg/Pipe.cc
@@ -81,7 +81,6 @@ Pipe::Pipe(SimpleMessenger *r, int st, Connection *con)
     peer_type(-1),
     pipe_lock("SimpleMessenger::Pipe::pipe_lock"),
     state(st),
-    session_security(NULL),
     connection_state(NULL),
     reader_running(false), reader_needs_join(false),
     writer_running(false),
@@ -113,7 +112,6 @@ Pipe::~Pipe()
 {
   assert(out_q.empty());
   assert(sent.empty());
-  delete session_security;
   delete delay_thread;
 }
 
@@ -280,8 +278,7 @@ int Pipe::accept()
   len = sizeof(socket_addr.ss_addr());
   r = ::getpeername(sd, (sockaddr*)&socket_addr.ss_addr(), &len);
   if (r < 0) {
-    char buf[80];
-    ldout(msgr->cct,0) << "accept failed to getpeername " << errno << " " << strerror_r(errno, buf, sizeof(buf)) << dendl;
+    ldout(msgr->cct,0) << "accept failed to getpeername " << cpp_strerror(errno) << dendl;
     goto fail_unlocked;
   }
   ::encode(socket_addr, addrs);
@@ -419,8 +416,7 @@ int Pipe::accept()
       if (state != STATE_ACCEPTING)
 	goto shutting_down_msgr_unlocked;
       reply.tag = CEPH_MSGR_TAG_BADAUTHORIZER;
-      delete session_security;
-      session_security = NULL;
+      session_security.reset();
       goto reply;
     } 
 
@@ -658,9 +654,11 @@ int Pipe::accept()
   connection_state->set_features((uint64_t)reply.features & (uint64_t)connect.features);
   ldout(msgr->cct,10) << "accept features " << connection_state->get_features() << dendl;
 
-  delete session_security;
-  session_security = get_auth_session_handler(msgr->cct, connect.authorizer_protocol, session_key,
-					      connection_state->get_features());
+  session_security.reset(
+      get_auth_session_handler(msgr->cct,
+			       connect.authorizer_protocol,
+			       session_key,
+			       connection_state->get_features()));
 
   // notify
   msgr->dispatch_queue.queue_accept(connection_state.get());
@@ -818,12 +816,10 @@ int Pipe::connect()
   if (sd >= 0)
     ::close(sd);
 
-  char buf[80];
-
   // create socket?
   sd = ::socket(peer_addr.get_family(), SOCK_STREAM, 0);
   if (sd < 0) {
-    lderr(msgr->cct) << "connect couldn't created socket " << strerror_r(errno, buf, sizeof(buf)) << dendl;
+    lderr(msgr->cct) << "connect couldn't created socket " << cpp_strerror(errno) << dendl;
     goto fail;
   }
 
@@ -832,7 +828,7 @@ int Pipe::connect()
   rc = ::connect(sd, (sockaddr*)&peer_addr.addr, peer_addr.addr_size());
   if (rc < 0) {
     ldout(msgr->cct,2) << "connect error " << peer_addr
-	     << ", " << errno << ": " << strerror_r(errno, buf, sizeof(buf)) << dendl;
+	     << ", " << cpp_strerror(errno) << dendl;
     goto fail;
   }
 
@@ -841,7 +837,7 @@ int Pipe::connect()
   // verify banner
   // FIXME: this should be non-blocking, or in some other way verify the banner as we get it.
   if (tcp_read((char*)&banner, strlen(CEPH_BANNER)) < 0) {
-    ldout(msgr->cct,2) << "connect couldn't read banner, " << strerror_r(errno, buf, sizeof(buf)) << dendl;
+    ldout(msgr->cct,2) << "connect couldn't read banner, " << cpp_strerror(errno) << dendl;
     goto fail;
   }
   if (memcmp(banner, CEPH_BANNER, strlen(CEPH_BANNER))) {
@@ -856,7 +852,7 @@ int Pipe::connect()
   msg.msg_iovlen = 1;
   msglen = msgvec[0].iov_len;
   if (do_sendmsg(&msg, msglen)) {
-    ldout(msgr->cct,2) << "connect couldn't write my banner, " << strerror_r(errno, buf, sizeof(buf)) << dendl;
+    ldout(msgr->cct,2) << "connect couldn't write my banner, " << cpp_strerror(errno) << dendl;
     goto fail;
   }
 
@@ -866,7 +862,7 @@ int Pipe::connect()
     addrbl.push_back(p);
   }
   if (tcp_read(addrbl.c_str(), addrbl.length()) < 0) {
-    ldout(msgr->cct,2) << "connect couldn't read peer addrs, " << strerror_r(errno, buf, sizeof(buf)) << dendl;
+    ldout(msgr->cct,2) << "connect couldn't read peer addrs, " << cpp_strerror(errno) << dendl;
     goto fail;
   }
   {
@@ -903,7 +899,7 @@ int Pipe::connect()
   msg.msg_iovlen = 1;
   msglen = msgvec[0].iov_len;
   if (do_sendmsg(&msg, msglen)) {
-    ldout(msgr->cct,2) << "connect couldn't write my addr, " << strerror_r(errno, buf, sizeof(buf)) << dendl;
+    ldout(msgr->cct,2) << "connect couldn't write my addr, " << cpp_strerror(errno) << dendl;
     goto fail;
   }
   ldout(msgr->cct,10) << "connect sent my addr " << msgr->my_inst.addr << dendl;
@@ -944,14 +940,14 @@ int Pipe::connect()
     ldout(msgr->cct,10) << "connect sending gseq=" << gseq << " cseq=" << cseq
 	     << " proto=" << connect.protocol_version << dendl;
     if (do_sendmsg(&msg, msglen)) {
-      ldout(msgr->cct,2) << "connect couldn't write gseq, cseq, " << strerror_r(errno, buf, sizeof(buf)) << dendl;
+      ldout(msgr->cct,2) << "connect couldn't write gseq, cseq, " << cpp_strerror(errno) << dendl;
       goto fail;
     }
 
     ldout(msgr->cct,20) << "connect wrote (self +) cseq, waiting for reply" << dendl;
     ceph_msg_connect_reply reply;
     if (tcp_read((char*)&reply, sizeof(reply)) < 0) {
-      ldout(msgr->cct,2) << "connect read reply " << strerror_r(errno, buf, sizeof(buf)) << dendl;
+      ldout(msgr->cct,2) << "connect read reply " << cpp_strerror(errno) << dendl;
       goto fail;
     }
 
@@ -1099,13 +1095,15 @@ int Pipe::connect()
       // If we have an authorizer, get a new AuthSessionHandler to deal with ongoing security of the
       // connection.  PLR
 
-      delete session_security;
       if (authorizer != NULL) {
-        session_security = get_auth_session_handler(msgr->cct, authorizer->protocol, authorizer->session_key,
-						    connection_state->get_features());
+	session_security.reset(
+            get_auth_session_handler(msgr->cct,
+				     authorizer->protocol,
+				     authorizer->session_key,
+				     connection_state->get_features()));
       }  else {
         // We have no authorizer, so we shouldn't be applying security to messages in this pipe.  PLR
-	session_security = NULL;
+	session_security.reset();
       }
 
       msgr->dispatch_queue.queue_connect(connection_state.get());
@@ -1249,8 +1247,7 @@ void Pipe::fault(bool onread)
     return;
   }
   
-  char buf[80];
-  ldout(msgr->cct,2) << "fault " << errno << ": " << strerror_r(errno, buf, sizeof(buf)) << dendl;
+  ldout(msgr->cct,2) << "fault " << cpp_strerror(errno) << dendl;
 
   if (state == STATE_CLOSED ||
       state == STATE_CLOSING) {
@@ -1404,14 +1401,16 @@ void Pipe::reader()
       continue;
     }
 
+    // get a reference to the AuthSessionHandler while we have the pipe_lock
+    ceph::shared_ptr<AuthSessionHandler> auth_handler = session_security;
+
     pipe_lock.Unlock();
 
-    char buf[80];
     char tag = -1;
     ldout(msgr->cct,20) << "reader reading tag..." << dendl;
     if (tcp_read((char*)&tag, 1) < 0) {
       pipe_lock.Lock();
-      ldout(msgr->cct,2) << "reader couldn't read tag, " << strerror_r(errno, buf, sizeof(buf)) << dendl;
+      ldout(msgr->cct,2) << "reader couldn't read tag, " << cpp_strerror(errno) << dendl;
       fault(true);
       continue;
     }
@@ -1460,7 +1459,7 @@ void Pipe::reader()
       int rc = tcp_read((char*)&seq, sizeof(seq));
       pipe_lock.Lock();
       if (rc < 0) {
-	ldout(msgr->cct,2) << "reader couldn't read ack seq, " << strerror_r(errno, buf, sizeof(buf)) << dendl;
+	ldout(msgr->cct,2) << "reader couldn't read ack seq, " << cpp_strerror(errno) << dendl;
 	fault(true);
       } else if (state != STATE_CLOSED) {
         handle_ack(seq);
@@ -1471,7 +1470,7 @@ void Pipe::reader()
     else if (tag == CEPH_MSGR_TAG_MSG) {
       ldout(msgr->cct,20) << "reader got MSG" << dendl;
       Message *m = 0;
-      int r = read_message(&m);
+      int r = read_message(&m, auth_handler.get());
 
       pipe_lock.Lock();
       
@@ -1561,8 +1560,6 @@ void Pipe::reader()
  */
 void Pipe::writer()
 {
-  char buf[80];
-
   pipe_lock.Lock();
   while (state != STATE_CLOSED) {// && state != STATE_WAIT) {
     ldout(msgr->cct,10) << "writer: state = " << get_state_name()
@@ -1614,7 +1611,7 @@ void Pipe::writer()
 	pipe_lock.Lock();
 	if (rc < 0) {
 	  ldout(msgr->cct,2) << "writer couldn't write keepalive[2], "
-			     << strerror_r(errno, buf, sizeof(buf)) << dendl;
+			     << cpp_strerror(errno) << dendl;
 	  fault();
  	  continue;
 	}
@@ -1626,7 +1623,7 @@ void Pipe::writer()
 	int rc = write_keepalive2(CEPH_MSGR_TAG_KEEPALIVE2_ACK, t);
 	pipe_lock.Lock();
 	if (rc < 0) {
-	  ldout(msgr->cct,2) << "writer couldn't write keepalive_ack, " << strerror_r(errno, buf, sizeof(buf)) << dendl;
+	  ldout(msgr->cct,2) << "writer couldn't write keepalive_ack, " << cpp_strerror(errno) << dendl;
 	  fault();
 	  continue;
 	}
@@ -1640,7 +1637,7 @@ void Pipe::writer()
 	int rc = write_ack(send_seq);
 	pipe_lock.Lock();
 	if (rc < 0) {
-	  ldout(msgr->cct,2) << "writer couldn't write ack, " << strerror_r(errno, buf, sizeof(buf)) << dendl;
+	  ldout(msgr->cct,2) << "writer couldn't write ack, " << cpp_strerror(errno) << dendl;
 	  fault();
  	  continue;
 	}
@@ -1680,7 +1677,7 @@ void Pipe::writer()
 	// security set up.  Some session security options do not
 	// actually calculate and check the signature, but they should
 	// handle the calls to sign_message and check_signature.  PLR
-	if (session_security == NULL) {
+	if (session_security.get() == NULL) {
 	  ldout(msgr->cct, 20) << "writer no session security" << dendl;
 	} else {
 	  if (session_security->sign_message(m)) {
@@ -1704,7 +1701,7 @@ void Pipe::writer()
 	pipe_lock.Lock();
 	if (rc < 0) {
           ldout(msgr->cct,1) << "writer error sending " << m << ", "
-		  << errno << ": " << strerror_r(errno, buf, sizeof(buf)) << dendl;
+		  << cpp_strerror(errno) << dendl;
 	  fault();
         }
 	m->put();
@@ -1766,7 +1763,7 @@ static void alloc_aligned_buffer(bufferlist& data, unsigned len, unsigned off)
   }
 }
 
-int Pipe::read_message(Message **pm)
+int Pipe::read_message(Message **pm, AuthSessionHandler* auth_handler)
 {
   int ret = -1;
   // envelope
@@ -1954,10 +1951,10 @@ int Pipe::read_message(Message **pm)
   //  Check the signature if one should be present.  A zero return indicates success. PLR
   //
 
-  if (session_security == NULL) {
+  if (auth_handler == NULL) {
     ldout(msgr->cct, 10) << "No session security set" << dendl;
   } else {
-    if (session_security->check_message_signature(message)) {
+    if (auth_handler->check_message_signature(message)) {
       ldout(msgr->cct, 0) << "Signature check failed" << dendl;
       ret = -EINVAL;
       goto out_dethrottle;
@@ -2001,8 +1998,6 @@ int Pipe::read_message(Message **pm)
 
 int Pipe::do_sendmsg(struct msghdr *msg, int len, bool more)
 {
-  char buf[80];
-
   while (len > 0) {
     if (0) { // sanity
       int l = 0;
@@ -2015,7 +2010,7 @@ int Pipe::do_sendmsg(struct msghdr *msg, int len, bool more)
     if (r == 0) 
       ldout(msgr->cct,10) << "do_sendmsg hmm do_sendmsg got r==0!" << dendl;
     if (r < 0) { 
-      ldout(msgr->cct,1) << "do_sendmsg error " << strerror_r(errno, buf, sizeof(buf)) << dendl;
+      ldout(msgr->cct,1) << "do_sendmsg error " << cpp_strerror(errno) << dendl;
       return -1;
     }
     if (state == STATE_CLOSED) {
@@ -2299,7 +2294,7 @@ again:
       goto again;
     } else {
       ldout(msgr->cct, 10) << "tcp_read_nonblocking socket " << sd << " returned "
-		     << got << " errno " << errno << " " << cpp_strerror(errno) << dendl;
+		     << got << " " << cpp_strerror(errno) << dendl;
       return -1;
     }
   } else if (got == 0) {
@@ -2341,8 +2336,8 @@ int Pipe::tcp_write(const char *buf, int len)
   while (len > 0) {
     int did = ::send( sd, buf, len, MSG_NOSIGNAL );
     if (did < 0) {
-      //lgeneric_dout(cct, 1) << "tcp_write error did = " << did << "  errno " << errno << " " << strerror(errno) << dendl;
-      //lgeneric_derr(cct, 1) << "tcp_write error did = " << did << "  errno " << errno << " " << strerror(errno) << dendl;
+      //lgeneric_dout(cct, 1) << "tcp_write error did = " << did << " " << cpp_strerror(errno) << dendl;
+      //lgeneric_derr(cct, 1) << "tcp_write error did = " << did << " " << cpp_strerror(errno) << dendl;
       return did;
     }
     len -= did;
diff --git a/src/msg/Pipe.h b/src/msg/Pipe.h
index 29d7958..468a6a5 100644
--- a/src/msg/Pipe.h
+++ b/src/msg/Pipe.h
@@ -15,6 +15,8 @@
 #ifndef CEPH_MSGR_PIPE_H
 #define CEPH_MSGR_PIPE_H
 
+#include "include/memory.h"
+
 #include "msg_types.h"
 #include "Messenger.h"
 #include "auth/AuthSessionHandler.h"
@@ -146,7 +148,7 @@ class DispatchQueue;
 
     // session_security handles any signatures or encryptions required for this pipe's msgs. PLR
 
-    AuthSessionHandler *session_security;
+    ceph::shared_ptr<AuthSessionHandler> session_security;
 
   protected:
     friend class SimpleMessenger;
@@ -181,7 +183,8 @@ class DispatchQueue;
 
     int randomize_out_seq();
 
-    int read_message(Message **pm);
+    int read_message(Message **pm,
+		     AuthSessionHandler *session_security_copy);
     int write_message(ceph_msg_header& h, ceph_msg_footer& f, bufferlist& body);
     /**
      * Write the given data (of length len) to the Pipe's socket. This function
diff --git a/src/os/JournalingObjectStore.cc b/src/os/JournalingObjectStore.cc
index 402fa3c..7616fe2 100644
--- a/src/os/JournalingObjectStore.cc
+++ b/src/os/JournalingObjectStore.cc
@@ -2,6 +2,7 @@
 
 #include "JournalingObjectStore.h"
 
+#include "common/errno.h"
 #include "common/debug.h"
 
 #define dout_subsys ceph_subsys_journal
@@ -25,6 +26,7 @@ void JournalingObjectStore::journal_stop()
     delete journal;
     journal = 0;
   }
+  apply_manager.reset();
 }
 
 int JournalingObjectStore::journal_replay(uint64_t fs_op_seq)
@@ -46,9 +48,8 @@ int JournalingObjectStore::journal_replay(uint64_t fs_op_seq)
 
   int err = journal->open(op_seq);
   if (err < 0) {
-    char buf[80];
     dout(3) << "journal_replay open failed with " 
-	    << strerror_r(-err, buf, sizeof(buf)) << dendl;
+	    << cpp_strerror(err) << dendl;
     delete journal;
     journal = 0;
     return err;
diff --git a/src/os/JournalingObjectStore.h b/src/os/JournalingObjectStore.h
index 946ab7c..fb7f0ec 100644
--- a/src/os/JournalingObjectStore.h
+++ b/src/os/JournalingObjectStore.h
@@ -68,6 +68,13 @@ protected:
       max_applied_seq(0),
       com_lock("JOS::ApplyManager::com_lock", false, true, false, g_ceph_context),
       committing_seq(0), committed_seq(0) {}
+    void reset() {
+      assert(open_ops == 0);
+      assert(blocked == false);
+      max_applied_seq = 0;
+      committing_seq = 0;
+      committed_seq = 0;
+    }
     void add_waiter(uint64_t, Context*);
     uint64_t op_apply_start(uint64_t op);
     void op_apply_finish(uint64_t op);
diff --git a/src/osd/ECBackend.cc b/src/osd/ECBackend.cc
index dc2a26f..66b7dd5 100644
--- a/src/osd/ECBackend.cc
+++ b/src/osd/ECBackend.cc
@@ -818,6 +818,7 @@ void ECBackend::handle_sub_write(
   clear_temp_objs(op.temp_removed);
   get_parent()->log_operation(
     op.log_entries,
+    op.updated_hit_set_history,
     op.trim_to,
     !(op.t.empty()),
     localt);
@@ -1201,6 +1202,7 @@ void ECBackend::submit_transaction(
   PGTransaction *_t,
   const eversion_t &trim_to,
   vector<pg_log_entry_t> &log_entries,
+  boost::optional<pg_hit_set_history_t> &hset_history,
   Context *on_local_applied_sync,
   Context *on_all_applied,
   Context *on_all_commit,
@@ -1215,6 +1217,7 @@ void ECBackend::submit_transaction(
   op->version = at_version;
   op->trim_to = trim_to;
   op->log_entries.swap(log_entries);
+  std::swap(op->updated_hit_set_history, hset_history);
   op->on_local_applied_sync = on_local_applied_sync;
   op->on_all_applied = on_all_applied;
   op->on_all_commit = on_all_commit;
@@ -1520,6 +1523,7 @@ void ECBackend::start_write(Op *op) {
       op->version,
       op->trim_to,
       op->log_entries,
+      op->updated_hit_set_history,
       op->temp_added,
       op->temp_cleared);
     if (*i == get_parent()->whoami_shard()) {
diff --git a/src/osd/ECBackend.h b/src/osd/ECBackend.h
index 0aa37c1..2061ea8 100644
--- a/src/osd/ECBackend.h
+++ b/src/osd/ECBackend.h
@@ -98,6 +98,7 @@ public:
     PGTransaction *t,
     const eversion_t &trim_to,
     vector<pg_log_entry_t> &log_entries,
+    boost::optional<pg_hit_set_history_t> &hset_history,
     Context *on_local_applied_sync,
     Context *on_all_applied,
     Context *on_all_commit,
@@ -326,6 +327,7 @@ public:
     eversion_t version;
     eversion_t trim_to;
     vector<pg_log_entry_t> log_entries;
+    boost::optional<pg_hit_set_history_t> updated_hit_set_history;
     Context *on_local_applied_sync;
     Context *on_all_applied;
     Context *on_all_commit;
diff --git a/src/osd/ECMsgTypes.cc b/src/osd/ECMsgTypes.cc
index 87e622b..4e4c8e3 100644
--- a/src/osd/ECMsgTypes.cc
+++ b/src/osd/ECMsgTypes.cc
@@ -16,7 +16,7 @@
 
 void ECSubWrite::encode(bufferlist &bl) const
 {
-  ENCODE_START(1, 1, bl);
+  ENCODE_START(2, 1, bl);
   ::encode(from, bl);
   ::encode(tid, bl);
   ::encode(reqid, bl);
@@ -28,12 +28,13 @@ void ECSubWrite::encode(bufferlist &bl) const
   ::encode(log_entries, bl);
   ::encode(temp_added, bl);
   ::encode(temp_removed, bl);
+  ::encode(updated_hit_set_history, bl);
   ENCODE_FINISH(bl);
 }
 
 void ECSubWrite::decode(bufferlist::iterator &bl)
 {
-  DECODE_START(1, bl);
+  DECODE_START(2, bl);
   ::decode(from, bl);
   ::decode(tid, bl);
   ::decode(reqid, bl);
@@ -45,17 +46,22 @@ void ECSubWrite::decode(bufferlist::iterator &bl)
   ::decode(log_entries, bl);
   ::decode(temp_added, bl);
   ::decode(temp_removed, bl);
+  if (struct_v >= 2) {
+    ::decode(updated_hit_set_history, bl);
+  }
   DECODE_FINISH(bl);
 }
 
 std::ostream &operator<<(
   std::ostream &lhs, const ECSubWrite &rhs)
 {
-  return lhs
-    << "ECSubWrite(tid=" << rhs.tid
-    << ", reqid=" << rhs.reqid
-    << ", at_version=" << rhs.at_version
-    << ", trim_to=" << rhs.trim_to << ")";
+  lhs << "ECSubWrite(tid=" << rhs.tid
+      << ", reqid=" << rhs.reqid
+      << ", at_version=" << rhs.at_version
+      << ", trim_to=" << rhs.trim_to;
+  if (rhs.updated_hit_set_history)
+    lhs << ", has_updated_hit_set_history";
+  return lhs <<  ")";
 }
 
 void ECSubWrite::dump(Formatter *f) const
@@ -64,6 +70,8 @@ void ECSubWrite::dump(Formatter *f) const
   f->dump_stream("reqid") << reqid;
   f->dump_stream("at_version") << at_version;
   f->dump_stream("trim_to") << trim_to;
+  f->dump_stream("has_updated_hit_set_history")
+    << static_cast<bool>(updated_hit_set_history);
 }
 
 void ECSubWrite::generate_test_instances(list<ECSubWrite*> &o)
diff --git a/src/osd/ECMsgTypes.h b/src/osd/ECMsgTypes.h
index d1ad2cb..11c519d 100644
--- a/src/osd/ECMsgTypes.h
+++ b/src/osd/ECMsgTypes.h
@@ -26,11 +26,12 @@ struct ECSubWrite {
   hobject_t soid;
   pg_stat_t stats;
   ObjectStore::Transaction t;
- eversion_t at_version;
+  eversion_t at_version;
   eversion_t trim_to;
   vector<pg_log_entry_t> log_entries;
   set<hobject_t> temp_added;
   set<hobject_t> temp_removed;
+  boost::optional<pg_hit_set_history_t> updated_hit_set_history;
   ECSubWrite() {}
   ECSubWrite(
     pg_shard_t from,
@@ -42,6 +43,7 @@ struct ECSubWrite {
     eversion_t at_version,
     eversion_t trim_to,
     vector<pg_log_entry_t> log_entries,
+    boost::optional<pg_hit_set_history_t> updated_hit_set_history,
     const set<hobject_t> &temp_added,
     const set<hobject_t> &temp_removed)
     : from(from), tid(tid), reqid(reqid),
@@ -49,7 +51,8 @@ struct ECSubWrite {
       at_version(at_version),
       trim_to(trim_to), log_entries(log_entries),
       temp_added(temp_added),
-      temp_removed(temp_removed) {}
+      temp_removed(temp_removed),
+      updated_hit_set_history(updated_hit_set_history) {}
   void encode(bufferlist &bl) const;
   void decode(bufferlist::iterator &bl);
   void dump(Formatter *f) const;
diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc
index baa9848..2d63790 100644
--- a/src/osd/OSD.cc
+++ b/src/osd/OSD.cc
@@ -2156,6 +2156,7 @@ struct pistate {
   vector<int> old_acting, old_up;
   epoch_t same_interval_since;
   int primary;
+  int up_primary;
 };
 
 void OSD::build_past_intervals_parallel()
@@ -2207,9 +2208,10 @@ void OSD::build_past_intervals_parallel()
 	continue;
 
       vector<int> acting, up;
+      int up_primary;
       int primary;
       cur_map->pg_to_up_acting_osds(
-	pg->info.pgid.pgid, &up, 0, &acting, &primary);
+	pg->info.pgid.pgid, &up, &up_primary, &acting, &primary);
 
       if (p.same_interval_since == 0) {
 	dout(10) << __func__ << " epoch " << cur_epoch << " pg " << pg->info.pgid
@@ -2219,6 +2221,7 @@ void OSD::build_past_intervals_parallel()
 	p.old_up = up;
 	p.old_acting = acting;
 	p.primary = primary;
+	p.up_primary = up_primary;
 	continue;
       }
       assert(last_map);
@@ -2228,6 +2231,8 @@ void OSD::build_past_intervals_parallel()
 	p.primary,
 	primary,
 	p.old_acting, acting,
+	p.up_primary,
+	up_primary,
 	p.old_up, up,
 	p.same_interval_since,
 	pg->info.history.last_epoch_clean,
@@ -3852,7 +3857,10 @@ void OSDService::send_message_osd_cluster(int peer, Message *m, epoch_t from_epo
     m->put();
     return;
   }
-  osd->cluster_messenger->send_message(m, next_osdmap->get_cluster_inst(peer));
+  const entity_inst_t& peer_inst = next_osdmap->get_cluster_inst(peer);
+  Connection *peer_con = osd->cluster_messenger->get_connection(peer_inst).get();
+  osd->_share_map_outgoing(peer, peer_con, next_osdmap);
+  osd->cluster_messenger->send_message(m, peer_inst);
 }
 
 ConnectionRef OSDService::get_con_osd_cluster(int peer, epoch_t from_epoch)
@@ -4276,10 +4284,23 @@ void OSD::do_command(Connection *con, ceph_tid_t tid, vector<string>& cmd, buffe
 	  _have_pg(pcand)) {
 	PG *pg = _lookup_lock_pg(pcand);
 	assert(pg);
-	// simulate pg <pgid> cmd= for pg->do-command
-	if (prefix != "pg")
-	  cmd_putval(cct, cmdmap, "cmd", prefix);
-	r = pg->do_command(cmdmap, ss, data, odata);
+	if (pg->is_primary()) {
+	  // simulate pg <pgid> cmd= for pg->do-command
+	  if (prefix != "pg")
+	    cmd_putval(cct, cmdmap, "cmd", prefix);
+	  r = pg->do_command(cmdmap, ss, data, odata);
+	} else {
+	  ss << "not primary for pgid " << pgid;
+
+	  // send them the latest diff to ensure they realize the mapping
+	  // has changed.
+	  send_incremental_map(osdmap->get_epoch() - 1, con);
+
+	  // do not reply; they will get newer maps and realize they
+	  // need to resend.
+	  pg->unlock();
+	  return;
+	}
 	pg->unlock();
       } else {
 	ss << "i don't have pgid " << pgid;
@@ -5664,9 +5685,9 @@ void OSD::check_osdmap_features(ObjectStore *fs)
 	!fs->get_allow_sharded_objects()) {
     dout(0) << __func__ << " enabling on-disk ERASURE CODES compat feature" << dendl;
     superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
-    ObjectStore::Transaction t;
-    write_superblock(t);
-    int err = store->apply_transaction(t);
+    ObjectStore::Transaction *t = new ObjectStore::Transaction;
+    write_superblock(*t);
+    int err = store->queue_transaction_and_cleanup(NULL, t);
     assert(err == 0);
     fs->set_allow_sharded_objects();
   }
@@ -6144,9 +6165,15 @@ void OSD::split_pgs(
   parent->update_snap_mapper_bits(
     parent->info.pgid.get_split_bits(pg_num)
     );
+
+  vector<object_stat_sum_t> updated_stats(childpgids.size() + 1);
+  parent->info.stats.stats.sum.split(updated_stats);
+
+  vector<object_stat_sum_t>::iterator stat_iter = updated_stats.begin();
   for (set<spg_t>::const_iterator i = childpgids.begin();
        i != childpgids.end();
-       ++i) {
+       ++i, ++stat_iter) {
+    assert(stat_iter != updated_stats.end());
     dout(10) << "Splitting " << *parent << " into " << *i << dendl;
     assert(service.splitting(*i));
     PG* child = _make_pg(nextmap, *i);
@@ -6167,10 +6194,13 @@ void OSD::split_pgs(
       i->pgid,
       child,
       split_bits);
+    child->info.stats.stats.sum = *stat_iter;
 
     child->write_if_dirty(*(rctx->transaction));
     child->unlock();
   }
+  assert(stat_iter != updated_stats.end());
+  parent->info.stats.stats.sum = *stat_iter;
   parent->write_if_dirty(*(rctx->transaction));
 }
   
@@ -6452,7 +6482,7 @@ void OSD::do_notifies(
       cluster_messenger->send_message(m, con.get());
     } else {
       dout(7) << "do_notify osd " << it->first
-	      << " sending seperate messages" << dendl;
+	      << " sending separate messages" << dendl;
       for (vector<pair<pg_notify_t, pg_interval_map_t> >::iterator i =
 	     it->second.begin();
 	   i != it->second.end();
@@ -6491,7 +6521,7 @@ void OSD::do_queries(map<int, map<spg_t,pg_query_t> >& query_map,
       cluster_messenger->send_message(m, con.get());
     } else {
       dout(7) << "do_queries querying osd." << who
-	      << " sending seperate messages "
+	      << " sending separate messages "
 	      << " on " << pit->second.size() << " PGs" << dendl;
       for (map<spg_t, pg_query_t>::iterator i = pit->second.begin();
 	   i != pit->second.end();
@@ -7273,10 +7303,7 @@ void OSDService::handle_misdirected_op(PG *pg, OpRequestRef op)
   MOSDOp *m = static_cast<MOSDOp*>(op->get_req());
   assert(m->get_header().type == CEPH_MSG_OSD_OP);
 
-  if (m->get_map_epoch() < pg->info.history.same_primary_since) {
-    dout(7) << *pg << " changed after " << m->get_map_epoch() << ", dropping" << dendl;
-    return;
-  }
+  assert(m->get_map_epoch() >= pg->info.history.same_primary_since);
 
   if (pg->is_ec_pg()) {
     /**
diff --git a/src/osd/OSD.h b/src/osd/OSD.h
index 9192fcc..6b3c89d 100644
--- a/src/osd/OSD.h
+++ b/src/osd/OSD.h
@@ -334,7 +334,7 @@ public:
   void dequeue_pg(PG *pg, list<OpRequestRef> *dequeued);
 
   // -- superblock --
-  Mutex publish_lock, pre_publish_lock;
+  Mutex publish_lock, pre_publish_lock; // pre-publish orders before publish
   OSDSuperblock superblock;
   OSDSuperblock get_superblock() {
     Mutex::Locker l(publish_lock);
diff --git a/src/osd/OSDMap.cc b/src/osd/OSDMap.cc
index a764f34..fc61d43 100644
--- a/src/osd/OSDMap.cc
+++ b/src/osd/OSDMap.cc
@@ -2426,9 +2426,8 @@ void OSDMap::print_summary(Formatter *f, ostream& out) const
     f->dump_int("num_osds", get_num_osds());
     f->dump_int("num_up_osds", get_num_up_osds());
     f->dump_int("num_in_osds", get_num_in_osds());
-    f->dump_string("full", test_flag(CEPH_OSDMAP_FULL) ? "true" : "false");
-    f->dump_string("nearfull", test_flag(CEPH_OSDMAP_NEARFULL) ?
-		   "true" : "false");
+    f->dump_bool("full", test_flag(CEPH_OSDMAP_FULL) ? true : false);
+    f->dump_bool("nearfull", test_flag(CEPH_OSDMAP_NEARFULL) ? true : false);
     f->close_section();
   } else {
     out << "     osdmap e" << get_epoch() << ": "
diff --git a/src/osd/PG.cc b/src/osd/PG.cc
index 49503ef..924554c 100644
--- a/src/osd/PG.cc
+++ b/src/osd/PG.cc
@@ -545,7 +545,9 @@ bool PG::needs_recovery() const
   const pg_missing_t &missing = pg_log.get_missing();
 
   if (missing.num_missing()) {
-    dout(10) << __func__ << " primary has " << missing.num_missing() << dendl;
+    dout(10) << __func__ << " primary has " << missing.num_missing()
+      << " missing" << dendl;
+
     ret = true;
   }
 
@@ -558,12 +560,14 @@ bool PG::needs_recovery() const
     pg_shard_t peer = *a;
     map<pg_shard_t, pg_missing_t>::const_iterator pm = peer_missing.find(peer);
     if (pm == peer_missing.end()) {
-      dout(10) << __func__ << " osd." << peer << " don't have missing set" << dendl;
+      dout(10) << __func__ << " osd." << peer << " doesn't have missing set"
+        << dendl;
       ret = true;
       continue;
     }
     if (pm->second.num_missing()) {
-      dout(10) << __func__ << " osd." << peer << " has " << pm->second.num_missing() << " missing" << dendl;
+      dout(10) << __func__ << " osd." << peer << " has "
+        << pm->second.num_missing() << " missing" << dendl;
       ret = true;
     }
   }
@@ -634,24 +638,28 @@ void PG::generate_past_intervals()
 
   OSDMapRef last_map, cur_map;
   int primary = -1;
+  int up_primary = -1;
   vector<int> acting, up, old_acting, old_up;
 
   cur_map = osd->get_map(cur_epoch);
   cur_map->pg_to_up_acting_osds(
-    get_pgid().pgid, &up, 0, &acting, &primary);
+    get_pgid().pgid, &up, &up_primary, &acting, &primary);
   epoch_t same_interval_since = cur_epoch;
   dout(10) << __func__ << " over epochs " << cur_epoch << "-"
 	   << end_epoch << dendl;
   ++cur_epoch;
   for (; cur_epoch <= end_epoch; ++cur_epoch) {
     int old_primary = primary;
+    int old_up_primary = up_primary;
     last_map.swap(cur_map);
     old_up.swap(up);
     old_acting.swap(acting);
 
     cur_map = osd->get_map(cur_epoch);
-    cur_map->pg_to_up_acting_osds(
-      get_pgid().pgid, &up, 0, &acting, &primary);
+    pg_t pgid = get_pgid().pgid;
+    if (cur_map->get_pools().count(pgid.pool()))
+      pgid = pgid.get_ancestor(cur_map->get_pg_num(pgid.pool()));
+    cur_map->pg_to_up_acting_osds(pgid, &up, &up_primary, &acting, &primary);
 
     std::stringstream debug;
     bool new_interval = pg_interval_t::check_new_interval(
@@ -659,14 +667,16 @@ void PG::generate_past_intervals()
       primary,
       old_acting,
       acting,
+      old_up_primary,
+      up_primary,
       old_up,
       up,
       same_interval_since,
       info.history.last_epoch_clean,
       cur_map,
       last_map,
-      info.pgid.pool(),
-      info.pgid.pgid,
+      pgid.pool(),
+      pgid,
       &past_intervals,
       &debug);
     if (new_interval) {
@@ -802,7 +812,7 @@ void PG::build_prior(std::auto_ptr<PriorSet> &prior_set)
   set_probe_targets(prior_set->probe);
 }
 
-void PG::clear_primary_state(bool staying_primary)
+void PG::clear_primary_state()
 {
   dout(10) << "clear_primary_state" << dendl;
 
@@ -836,8 +846,7 @@ void PG::clear_primary_state(bool staying_primary)
   osd->recovery_wq.dequeue(this);
   osd->snap_trim_wq.dequeue(this);
 
-  if (!staying_primary)
-    agent_clear();
+  agent_clear();
 
   osd->remove_want_pg_temp(info.pgid.pgid);
 }
@@ -1533,6 +1542,7 @@ void PG::activate(ObjectStore::Transaction& t,
 	pi.last_complete = info.last_update;
 	pi.last_backfill = hobject_t();
 	pi.history = info.history;
+	pi.hit_set = info.hit_set;
 	pi.stats.stats.clear();
 
 	m = new MOSDPGLog(
@@ -1794,6 +1804,7 @@ void PG::all_activated_and_committed()
 
   // info.last_epoch_started is set during activate()
   info.history.last_epoch_started = info.last_epoch_started;
+  state_clear(PG_STATE_CREATING);
 
   share_pg_info();
   publish_stats_to_osd();
@@ -2856,18 +2867,18 @@ void PG::update_snap_map(
 /**
  * filter trimming|trimmed snaps out of snapcontext
  */
-void PG::filter_snapc(SnapContext& snapc)
+void PG::filter_snapc(vector<snapid_t> &snaps)
 {
   bool filtering = false;
   vector<snapid_t> newsnaps;
-  for (vector<snapid_t>::iterator p = snapc.snaps.begin();
-       p != snapc.snaps.end();
+  for (vector<snapid_t>::iterator p = snaps.begin();
+       p != snaps.end();
        ++p) {
     if (snap_trimq.contains(*p) || info.purged_snaps.contains(*p)) {
       if (!filtering) {
 	// start building a new vector with what we've seen so far
-	dout(10) << "filter_snapc filtering " << snapc << dendl;
-	newsnaps.insert(newsnaps.begin(), snapc.snaps.begin(), p);
+	dout(10) << "filter_snapc filtering " << snaps << dendl;
+	newsnaps.insert(newsnaps.begin(), snaps.begin(), p);
 	filtering = true;
       }
       dout(20) << "filter_snapc  removing trimq|purged snap " << *p << dendl;
@@ -2877,8 +2888,8 @@ void PG::filter_snapc(SnapContext& snapc)
     }
   }
   if (filtering) {
-    snapc.snaps.swap(newsnaps);
-    dout(10) << "filter_snapc  result " << snapc << dendl;
+    snaps.swap(newsnaps);
+    dout(10) << "filter_snapc  result " << snaps << dendl;
   }
 }
 
@@ -3445,6 +3456,7 @@ void PG::repair_object(
     assert(waiting_for_unreadable_object.empty());
 
     pg_log.missing_add(soid, oi.version, eversion_t());
+    missing_loc.add_missing(soid, oi.version, eversion_t());
     missing_loc.add_location(soid, ok_peer);
 
     pg_log.set_last_requested(0);
@@ -4662,6 +4674,12 @@ void PG::start_peering_interval(
 
   reg_next_scrub();
 
+  // set CREATING bit until we have peered for the first time.
+  if (is_primary() && info.history.last_epoch_started == 0)
+    state_set(PG_STATE_CREATING);
+  else
+    state_clear(PG_STATE_CREATING);
+
   // did acting, up, primary|acker change?
   if (!lastmap) {
     dout(10) << " no lastmap" << dendl;
@@ -4673,6 +4691,8 @@ void PG::start_peering_interval(
       old_acting_primary.osd,
       new_acting_primary,
       oldacting, newacting,
+      old_up_primary.osd,
+      new_up_primary,
       oldup, newup,
       info.history.same_interval_since,
       info.history.last_epoch_clean,
@@ -4725,7 +4745,7 @@ void PG::start_peering_interval(
 
   // reset primary state?
   if (was_old_primary || is_primary())
-    clear_primary_state(was_old_primary && is_primary());
+    clear_primary_state();
 
     
   // pg->on_*
@@ -4910,16 +4930,31 @@ bool PG::can_discard_op(OpRequestRef op)
   if (OSD::op_is_discardable(m)) {
     dout(20) << " discard " << *m << dendl;
     return true;
-  } else if ((op->may_write() || op->may_cache()) &&
-	     (!is_primary() ||
-	      !same_for_modify_since(m->get_map_epoch()))) {
-    osd->handle_misdirected_op(this, op);
-    return true;
-  } else if (op->may_read() &&
-	     !same_for_read_since(m->get_map_epoch())) {
-    osd->handle_misdirected_op(this, op);
+  }
+
+  if (m->get_map_epoch() < info.history.same_primary_since) {
+    dout(7) << " changed after " << m->get_map_epoch()
+	    << ", dropping " << *m << dendl;
     return true;
-  } else if (is_replay()) {
+  }
+
+  if ((m->get_flags() & (CEPH_OSD_FLAG_BALANCE_READS |
+			 CEPH_OSD_FLAG_LOCALIZE_READS)) &&
+      op->may_read() &&
+      !(op->may_write() || op->may_cache())) {
+    // balanced reads; any replica will do
+    if (!(is_primary() || is_replica())) {
+      osd->handle_misdirected_op(this, op);
+      return true;
+    }
+  } else {
+    // normal case; must be primary
+    if (!is_primary()) {
+      osd->handle_misdirected_op(this, op);
+      return true;
+    }
+  }
+  if (is_replay()) {
     if (m->get_version().version > 0) {
       dout(7) << " queueing replay at " << m->get_version()
 	      << " for " << *m << dendl;
@@ -6565,6 +6600,7 @@ boost::statechart::result PG::RecoveryState::Stray::react(const MInfoRec& infoev
     ObjectStore::Transaction* t = context<RecoveryMachine>().get_cur_transaction();
     pg->rewind_divergent_log(*t, infoevt.info.last_update);
     pg->info.stats = infoevt.info.stats;
+    pg->info.hit_set = infoevt.info.hit_set;
   }
   
   assert(infoevt.info.last_update == pg->info.last_update);
@@ -6884,7 +6920,7 @@ boost::statechart::result PG::RecoveryState::GetLog::react(const MLogRec& logevt
 	     << "non-auth_log_shard osd." << logevt.from << dendl;
     return discard_event();
   }
-  dout(10) << "GetLog: recieved master log from osd" 
+  dout(10) << "GetLog: received master log from osd"
 	   << logevt.from << dendl;
   msg = logevt.msg;
   post_event(GotLog());
diff --git a/src/osd/PG.h b/src/osd/PG.h
index f6873ad..fa5bccd 100644
--- a/src/osd/PG.h
+++ b/src/osd/PG.h
@@ -379,6 +379,10 @@ public:
 	}
       }
     }
+
+    void add_missing(const hobject_t &hoid, eversion_t need, eversion_t have) {
+      needs_recovery_map[hoid] = pg_missing_t::item(need, have);
+    }
     void revise_need(const hobject_t &hoid, eversion_t need) {
       assert(needs_recovery(hoid));
       needs_recovery_map[hoid].need = need;
@@ -678,7 +682,7 @@ protected:
   void clear_publish_stats();
 
 public:
-  void clear_primary_state(bool stay_primary);
+  void clear_primary_state();
 
  public:
   bool is_actingbackfill(pg_shard_t osd) const {
@@ -1999,7 +2003,7 @@ public:
     vector<pg_log_entry_t> &log_entries,
     ObjectStore::Transaction& t);
 
-  void filter_snapc(SnapContext& snapc);
+  void filter_snapc(vector<snapid_t> &snaps);
 
   void log_weirdness();
 
@@ -2107,10 +2111,6 @@ public:
   virtual int do_command(cmdmap_t cmdmap, ostream& ss,
 			 bufferlist& idata, bufferlist& odata) = 0;
 
-  virtual bool same_for_read_since(epoch_t e) = 0;
-  virtual bool same_for_modify_since(epoch_t e) = 0;
-  virtual bool same_for_rep_modify_since(epoch_t e) = 0;
-
   virtual void on_role_change() = 0;
   virtual void on_pool_change() = 0;
   virtual void on_change(ObjectStore::Transaction *t) = 0;
diff --git a/src/osd/PGBackend.h b/src/osd/PGBackend.h
index bd18a47..1dbf20d 100644
--- a/src/osd/PGBackend.h
+++ b/src/osd/PGBackend.h
@@ -175,6 +175,7 @@
 
      virtual void log_operation(
        vector<pg_log_entry_t> &logv,
+       boost::optional<pg_hit_set_history_t> &hset_history,
        const eversion_t &trim_to,
        bool transaction_applied,
        ObjectStore::Transaction *t) = 0;
@@ -492,6 +493,8 @@
      PGTransaction *t,                    ///< [in] trans to execute
      const eversion_t &trim_to,           ///< [in] trim log to here
      vector<pg_log_entry_t> &log_entries, ///< [in] log entries for t
+     /// [in] hitset history (if updated with this transaction)
+     boost::optional<pg_hit_set_history_t> &hset_history,
      Context *on_local_applied_sync,      ///< [in] called when applied locally
      Context *on_all_applied,             ///< [in] called when all acked
      Context *on_all_commit,              ///< [in] called when all commit
diff --git a/src/osd/PGLog.cc b/src/osd/PGLog.cc
index c58a1c4..c3addd7 100644
--- a/src/osd/PGLog.cc
+++ b/src/osd/PGLog.cc
@@ -552,6 +552,7 @@ void PGLog::merge_log(ObjectStore::Transaction& t,
   }
   if (info.last_backfill.is_max())
     info.stats = oinfo.stats;
+  info.hit_set = oinfo.hit_set;
 
   // do we have divergent entries to throw out?
   if (olog.head < log.head) {
diff --git a/src/osd/ReplicatedBackend.cc b/src/osd/ReplicatedBackend.cc
index 8856461..5a9668f 100644
--- a/src/osd/ReplicatedBackend.cc
+++ b/src/osd/ReplicatedBackend.cc
@@ -494,6 +494,7 @@ void ReplicatedBackend::submit_transaction(
   PGTransaction *_t,
   const eversion_t &trim_to,
   vector<pg_log_entry_t> &log_entries,
+  boost::optional<pg_hit_set_history_t> &hset_history,
   Context *on_local_applied_sync,
   Context *on_all_acked,
   Context *on_all_commit,
@@ -536,6 +537,7 @@ void ReplicatedBackend::submit_transaction(
     t->get_temp_cleared().size() ?
       *(t->get_temp_cleared().begin()) :hobject_t(),
     log_entries,
+    hset_history,
     &op,
     op_t);
 
@@ -546,7 +548,7 @@ void ReplicatedBackend::submit_transaction(
   }
   clear_temp_objs(t->get_temp_cleared());
 
-  parent->log_operation(log_entries, trim_to, true, &local_t);
+  parent->log_operation(log_entries, hset_history, trim_to, true, &local_t);
   local_t.append(*op_t);
   local_t.swap(*op_t);
   
diff --git a/src/osd/ReplicatedBackend.h b/src/osd/ReplicatedBackend.h
index cfad2ed..371574b 100644
--- a/src/osd/ReplicatedBackend.h
+++ b/src/osd/ReplicatedBackend.h
@@ -343,6 +343,7 @@ public:
     PGTransaction *t,
     const eversion_t &trim_to,
     vector<pg_log_entry_t> &log_entries,
+    boost::optional<pg_hit_set_history_t> &hset_history,
     Context *on_local_applied_sync,
     Context *on_all_applied,
     Context *on_all_commit,
@@ -361,6 +362,7 @@ private:
     hobject_t new_temp_oid,
     hobject_t discard_temp_oid,
     vector<pg_log_entry_t> &log_entries,
+    boost::optional<pg_hit_set_history_t> &hset_history,
     InProgressOp *op,
     ObjectStore::Transaction *op_t);
   void op_applied(InProgressOp *op);
diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc
index 1b798c1..9a4f12f 100644
--- a/src/osd/ReplicatedPG.cc
+++ b/src/osd/ReplicatedPG.cc
@@ -368,24 +368,6 @@ PerfCounters *ReplicatedPG::get_logger()
   return osd->logger;
 }
 
-// =======================
-// pg changes
-
-bool ReplicatedPG::same_for_read_since(epoch_t e)
-{
-  return (e >= info.history.same_primary_since);
-}
-
-bool ReplicatedPG::same_for_modify_since(epoch_t e)
-{
-  return (e >= info.history.same_primary_since);
-}
-
-bool ReplicatedPG::same_for_rep_modify_since(epoch_t e)
-{
-  // check osd map: same set, or primary+acker?
-  return e >= info.history.same_primary_since;
-}
 
 // ====================
 // missing objects
@@ -490,6 +472,30 @@ void ReplicatedPG::wait_for_degraded_object(const hobject_t& soid, OpRequestRef
   op->mark_delayed("waiting for degraded object");
 }
 
+bool ReplicatedPG::maybe_await_blocked_snapset(
+  const hobject_t &hoid,
+  OpRequestRef op)
+{
+  ObjectContextRef obc;
+  if (obc = object_contexts.lookup(hoid.get_head())) {
+    if (obc->is_blocked()) {
+      wait_for_blocked_object(obc->obs.oi.soid, op);
+      return true;
+    } else {
+      return false;
+    }
+  }
+  if (obc = object_contexts.lookup(hoid.get_snapdir())) {
+    if (obc->is_blocked()) {
+      wait_for_blocked_object(obc->obs.oi.soid, op);
+      return true;
+    } else {
+      return false;
+    }
+  }
+  return false;
+}
+
 void ReplicatedPG::wait_for_blocked_object(const hobject_t& soid, OpRequestRef op)
 {
   dout(10) << __func__ << " " << soid << " " << op << dendl;
@@ -634,6 +640,11 @@ int ReplicatedPG::do_command(cmdmap_t cmdmap, ostream& ss,
     handle_query_state(f.get());
     f->close_section();
 
+    f->open_object_section("agent_state");
+    if (agent_state)
+      agent_state->dump(f.get());
+    f->close_section();
+
     f->close_section();
     f->flush(odata);
     return 0;
@@ -964,6 +975,10 @@ void ReplicatedPG::do_pg_op(OpRequestRef op)
 	    result = -EOPNOTSUPP;
 	    break;
 	  }
+	  if (is_unreadable_object(oid)) {
+	    wait_for_unreadable_object(oid, op);
+	    return;
+	  }
 	  result = osd->store->read(coll, oid, 0, 0, osd_op.outdata);
 	}
       }
@@ -1233,6 +1248,35 @@ void ReplicatedPG::do_op(OpRequestRef op)
     return;
   }
 
+  // dup/replay?
+  if (op->may_write() || op->may_cache()) {
+    const pg_log_entry_t *entry = pg_log.get_log().get_request(m->get_reqid());
+    if (entry) {
+      const eversion_t& oldv = entry->version;
+      dout(3) << __func__ << " dup " << m->get_reqid()
+	      << " was " << oldv << dendl;
+      if (already_complete(oldv)) {
+	osd->reply_op_error(op, 0, oldv, entry->user_version);
+      } else {
+	if (m->wants_ack()) {
+	  if (already_ack(oldv)) {
+	    MOSDOpReply *reply = new MOSDOpReply(m, 0, get_osdmap()->get_epoch(), 0, false);
+	    reply->add_flags(CEPH_OSD_FLAG_ACK);
+	    reply->set_reply_versions(oldv, entry->user_version);
+	    osd->send_message_osd_client(reply, m->get_connection());
+	  } else {
+	    dout(10) << " waiting for " << oldv << " to ack" << dendl;
+	    waiting_for_ack[oldv].push_back(op);
+	  }
+	}
+	dout(10) << " waiting for " << oldv << " to commit" << dendl;
+	waiting_for_ondisk[oldv].push_back(op);  // always queue ondisk waiters, so that we can requeue if needed
+	op->mark_delayed("waiting for ondisk");
+      }
+      return;
+    }
+  }
+
   ObjectContextRef obc;
   bool can_create = op->may_write() || op->may_cache();
   hobject_t missing_oid;
@@ -1242,7 +1286,17 @@ void ReplicatedPG::do_op(OpRequestRef op)
 		m->get_pg().ps(),
 		m->get_object_locator().get_pool(),
 		m->get_object_locator().nspace);
-  int r = find_object_context(oid, &obc, can_create, &missing_oid);
+
+  // io blocked on obc?
+  if (((m->get_flags() & CEPH_OSD_FLAG_FLUSH) == 0) &&
+      maybe_await_blocked_snapset(oid, op)) {
+    return;
+  }
+
+  int r = find_object_context(
+    oid, &obc, can_create,
+    m->get_flags() & CEPH_OSD_FLAG_MAP_SNAP_CLONE,
+    &missing_oid);
 
   if (r == -EAGAIN) {
     // If we're not the primary of this OSD, and we have
@@ -1256,11 +1310,23 @@ void ReplicatedPG::do_op(OpRequestRef op)
       wait_for_unreadable_object(missing_oid, op);
       return;
     }
-  } else if (r == 0 && is_unreadable_object(obc->obs.oi.soid)) {
-    dout(10) << __func__ << ": clone " << obc->obs.oi.soid
-	     << " is unreadable, waiting" << dendl;
-    wait_for_unreadable_object(obc->obs.oi.soid, op);
-    return;
+  } else if (r == 0) {
+    if (is_unreadable_object(obc->obs.oi.soid)) {
+      dout(10) << __func__ << ": clone " << obc->obs.oi.soid
+	       << " is unreadable, waiting" << dendl;
+      wait_for_unreadable_object(obc->obs.oi.soid, op);
+      return;
+    }
+
+    // degraded object?  (the check above was for head; this could be a clone)
+    if (write_ordered &&
+	obc->obs.oi.soid.snap != CEPH_NOSNAP &&
+	is_degraded_object(obc->obs.oi.soid)) {
+      dout(10) << __func__ << ": clone " << obc->obs.oi.soid
+	       << " is degraded, waiting" << dendl;
+      wait_for_degraded_object(obc->obs.oi.soid, op);
+      return;
+    }
   }
 
   if (hit_set) {
@@ -1338,7 +1404,8 @@ void ReplicatedPG::do_op(OpRequestRef op)
 	if (src_oid.is_head() && is_missing_object(src_oid)) {
 	  wait_for_unreadable_object(src_oid, op);
 	} else if ((r = find_object_context(
-		      src_oid, &sobc, false, &wait_oid)) == -EAGAIN) {
+		      src_oid, &sobc, false, false,
+		      &wait_oid)) == -EAGAIN) {
 	  // missing the specific snap we need; requeue and wait.
 	  wait_for_unreadable_object(wait_oid, op);
 	} else if (r) {
@@ -1389,27 +1456,25 @@ void ReplicatedPG::do_op(OpRequestRef op)
     for (vector<snapid_t>::iterator p = obc->ssc->snapset.clones.begin();
 	 p != obc->ssc->snapset.clones.end();
 	 ++p) {
-      object_locator_t src_oloc;
-      get_src_oloc(m->get_oid(), m->get_object_locator(), src_oloc);
       hobject_t clone_oid = obc->obs.oi.soid;
       clone_oid.snap = *p;
       if (!src_obc.count(clone_oid)) {
-	ObjectContextRef sobc;
-	hobject_t wait_oid;
+	if (is_unreadable_object(clone_oid)) {
+	  wait_for_unreadable_object(clone_oid, op);
+	  return;
+	}
 
-	int r = find_object_context(clone_oid, &sobc, false, &wait_oid);
-	if (r == -EAGAIN) {
-	  // missing the specific snap we need; requeue and wait.
-	  wait_for_unreadable_object(wait_oid, op);
-	} else if (r) {
-	  if (!maybe_handle_cache(op, write_ordered, sobc, r, wait_oid, true))
-	    osd->reply_op_error(op, r);
+	ObjectContextRef sobc = get_object_context(clone_oid, false);
+	if (!sobc) {
+	  if (!maybe_handle_cache(op, write_ordered, sobc, -ENOENT, clone_oid, true))
+	    osd->reply_op_error(op, -ENOENT);
+	  return;
 	} else {
 	  dout(10) << " clone_oid " << clone_oid << " obc " << sobc << dendl;
 	  src_obc[clone_oid] = sobc;
 	  continue;
 	}
-	return;
+	assert(0); // unreachable
       } else {
 	continue;
       }
@@ -1610,7 +1675,8 @@ void ReplicatedPG::promote_object(OpRequestRef op, ObjectContextRef obc,
   oloc.pool = pool.info.tier_of;
   start_copy(cb, obc, obc->obs.oi.soid, oloc, 0,
 	     CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY |
-	     CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE,
+	     CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE |
+	     CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE,
 	     obc->obs.oi.soid.snap == CEPH_NOSNAP);
 
   assert(obc->is_blocked());
@@ -1633,34 +1699,6 @@ void ReplicatedPG::execute_ctx(OpContext *ctx)
   ctx->op_t = pgbackend->get_transaction();
 
   if (op->may_write() || op->may_cache()) {
-    // dup/replay?
-    const pg_log_entry_t *entry = pg_log.get_log().get_request(ctx->reqid);
-    if (entry) {
-      const eversion_t& oldv = entry->version;
-      dout(3) << "do_op dup " << ctx->reqid << " was " << oldv << dendl;
-      if (already_complete(oldv)) {
-	reply_ctx(ctx, 0, oldv, entry->user_version);
-      } else {
-	close_op_ctx(ctx, -EBUSY);
-
-	if (m->wants_ack()) {
-	  if (already_ack(oldv)) {
-	    MOSDOpReply *reply = new MOSDOpReply(m, 0, get_osdmap()->get_epoch(), 0, false);
-	    reply->add_flags(CEPH_OSD_FLAG_ACK);
-	    reply->set_reply_versions(oldv, entry->user_version);
-	    osd->send_message_osd_client(reply, m->get_connection());
-	  } else {
-	    dout(10) << " waiting for " << oldv << " to ack" << dendl;
-	    waiting_for_ack[oldv].push_back(op);
-	  }
-	}
-	dout(10) << " waiting for " << oldv << " to commit" << dendl;
-	waiting_for_ondisk[oldv].push_back(op);  // always queue ondisk waiters, so that we can requeue if needed
-	op->mark_delayed("waiting for ondisk");
-      }
-      return;
-    }
-
     op->mark_started();
 
     // snap
@@ -2228,18 +2266,34 @@ ReplicatedPG::RepGather *ReplicatedPG::trim_object(const hobject_t &coid)
     derr << __func__ << "could not find coid " << coid << dendl;
     assert(0);
   }
+  assert(obc->ssc);
+
+  if (!obc->get_snaptrimmer_write()) {
+    dout(10) << __func__ << ": Unable to get a wlock on " << coid << dendl;
+    return NULL;
+  }
+
+  hobject_t snapoid(
+    coid.oid, coid.get_key(),
+    obc->ssc->snapset.head_exists ? CEPH_NOSNAP:CEPH_SNAPDIR, coid.hash,
+    info.pgid.pool(), coid.get_namespace());
+  ObjectContextRef snapset_obc = get_object_context(snapoid, false);
+
+  if (!snapset_obc->get_snaptrimmer_write()) {
+    dout(10) << __func__ << ": Unable to get a wlock on " << snapoid << dendl;
+    list<OpRequestRef> to_wake;
+    bool requeue_recovery = false;
+    bool requeue_snaptrimmer = false;
+    obc->put_write(&to_wake, &requeue_recovery, &requeue_snaptrimmer);
+    assert(to_wake.empty());
+    assert(!requeue_recovery);
+    return NULL;
+  }
 
   object_info_t &coi = obc->obs.oi;
   set<snapid_t> old_snaps(coi.snaps.begin(), coi.snaps.end());
   assert(old_snaps.size());
 
-  // get snap set context
-  if (!obc->ssc)
-    obc->ssc = get_snapset_context(
-      coid,
-      false);
-
-  assert(obc->ssc);
   SnapSet& snapset = obc->ssc->snapset;
 
   dout(10) << coid << " old_snaps " << old_snaps
@@ -2248,6 +2302,9 @@ ReplicatedPG::RepGather *ReplicatedPG::trim_object(const hobject_t &coid)
 
   RepGather *repop = simple_repop_create(obc);
   OpContext *ctx = repop->ctx;
+  ctx->snapset_obc = snapset_obc;
+  ctx->lock_to_release = OpContext::W_LOCK;
+  ctx->release_snapset_obc = true;
   ctx->at_version = get_next_version();
 
   PGBackend::PGTransaction *t = ctx->op_t;
@@ -2272,30 +2329,34 @@ ReplicatedPG::RepGather *ReplicatedPG::trim_object(const hobject_t &coid)
 	break;
     assert(p != snapset.clones.end());
     object_stat_sum_t delta;
+    delta.num_bytes -= snapset.get_clone_bytes(last);
+
     if (p != snapset.clones.begin()) {
       // not the oldest... merge overlap into next older clone
       vector<snapid_t>::iterator n = p - 1;
-      interval_set<uint64_t> keep;
-      keep.union_of(
-	snapset.clone_overlap[*n],
-	snapset.clone_overlap[*p]);
-      add_interval_usage(keep, delta);  // not deallocated
+      hobject_t prev_coid = coid;
+      prev_coid.snap = *n;
+      bool adjust_prev_bytes = is_present_clone(prev_coid);
+
+      if (adjust_prev_bytes)
+	delta.num_bytes -= snapset.get_clone_bytes(*n);
+
       snapset.clone_overlap[*n].intersection_of(
 	snapset.clone_overlap[*p]);
-    } else {
-      add_interval_usage(
-	snapset.clone_overlap[last],
-	delta);  // not deallocated
+
+      if (adjust_prev_bytes)
+	delta.num_bytes += snapset.get_clone_bytes(*n);
     }
     delta.num_objects--;
     if (coi.is_dirty())
       delta.num_objects_dirty--;
+    if (coi.is_omap())
+      delta.num_objects_omap--;
     if (coi.is_whiteout()) {
       dout(20) << __func__ << " trimming whiteout on " << coid << dendl;
       delta.num_whiteouts--;
     }
     delta.num_object_clones--;
-    delta.num_bytes -= snapset.clone_size[last];
     info.stats.stats.add(delta, obc->obs.oi.category);
     obc->obs.exists = false;
 
@@ -2369,12 +2430,6 @@ ReplicatedPG::RepGather *ReplicatedPG::trim_object(const hobject_t &coid)
   // save head snapset
   dout(10) << coid << " new snapset " << snapset << dendl;
 
-  hobject_t snapoid(
-    coid.oid, coid.get_key(),
-    snapset.head_exists ? CEPH_NOSNAP:CEPH_SNAPDIR, coid.hash,
-    info.pgid.pool(), coid.get_namespace());
-  ctx->snapset_obc = get_object_context(snapoid, false);
-
   if (snapset.clones.empty() && !snapset.head_exists) {
     dout(10) << coid << " removing " << snapoid << dendl;
     ctx->log.push_back(
@@ -3196,7 +3251,7 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
 	  break;
 	}
 	if (oi.is_dirty()) {
-	  result = start_flush(ctx, false);
+	  result = start_flush(ctx, false, NULL);
 	} else {
 	  result = 0;
 	}
@@ -3219,11 +3274,20 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
 	  result = 0;
 	  break;
 	}
+	hobject_t missing;
 	if (oi.is_dirty()) {
-	  result = start_flush(ctx, true);
+	  result = start_flush(ctx, true, &missing);
 	} else {
 	  result = 0;
 	}
+	// Check special return value which has set missing_return
+        if (result == -ENOENT) {
+          dout(10) << __func__ << " CEPH_OSD_OP_CACHE_FLUSH got ENOENT" << dendl;
+	  assert(!missing.is_min());
+	  wait_for_unreadable_object(missing, ctx->op);
+	  // Error code which is used elsewhere when wait_for_unreadable_object() is used
+	  result = -EAGAIN;
+	}
       }
       break;
 
@@ -3966,7 +4030,6 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
 
       // -- fancy writers --
     case CEPH_OSD_OP_APPEND:
-      ++ctx->num_write;
       {
 	// just do it inline; this works because we are happy to execute
 	// fancy op on replicas as well.
@@ -3977,7 +4040,7 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
 	newop.op.extent.length = op.extent.length;
 	newop.op.extent.truncate_seq = oi.truncate_seq;
         newop.indata = osd_op.indata;
-	do_osd_ops(ctx, nops);
+	result = do_osd_ops(ctx, nops);
 	osd_op.outdata.claim(newop.outdata);
       }
       break;
@@ -4466,12 +4529,19 @@ int ReplicatedPG::_verify_no_head_clones(const hobject_t& soid,
        ++p) {
     hobject_t clone_oid = soid;
     clone_oid.snap = *p;
+    if (is_missing_object(clone_oid))
+      return -EBUSY;
     ObjectContextRef clone_obc = get_object_context(clone_oid, false);
     if (clone_obc && clone_obc->obs.exists) {
       dout(10) << __func__ << " cannot evict head before clone "
 	       << clone_oid << dendl;
       return -EBUSY;
     }
+    if (copy_ops.count(clone_oid)) {
+      dout(10) << __func__ << " cannot evict head, pending promote on clone "
+	       << clone_oid << dendl;
+      return -EBUSY;
+    }
   }
   return 0;
 }
@@ -4507,7 +4577,12 @@ inline int ReplicatedPG::_delete_oid(OpContext *ctx, bool no_whiteout)
   }
 
   ctx->delta_stats.num_wr++;
-  ctx->delta_stats.num_bytes -= oi.size;
+  if (soid.is_snap()) {
+    assert(ctx->obc->ssc->snapset.clone_overlap.count(soid.snap));
+    ctx->delta_stats.num_bytes -= ctx->obc->ssc->snapset.get_clone_bytes(soid.snap);
+  } else {
+    ctx->delta_stats.num_bytes -= oi.size;
+  }
   oi.size = 0;
 
   // cache: writeback: set whiteout on delete?
@@ -4521,6 +4596,8 @@ inline int ReplicatedPG::_delete_oid(OpContext *ctx, bool no_whiteout)
   }
 
   ctx->delta_stats.num_objects--;
+  if (soid.is_snap())
+    ctx->delta_stats.num_object_clones--;
   if (oi.is_whiteout()) {
     dout(20) << __func__ << " deleting whiteout on " << soid << dendl;
     ctx->delta_stats.num_whiteouts--;
@@ -4547,11 +4624,9 @@ int ReplicatedPG::_rollback_to(OpContext *ctx, ceph_osd_op& op)
   int ret = find_object_context(
     hobject_t(soid.oid, soid.get_key(), snapid, soid.hash, info.pgid.pool(),
 	      soid.get_namespace()),
-    &rollback_to, false, &missing_oid);
+    &rollback_to, false, false, &missing_oid);
   if (ret == -EAGAIN) {
-    /* a different problem, like degraded pool
-     * with not-yet-restored object. We shouldn't have been able
-     * to get here; recovery should have completed first! */
+    /* clone must be missing */
     assert(is_missing_object(missing_oid));
     dout(20) << "_rollback_to attempted to roll back to a missing object "
 	     << missing_oid << " (requested snapid: ) " << snapid << dendl;
@@ -4694,6 +4769,19 @@ void ReplicatedPG::make_writeable(OpContext *ctx)
     }
   }
 
+  if ((ctx->new_obs.exists &&
+       ctx->new_obs.oi.is_omap()) &&
+      (!ctx->obc->obs.exists ||
+       !ctx->obc->obs.oi.is_omap())) {
+    ++ctx->delta_stats.num_objects_omap;
+  }
+  if ((!ctx->new_obs.exists ||
+       !ctx->new_obs.oi.is_omap()) &&
+      (ctx->obc->obs.exists &&
+       ctx->obc->obs.oi.is_omap())) {
+    --ctx->delta_stats.num_objects_omap;
+  }
+
   // use newer snapc?
   if (ctx->new_snapset.seq > snapc.seq) {
     snapc.seq = ctx->new_snapset.seq;
@@ -4702,9 +4790,9 @@ void ReplicatedPG::make_writeable(OpContext *ctx)
   }
 
   if (ctx->obs->exists)
-    filter_snapc(snapc);
+    filter_snapc(snapc.snaps);
   
-  if (ctx->obs->exists &&               // head exist(ed)
+  if ((ctx->obs->exists && !ctx->obs->oi.is_whiteout()) && // head exist(ed)
       snapc.snaps.size() &&                 // there are snaps
       snapc.snaps[0] > ctx->new_snapset.seq) {  // existing object is old
     // clone
@@ -4751,6 +4839,8 @@ void ReplicatedPG::make_writeable(OpContext *ctx)
       dout(20) << __func__ << " cloning whiteout on " << soid << " to " << coid << dendl;
       ctx->delta_stats.num_whiteouts++;
     }
+    if (snap_oi->is_omap())
+      ctx->delta_stats.num_objects_omap++;
     ctx->delta_stats.num_object_clones++;
     ctx->new_snapset.clones.push_back(coid.snap);
     ctx->new_snapset.clone_size[coid.snap] = ctx->obs->oi.size;
@@ -4777,11 +4867,17 @@ void ReplicatedPG::make_writeable(OpContext *ctx)
 
   // update most recent clone_overlap and usage stats
   if (ctx->new_snapset.clones.size() > 0) {
-    interval_set<uint64_t> &newest_overlap = ctx->new_snapset.clone_overlap.rbegin()->second;
-    ctx->modified_ranges.intersection_of(newest_overlap);
-    // modified_ranges is still in use by the clone
-    add_interval_usage(ctx->modified_ranges, ctx->delta_stats);
-    newest_overlap.subtract(ctx->modified_ranges);
+    /* we need to check whether the most recent clone exists, if it's been evicted,
+     * it's not included in the stats */
+    hobject_t last_clone_oid = soid;
+    last_clone_oid.snap = ctx->new_snapset.clone_overlap.rbegin()->first;
+    if (is_present_clone(last_clone_oid)) {
+      interval_set<uint64_t> &newest_overlap = ctx->new_snapset.clone_overlap.rbegin()->second;
+      ctx->modified_ranges.intersection_of(newest_overlap);
+      // modified_ranges is still in use by the clone
+      add_interval_usage(ctx->modified_ranges, ctx->delta_stats);
+      newest_overlap.subtract(ctx->modified_ranges);
+    }
   }
   
   // prepend transaction to op_t
@@ -4978,7 +5074,7 @@ int ReplicatedPG::prepare_transaction(OpContext *ctx)
   return result;
 }
 
-void ReplicatedPG::finish_ctx(OpContext *ctx, int log_op_type)
+void ReplicatedPG::finish_ctx(OpContext *ctx, int log_op_type, bool maintain_ssc)
 {
   const hobject_t& soid = ctx->obs->oi.soid;
   dout(20) << __func__ << " " << soid << " " << ctx
@@ -4988,7 +5084,7 @@ void ReplicatedPG::finish_ctx(OpContext *ctx, int log_op_type)
   // snapset
   bufferlist bss;
 
-  if (soid.snap == CEPH_NOSNAP) {
+  if (soid.snap == CEPH_NOSNAP && maintain_ssc) {
     ::encode(ctx->new_snapset, bss);
     assert(ctx->new_obs.exists == ctx->new_snapset.head_exists);
 
@@ -5139,7 +5235,15 @@ void ReplicatedPG::finish_ctx(OpContext *ctx, int log_op_type)
 
   // apply new object state.
   ctx->obc->obs = ctx->new_obs;
-  ctx->obc->ssc->snapset = ctx->new_snapset;
+
+  if (!maintain_ssc && soid.is_head()) {
+    ctx->obc->ssc->exists = false;
+    ctx->obc->ssc->snapset = SnapSet();
+  } else {
+    ctx->obc->ssc->exists = true;
+    ctx->obc->ssc->snapset = ctx->new_snapset;
+  }
+
   info.stats.stats.add(ctx->delta_stats, ctx->obs->oi.category);
 
   for (set<pg_shard_t>::iterator i = backfill_targets.begin();
@@ -5410,6 +5514,8 @@ void ReplicatedPG::_copy_some(ObjectContextRef obc, CopyOpRef cop)
     flags |= CEPH_OSD_FLAG_IGNORE_CACHE;
   if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY)
     flags |= CEPH_OSD_FLAG_IGNORE_OVERLAY;
+  if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE)
+    flags |= CEPH_OSD_FLAG_MAP_SNAP_CLONE;
 
   C_GatherBuilder gather(g_ceph_context);
 
@@ -5475,6 +5581,9 @@ void ReplicatedPG::process_copy_chunk(hobject_t oid, ceph_tid_t tid, int r)
     return;
   }
 
+  if (cop->omap.size())
+    cop->results.has_omap = true;
+
   if (r >= 0 && pool.info.require_rollback() && cop->omap.size()) {
     r = -EOPNOTSUPP;
   }
@@ -5487,7 +5596,7 @@ void ReplicatedPG::process_copy_chunk(hobject_t oid, ceph_tid_t tid, int r)
 
   assert(cop->rval >= 0);
 
-  if (oid.snap < CEPH_NOSNAP) {
+  if (oid.snap < CEPH_NOSNAP && !cop->results.snaps.empty()) {
     // verify snap hasn't been deleted
     vector<snapid_t>::iterator p = cop->results.snaps.begin();
     while (p != cop->results.snaps.end()) {
@@ -5671,6 +5780,14 @@ void ReplicatedPG::finish_copyfrom(OpContext *ctx)
     --ctx->delta_stats.num_whiteouts;
   }
 
+  if (cb->results->has_omap) {
+    dout(10) << __func__ << " setting omap flag on " << obs.oi.soid << dendl;
+    obs.oi.set_flag(object_info_t::FLAG_OMAP);
+  } else {
+    dout(10) << __func__ << " clearing omap flag on " << obs.oi.soid << dendl;
+    obs.oi.clear_flag(object_info_t::FLAG_OMAP);
+  }
+
   interval_set<uint64_t> ch;
   if (obs.oi.size > 0)
     ch.insert(0, obs.oi.size);
@@ -5707,6 +5824,42 @@ void ReplicatedPG::finish_promote(int r, OpRequestRef op,
     results->started_temp_obj = false;
   }
 
+  if (r == -ENOENT && soid.is_snap()) {
+    dout(10) << __func__
+	     << ": enoent while trying to promote clone, " << soid
+	     << " must have been trimmed, removing from snapset"
+	     << dendl;
+    hobject_t head(soid.get_head());
+    ObjectContextRef obc = get_object_context(head, false);
+    assert(obc);
+    RepGather *repop = simple_repop_create(obc);
+    OpContext *tctx = repop->ctx;
+    tctx->at_version = get_next_version();
+    filter_snapc(tctx->new_snapset.snaps);
+    vector<snapid_t> new_clones(tctx->new_snapset.clones.size());
+    for (vector<snapid_t>::iterator i = tctx->new_snapset.clones.begin();
+	 i != tctx->new_snapset.clones.end();
+	 ++i) {
+      if (*i != soid.snap)
+	new_clones.push_back(*i);
+    }
+    tctx->new_snapset.clones.swap(new_clones);
+    tctx->new_snapset.clone_overlap.erase(soid.snap);
+    tctx->new_snapset.clone_size.erase(soid.snap);
+
+    // take RWWRITE lock for duration of our local write.  ignore starvation.
+    if (!obc->rwstate.take_write_lock()) {
+      assert(0 == "problem!");
+    }
+    tctx->lock_to_release = OpContext::W_LOCK;
+    dout(20) << __func__ << " took lock on obc, " << obc->rwstate << dendl;
+
+    finish_ctx(tctx, pg_log_entry_t::PROMOTE);
+
+    simple_repop_submit(repop);
+    return;
+  }
+
   bool whiteout = false;
   if (r == -ENOENT &&
       soid.snap == CEPH_NOSNAP &&
@@ -5747,6 +5900,12 @@ void ReplicatedPG::finish_promote(int r, OpRequestRef op,
     dout(20) << __func__ << " creating whiteout on " << soid << dendl;
     osd->logger->inc(l_osd_tier_whiteout);
   } else {
+    if (results->has_omap) {
+      dout(10) << __func__ << " setting omap flag on " << soid << dendl;
+      tctx->new_obs.oi.set_flag(object_info_t::FLAG_OMAP);
+      ++tctx->delta_stats.num_objects_omap;
+    }
+
     tctx->op_t->append(results->final_tx);
     delete results->final_tx;
     results->final_tx = NULL;
@@ -5754,7 +5913,6 @@ void ReplicatedPG::finish_promote(int r, OpRequestRef op,
       tctx->discard_temp_oid = results->temp_oid;
     }
     tctx->new_obs.oi.size = results->object_size;
-    tctx->delta_stats.num_bytes += results->object_size;
     tctx->new_obs.oi.category = results->category;
     tctx->new_obs.oi.user_version = results->user_version;
 
@@ -5771,7 +5929,7 @@ void ReplicatedPG::finish_promote(int r, OpRequestRef op,
 	vector<snapid_t>::iterator p = snapset.snaps.begin();
 	while (p != snapset.snaps.end() && *p > soid.snap)
 	  ++p;
-	assert(p != snapset.snaps.end() && *p == soid.snap);
+	assert(p != snapset.snaps.end());
 	do {
 	  tctx->new_obs.oi.snaps.push_back(*p);
 	  ++p;
@@ -5779,6 +5937,14 @@ void ReplicatedPG::finish_promote(int r, OpRequestRef op,
       }
       dout(20) << __func__ << " snaps " << tctx->new_obs.oi.snaps << dendl;
       assert(!tctx->new_obs.oi.snaps.empty());
+      assert(obc->ssc->snapset.clone_size.count(soid.snap));
+      assert(obc->ssc->snapset.clone_size[soid.snap] ==
+	     results->object_size);
+      assert(obc->ssc->snapset.clone_overlap.count(soid.snap));
+
+      tctx->delta_stats.num_bytes += obc->ssc->snapset.get_clone_bytes(soid.snap);
+    } else {
+      tctx->delta_stats.num_bytes += results->object_size;
     }
   }
 
@@ -5887,7 +6053,7 @@ struct C_Flush : public Context {
   }
 };
 
-int ReplicatedPG::start_flush(OpContext *ctx, bool blocking)
+int ReplicatedPG::start_flush(OpContext *ctx, bool blocking, hobject_t *pmissing)
 {
   const object_info_t& oi = ctx->obc->obs.oi;
   const hobject_t& soid = oi.soid;
@@ -5908,6 +6074,12 @@ int ReplicatedPG::start_flush(OpContext *ctx, bool blocking)
       hobject_t next = soid;
       next.snap = *p;
       assert(next.snap < soid.snap);
+      if (pg_log.get_missing().is_missing(next)) {
+	dout(10) << __func__ << " missing clone is " << next << dendl;
+	if (pmissing)
+	  *pmissing = next;
+	return -ENOENT;
+      }
       ObjectContextRef older_obc = get_object_context(next, false);
       if (older_obc) {
 	dout(20) << __func__ << " next oldest clone is " << older_obc->obs.oi
@@ -5961,6 +6133,66 @@ int ReplicatedPG::start_flush(OpContext *ctx, bool blocking)
     cancel_flush(fop, false);
   }
 
+  // construct a SnapContext appropriate for this clone/head
+  SnapContext dsnapc;
+  SnapContext snapc;
+  if (soid.snap == CEPH_NOSNAP) {
+    snapc.seq = snapset.seq;
+    snapc.snaps = snapset.snaps;
+
+    if (!snapset.clones.empty() && snapset.clones.back() != snapset.seq) {
+      dsnapc.seq = snapset.clones.back();
+      vector<snapid_t>::iterator p = snapset.snaps.begin();
+      while (p != snapset.snaps.end() && *p > dsnapc.seq)
+	++p;
+      dsnapc.snaps = vector<snapid_t>(p, snapset.snaps.end());
+    }
+  } else {
+    vector<snapid_t>::iterator citer = std::find(
+      snapset.clones.begin(),
+      snapset.clones.end(),
+      soid.snap);
+    assert(citer != snapset.clones.end());
+    snapid_t prev_snapc = (citer == snapset.clones.begin()) ?
+      snapid_t(0) : *(citer - 1);
+
+    vector<snapid_t>::iterator p = snapset.snaps.begin();
+    while (p != snapset.snaps.end() && *p >= oi.snaps.back())
+      ++p;
+    snapc.snaps = vector<snapid_t>(p, snapset.snaps.end());
+
+    // we may need to send a delete first
+    while (p != snapset.snaps.end() && *p > prev_snapc)
+      ++p;
+    dsnapc.snaps = vector<snapid_t>(p, snapset.snaps.end());
+
+    if (dsnapc.snaps.empty()) {
+      snapc.seq = prev_snapc;
+    } else {
+      dsnapc.seq = prev_snapc;
+      snapc.seq = oi.snaps.back() - 1;
+    }
+  }
+
+  object_locator_t base_oloc(soid);
+  base_oloc.pool = pool.info.tier_of;
+
+  if (!dsnapc.snaps.empty()) {
+    ObjectOperation o;
+    o.remove();
+    osd->objecter_lock.Lock();
+    osd->objecter->mutate(
+      soid.oid,
+      base_oloc,
+      o,
+      dsnapc,
+      oi.mtime,
+      CEPH_OSD_FLAG_IGNORE_OVERLAY | CEPH_OSD_FLAG_ORDERSNAP,
+      NULL,
+      NULL /* no callback, we'll rely on the ordering w.r.t the next op */);
+    osd->objecter_lock.Unlock();
+  }
+
   FlushOpRef fop(new FlushOp);
   fop->ctx = ctx;
   fop->flushed_version = oi.user_version;
@@ -5975,27 +6207,10 @@ int ReplicatedPG::start_flush(OpContext *ctx, bool blocking)
     o.copy_from(soid.oid.name, soid.snap, oloc, oi.user_version,
 		CEPH_OSD_COPY_FROM_FLAG_FLUSH |
 		CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY |
-		CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE);
+		CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE |
+		CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE);
   }
   C_Flush *fin = new C_Flush(this, soid, get_last_peering_reset());
-  object_locator_t base_oloc(soid);
-  base_oloc.pool = pool.info.tier_of;
-
-  // construct a SnapContext appropriate for this clone/head
-  SnapContext snapc;
-  if (soid.snap == CEPH_NOSNAP) {
-    snapc.seq = snapset.seq;
-    snapc.snaps = snapset.snaps;
-  } else {
-    // we want to only include snaps that are older than the oldest
-    // snap for which we are defined, so that the object appears to
-    // have been written before that.
-    vector<snapid_t>::iterator p = snapset.snaps.begin();
-    while (p != snapset.snaps.end() && *p >= oi.snaps.back())
-      ++p;
-    snapc.snaps = vector<snapid_t>(p, snapset.snaps.end());
-    snapc.seq = oi.snaps.back() - 1;
-  }
 
   osd->objecter_lock.Lock();
   ceph_tid_t tid = osd->objecter->mutate(soid.oid, base_oloc, o, snapc, oi.mtime,
@@ -6149,8 +6364,9 @@ void ReplicatedPG::cancel_flush(FlushOpRef fop, bool requeue)
     Mutex::Locker l(osd->objecter_lock);
     osd->objecter->op_cancel(fop->objecter_tid, -ECANCELED);
   }
-  if (fop->ctx->op && requeue) {
-    requeue_op(fop->ctx->op);
+  if (requeue) {
+    if (fop->ctx->op)
+      requeue_op(fop->ctx->op);
     requeue_ops(fop->dup_ops);
   }
   if (fop->blocking) {
@@ -6170,6 +6386,16 @@ void ReplicatedPG::cancel_flush_ops(bool requeue)
   }
 }
 
+bool ReplicatedPG::is_present_clone(hobject_t coid)
+{
+  if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE)
+    return true;
+  if (is_missing_object(coid))
+    return true;
+  ObjectContextRef obc = get_object_context(coid, false);
+  return obc && obc->obs.exists;
+}
+
 // ========================================================================
 // rep op gather
 
@@ -6456,6 +6682,7 @@ void ReplicatedPG::issue_repop(RepGather *repop, utime_t now)
     repop->ctx->op_t,
     pg_trim_to,
     repop->ctx->log,
+    repop->ctx->updated_hset_history,
     onapplied_sync,
     on_all_applied,
     on_all_commit,
@@ -6474,6 +6701,7 @@ void ReplicatedBackend::issue_op(
   hobject_t new_temp_oid,
   hobject_t discard_temp_oid,
   vector<pg_log_entry_t> &log_entries,
+  boost::optional<pg_hit_set_history_t> &hset_hist,
   InProgressOp *op,
   ObjectStore::Transaction *op_t)
 {
@@ -6528,6 +6756,7 @@ void ReplicatedBackend::issue_op(
 
     wr->new_temp_oid = new_temp_oid;
     wr->discard_temp_oid = discard_temp_oid;
+    wr->updated_hit_set_history = hset_hist;
 
     get_parent()->send_message_osd_cluster(
       peer.osd, wr, get_osdmap()->get_epoch());
@@ -6782,9 +7011,8 @@ ObjectContextRef ReplicatedPG::get_object_context(const hobject_t& soid,
       pg_log_entry_t::LOST_REVERT));
   ObjectContextRef obc = object_contexts.lookup(soid);
   if (obc) {
-    dout(10) << "get_object_context " << obc << " " << soid
-	     << " " << obc->rwstate
-	     << " oi:" << obc->obs.oi << dendl;
+    dout(10) << __func__ << ": found obc in cache: " << obc
+	     << dendl;
   } else {
     // check disk
     bufferlist bv;
@@ -6794,15 +7022,28 @@ ObjectContextRef ReplicatedPG::get_object_context(const hobject_t& soid,
     } else {
       int r = pgbackend->objects_get_attr(soid, OI_ATTR, &bv);
       if (r < 0) {
-	if (!can_create)
+	if (!can_create) {
+	  dout(10) << __func__ << ": no obc for soid "
+		   << soid << " and !can_create"
+		   << dendl;
 	  return ObjectContextRef();   // -ENOENT!
+	}
 
+	dout(10) << __func__ << ": no obc for soid "
+		 << soid << " but can_create"
+		 << dendl;
 	// new object.
 	object_info_t oi(soid);
 	SnapSetContext *ssc = get_snapset_context(
 	  soid, true,
 	  soid.has_snapset() ? attrs : 0);
-	return create_object_context(oi, ssc);
+	obc = create_object_context(oi, ssc);
+	dout(10) << __func__ << ": " << obc << " " << soid
+		 << " " << obc->rwstate
+		 << " oi: " << obc->obs.oi
+		 << " ssc: " << obc->ssc
+		 << " snapset: " << obc->ssc->snapset << dendl;
+	return obc;
       }
     }
 
@@ -6833,11 +7074,15 @@ ObjectContextRef ReplicatedPG::get_object_context(const hobject_t& soid,
       }
     }
 
-    dout(10) << "get_object_context " << obc << " " << soid
-	     << " " << obc->rwstate
-	     << " oi:" << obc->obs.oi
-	     << " 0 -> 1 read " << obc->obs.oi << dendl;
+    dout(10) << __func__ << ": creating obc from disk: " << obc
+	     << dendl;
   }
+  assert(obc->ssc);
+  dout(10) << __func__ << ": " << obc << " " << soid
+	   << " " << obc->rwstate
+	   << " oi: " << obc->obs.oi
+	   << " ssc: " << obc->ssc
+	   << " snapset: " << obc->ssc->snapset << dendl;
   return obc;
 }
 
@@ -6871,6 +7116,7 @@ void ReplicatedPG::context_registry_on_change()
 int ReplicatedPG::find_object_context(const hobject_t& oid,
 				      ObjectContextRef *pobc,
 				      bool can_create,
+				      bool map_snapid_to_clone,
 				      hobject_t *pmissing)
 {
   hobject_t head(oid.oid, oid.get_key(), CEPH_NOSNAP, oid.hash,
@@ -6924,19 +7170,81 @@ int ReplicatedPG::find_object_context(const hobject_t& oid,
   }
 
   // we want a snap
-  if (pool.info.is_removed_snap(oid.snap)) {
+  if (!map_snapid_to_clone && pool.info.is_removed_snap(oid.snap)) {
     dout(10) << __func__ << " snap " << oid.snap << " is removed" << dendl;
     return -ENOENT;
   }
 
   SnapSetContext *ssc = get_snapset_context(oid, can_create);
-  if (!ssc) {
+  if (!ssc || !(ssc->exists)) {
     dout(20) << __func__ << " " << oid << " no snapset" << dendl;
     if (pmissing)
       *pmissing = head;  // start by getting the head
+    if (ssc)
+      put_snapset_context(ssc);
     return -ENOENT;
   }
 
+  if (map_snapid_to_clone) {
+    dout(10) << "find_object_context " << oid << " @" << oid.snap
+	     << " snapset " << ssc->snapset
+	     << " map_snapid_to_clone=true" << dendl;
+    if (oid.snap > ssc->snapset.seq) {
+      // already must be readable
+      ObjectContextRef obc = get_object_context(head, false);
+      dout(10) << "find_object_context " << oid << " @" << oid.snap
+	       << " snapset " << ssc->snapset
+	       << " maps to head" << dendl;
+      *pobc = obc;
+      put_snapset_context(ssc);
+      return (obc && obc->obs.exists) ? 0 : -ENOENT;
+    } else {
+      vector<snapid_t>::const_iterator citer = std::find(
+	ssc->snapset.clones.begin(),
+	ssc->snapset.clones.end(),
+	oid.snap);
+      if (citer == ssc->snapset.clones.end()) {
+	dout(10) << "find_object_context " << oid << " @" << oid.snap
+		 << " snapset " << ssc->snapset
+		 << " maps to nothing" << dendl;
+	put_snapset_context(ssc);
+	return -ENOENT;
+      }
+
+      dout(10) << "find_object_context " << oid << " @" << oid.snap
+	       << " snapset " << ssc->snapset
+	       << " maps to " << oid << dendl;
+
+      if (pg_log.get_missing().is_missing(oid)) {
+	dout(10) << "find_object_context " << oid << " @" << oid.snap
+		 << " snapset " << ssc->snapset
+		 << " " << oid << " is missing" << dendl;
+	if (pmissing)
+	  *pmissing = oid;
+	put_snapset_context(ssc);
+	return -EAGAIN;
+      }
+
+      ObjectContextRef obc = get_object_context(oid, false);
+      if (!obc || !obc->obs.exists) {
+	dout(10) << "find_object_context " << oid << " @" << oid.snap
+		 << " snapset " << ssc->snapset
+		 << " " << oid << " is not present" << dendl;
+	if (pmissing)
+	  *pmissing = oid;
+	put_snapset_context(ssc);
+	return -ENOENT;
+      }
+      dout(10) << "find_object_context " << oid << " @" << oid.snap
+	       << " snapset " << ssc->snapset
+	       << " " << oid << " HIT" << dendl;
+      *pobc = obc;
+      put_snapset_context(ssc);
+      return 0;
+    }
+    assert(0); //unreachable
+  }
+
   dout(10) << "find_object_context " << oid << " @" << oid.snap
 	   << " snapset " << ssc->snapset << dendl;
  
@@ -6989,7 +7297,7 @@ int ReplicatedPG::find_object_context(const hobject_t& oid,
   }
 
   ObjectContextRef obc = get_object_context(soid, false);
-  if (!obc) {
+  if (!obc || !obc->obs.exists) {
     dout(20) << __func__ << " missing clone " << soid << dendl;
     if (pmissing)
       *pmissing = soid;
@@ -7043,6 +7351,8 @@ void ReplicatedPG::add_object_context_to_pg_stat(ObjectContextRef obc, pg_stat_t
     stat.num_objects_dirty++;
   if (oi.is_whiteout())
     stat.num_whiteouts++;
+  if (oi.is_omap())
+    stat.num_objects_omap++;
 
   if (oi.soid.snap && oi.soid.snap != CEPH_NOSNAP && oi.soid.snap != CEPH_SNAPDIR) {
     stat.num_object_clones++;
@@ -7105,7 +7415,12 @@ SnapSetContext *ReplicatedPG::get_snapset_context(
   map<hobject_t, SnapSetContext*>::iterator p = snapset_contexts.find(
     oid.get_snapdir());
   if (p != snapset_contexts.end()) {
-    ssc = p->second;
+    if (can_create || p->second->exists) {
+      ssc = p->second;
+      ssc->exists = true;
+    } else {
+      return NULL;
+    }
   } else {
     bufferlist bv;
     if (!attrs) {
@@ -7233,6 +7548,7 @@ void ReplicatedBackend::sub_op_modify(OpRequestRef op)
     parent->update_stats(m->pg_stats);
     parent->log_operation(
       log,
+      m->updated_hit_set_history,
       m->pg_trim_to,
       update_snaps,
       &(rm->localt));
@@ -7496,7 +7812,7 @@ void ReplicatedBackend::prepare_pull(
   pg_shard_t fromshard = *p;
 
   dout(7) << "pull " << soid
-	  << "v " << v
+	  << " v " << v
 	  << " on osds " << *p
 	  << " from osd." << fromshard
 	  << dendl;
@@ -8947,8 +9263,7 @@ void ReplicatedPG::on_shutdown()
   osd->remote_reserver.cancel_reservation(info.pgid);
   osd->local_reserver.cancel_reservation(info.pgid);
 
-  if (is_primary())
-    clear_primary_state(false);  // Not staying primary
+  clear_primary_state();
   cancel_recovery();
 }
 
@@ -9622,7 +9937,7 @@ int ReplicatedPG::recover_replicas(int max, ThreadPool::TPHandle &handle)
       }
 
       if (recovering.count(soid)) {
-	dout(10) << __func__ << ": already recovering" << soid << dendl;
+	dout(10) << __func__ << ": already recovering " << soid << dendl;
 	continue;
       }
 
@@ -9631,6 +9946,18 @@ int ReplicatedPG::recover_replicas(int max, ThreadPool::TPHandle &handle)
 	continue;
       }
 
+      if (soid.is_snap() && pg_log.get_missing().is_missing(soid.get_head())) {
+	dout(10) << __func__ << ": " << soid.get_head()
+		 << " still missing on primary" << dendl;
+	continue;
+      }
+
+      if (soid.is_snap() && pg_log.get_missing().is_missing(soid.get_snapdir())) {
+	dout(10) << __func__ << ": " << soid.get_snapdir()
+		 << " still missing on primary" << dendl;
+	continue;
+      }
+
       if (pg_log.get_missing().is_missing(soid)) {
 	dout(10) << __func__ << ": " << soid << " still missing on primary" << dendl;
 	continue;
@@ -10235,7 +10562,7 @@ void ReplicatedPG::check_local()
 hobject_t ReplicatedPG::get_hit_set_current_object(utime_t stamp)
 {
   ostringstream ss;
-  ss << "hit_set_" << info.pgid << "_current_" << stamp;
+  ss << "hit_set_" << info.pgid.pgid << "_current_" << stamp;
   hobject_t hoid(sobject_t(ss.str(), CEPH_NOSNAP), "",
 		 info.pgid.ps(), info.pgid.pool(),
 		 cct->_conf->osd_hit_set_namespace);
@@ -10246,7 +10573,7 @@ hobject_t ReplicatedPG::get_hit_set_current_object(utime_t stamp)
 hobject_t ReplicatedPG::get_hit_set_archive_object(utime_t start, utime_t end)
 {
   ostringstream ss;
-  ss << "hit_set_" << info.pgid << "_archive_" << start << "_" << end;
+  ss << "hit_set_" << info.pgid.pgid << "_archive_" << start << "_" << end;
   hobject_t hoid(sobject_t(ss.str(), CEPH_NOSNAP), "",
 		 info.pgid.ps(), info.pgid.pool(),
 		 cct->_conf->osd_hit_set_namespace);
@@ -10264,7 +10591,8 @@ void ReplicatedPG::hit_set_clear()
 
 void ReplicatedPG::hit_set_setup()
 {
-  if (!is_primary() ||
+  if (!is_active() ||
+      !is_primary() ||
       !pool.info.hit_set_count ||
       !pool.info.hit_set_period ||
       pool.info.hit_set_params.get_type() == HitSet::TYPE_NONE) {
@@ -10376,15 +10704,16 @@ void ReplicatedPG::hit_set_persist()
 
   // If any archives are degraded we skip this persist request
   // account for the additional entry being added below
-  for (unsigned num = info.hit_set.history.size() + 1; num > max; --num) {
-    list<pg_hit_set_info_t>::iterator p = info.hit_set.history.begin();
-    assert(p != info.hit_set.history.end());
+  for (list<pg_hit_set_info_t>::iterator p = info.hit_set.history.begin();
+       p != info.hit_set.history.end();
+       ++p) {
     hobject_t aoid = get_hit_set_archive_object(p->begin, p->end);
 
     // Once we hit a degraded object just skip further trim
     if (is_degraded_object(aoid))
       return;
   }
+
   oid = get_hit_set_archive_object(start, now);
   // If the current object is degraded we skip this persist request
   if (is_degraded_object(oid))
@@ -10431,18 +10760,20 @@ void ReplicatedPG::hit_set_persist()
     repop->on_applied = new C_HitSetFlushing(this, flush_time);
   OpContext *ctx = repop->ctx;
   ctx->at_version = get_next_version();
+  ctx->updated_hset_history = info.hit_set;
+  pg_hit_set_history_t &updated_hit_set_hist = *(ctx->updated_hset_history);
 
-  if (info.hit_set.current_last_stamp != utime_t()) {
+  if (updated_hit_set_hist.current_last_stamp != utime_t()) {
     // FIXME: we cheat slightly here by bundling in a remove on a object
     // other the RepGather object.  we aren't carrying an ObjectContext for
     // the deleted object over this period.
     hobject_t old_obj =
-      get_hit_set_current_object(info.hit_set.current_last_stamp);
+      get_hit_set_current_object(updated_hit_set_hist.current_last_stamp);
     ctx->log.push_back(
       pg_log_entry_t(pg_log_entry_t::DELETE,
 		     old_obj,
 		     ctx->at_version,
-		     info.hit_set.current_last_update,
+		     updated_hit_set_hist.current_last_update,
 		     0,
 		     osd_reqid_t(),
 		     ctx->mtime));
@@ -10468,13 +10799,13 @@ void ReplicatedPG::hit_set_persist()
     ctx->delta_stats.num_bytes -= st.st_size;
   }
 
-  info.hit_set.current_last_update = info.last_update; // *after* above remove!
-  info.hit_set.current_info.version = ctx->at_version;
+  updated_hit_set_hist.current_last_update = info.last_update; // *after* above remove!
+  updated_hit_set_hist.current_info.version = ctx->at_version;
 
-  info.hit_set.history.push_back(info.hit_set.current_info);
+  updated_hit_set_hist.history.push_back(updated_hit_set_hist.current_info);
   hit_set_create();
-  info.hit_set.current_info = pg_hit_set_info_t();
-  info.hit_set.current_last_stamp = utime_t();
+  updated_hit_set_hist.current_info = pg_hit_set_info_t();
+  updated_hit_set_hist.current_last_stamp = utime_t();
 
   // fabricate an object_info_t and SnapSet
   obc->obs.oi.version = ctx->at_version;
@@ -10486,6 +10817,7 @@ void ReplicatedPG::hit_set_persist()
   ctx->new_snapset.head_exists = true;
 
   ctx->delta_stats.num_objects++;
+  ctx->delta_stats.num_objects_hit_set_archive++;
   ctx->delta_stats.num_bytes += bl.length();
 
   bufferlist bss;
@@ -10521,9 +10853,12 @@ void ReplicatedPG::hit_set_persist()
 
 void ReplicatedPG::hit_set_trim(RepGather *repop, unsigned max)
 {
-  for (unsigned num = info.hit_set.history.size(); num > max; --num) {
-    list<pg_hit_set_info_t>::iterator p = info.hit_set.history.begin();
-    assert(p != info.hit_set.history.end());
+  assert(repop->ctx->updated_hset_history);
+  pg_hit_set_history_t &updated_hit_set_hist =
+    *(repop->ctx->updated_hset_history);
+  for (unsigned num = updated_hit_set_hist.history.size(); num > max; --num) {
+    list<pg_hit_set_info_t>::iterator p = updated_hit_set_hist.history.begin();
+    assert(p != updated_hit_set_hist.history.end());
     hobject_t oid = get_hit_set_archive_object(p->begin, p->end);
 
     assert(!is_degraded_object(oid));
@@ -10551,7 +10886,7 @@ void ReplicatedPG::hit_set_trim(RepGather *repop, unsigned max)
     }
     if (agent_state)
       agent_state->remove_oldest_hit_set();
-    info.hit_set.history.pop_front();
+    updated_hit_set_hist.history.pop_front();
 
     struct stat st;
     int r = osd->store->stat(
@@ -10560,6 +10895,7 @@ void ReplicatedPG::hit_set_trim(RepGather *repop, unsigned max)
       &st);
     assert(r == 0);
     --repop->ctx->delta_stats.num_objects;
+    --repop->ctx->delta_stats.num_objects_hit_set_archive;
     repop->ctx->delta_stats.num_bytes -= st.st_size;
   }
 }
@@ -10571,7 +10907,8 @@ void ReplicatedPG::hit_set_trim(RepGather *repop, unsigned max)
 void ReplicatedPG::agent_setup()
 {
   assert(is_locked());
-  if (!is_primary() ||
+  if (!is_active() ||
+      !is_primary() ||
       pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE ||
       pool.info.tier_of < 0 ||
       !get_osdmap()->have_pg_pool(pool.info.tier_of)) {
@@ -10594,6 +10931,10 @@ void ReplicatedPG::agent_setup()
     dout(10) << __func__ << " keeping existing state" << dendl;
   }
 
+  if (info.stats.stats_invalid) {
+    osd->clog.warn() << "pg " << info.pgid << " has invalid (post-split) stats; must scrub before tier agent can activate";
+  }
+
   agent_choose_mode();
 }
 
@@ -10655,11 +10996,21 @@ void ReplicatedPG::agent_work(int start_max)
   for (vector<hobject_t>::iterator p = ls.begin();
        p != ls.end();
        ++p) {
+    if (p->nspace == cct->_conf->osd_hit_set_namespace) {
+      dout(20) << __func__ << " skip (hit set) " << *p << dendl;
+      osd->logger->inc(l_osd_agent_skip);
+      continue;
+    }
     if (is_degraded_object(*p)) {
       dout(20) << __func__ << " skip (degraded) " << *p << dendl;
       osd->logger->inc(l_osd_agent_skip);
       continue;
     }
+    if (is_missing_object(p->get_head())) {
+      dout(20) << __func__ << " skip (missing head) " << *p << dendl;
+      osd->logger->inc(l_osd_agent_skip);
+      continue;
+    }
     ObjectContextRef obc = get_object_context(*p, false, NULL);
     if (!obc) {
       // we didn't flush; we may miss something here.
@@ -10677,11 +11028,6 @@ void ReplicatedPG::agent_work(int start_max)
       osd->logger->inc(l_osd_agent_skip);
       continue;
     }
-    if (obc->obs.oi.soid.nspace == cct->_conf->osd_hit_set_namespace) {
-      dout(20) << __func__ << " skip (hit set) " << obc->obs.oi << dendl;
-      osd->logger->inc(l_osd_agent_skip);
-      continue;
-    }
     if (obc->is_blocked()) {
       dout(20) << __func__ << " skip (blocked) " << obc->obs.oi << dendl;
       osd->logger->inc(l_osd_agent_skip);
@@ -10731,9 +11077,8 @@ void ReplicatedPG::agent_load_hit_sets()
 
   if (agent_state->hit_set_map.size() < info.hit_set.history.size()) {
     dout(10) << __func__ << dendl;
-    for (list<pg_hit_set_info_t>::reverse_iterator p =
-	   info.hit_set.history.rbegin();
-	 p != info.hit_set.history.rend(); ++p) {
+    for (list<pg_hit_set_info_t>::iterator p = info.hit_set.history.begin();
+	 p != info.hit_set.history.end(); ++p) {
       if (agent_state->hit_set_map.count(p->begin.sec()) == 0) {
 	dout(10) << __func__ << " loading " << p->begin << "-"
 		 << p->end << dendl;
@@ -10746,16 +11091,32 @@ void ReplicatedPG::agent_load_hit_sets()
 	// check if it's still in flight
 	if (hit_set_flushing.count(p->begin)) {
 	  agent_state->add_hit_set(p->begin.sec(), hit_set_flushing[p->begin]);
-	} else {
-	  bufferlist bl;
-	  hobject_t oid = get_hit_set_archive_object(p->begin, p->end);
+	  continue;
+	}
+
+	hobject_t oid = get_hit_set_archive_object(p->begin, p->end);
+	if (is_unreadable_object(oid)) {
+	  dout(10) << __func__ << " unreadable " << oid << ", waiting" << dendl;
+	  break;
+	}
+
+	ObjectContextRef obc = get_object_context(oid, false);
+	if (!obc) {
+	  derr << __func__ << ": could not load hitset " << oid << dendl;
+	  break;
+	}
+
+	bufferlist bl;
+	{
+	  obc->ondisk_read_lock();
 	  int r = osd->store->read(coll, oid, 0, 0, bl);
 	  assert(r >= 0);
-	  HitSetRef hs(new HitSet);
-	  bufferlist::iterator pbl = bl.begin();
-	  ::decode(*hs, pbl);
-	  agent_state->add_hit_set(p->begin.sec(), hs);
+	  obc->ondisk_read_unlock();
 	}
+	HitSetRef hs(new HitSet);
+	bufferlist::iterator pbl = bl.begin();
+	::decode(*hs, pbl);
+	agent_state->add_hit_set(p->begin.sec(), hs);
       }
     }
   }
@@ -10776,17 +11137,20 @@ bool ReplicatedPG::agent_maybe_flush(ObjectContextRef& obc)
 {
   if (!obc->obs.oi.is_dirty()) {
     dout(20) << __func__ << " skip (clean) " << obc->obs.oi << dendl;
+    osd->logger->inc(l_osd_agent_skip);
     return false;
   }
 
   utime_t now = ceph_clock_now(NULL);
   if (obc->obs.oi.mtime + utime_t(pool.info.cache_min_flush_age, 0) > now) {
     dout(20) << __func__ << " skip (too young) " << obc->obs.oi << dendl;
+    osd->logger->inc(l_osd_agent_skip);
     return false;
   }
 
   if (osd->agent_is_active_oid(obc->obs.oi.soid)) {
     dout(20) << __func__ << " skip (flushing) " << obc->obs.oi << dendl;
+    osd->logger->inc(l_osd_agent_skip);
     return false;
   }
 
@@ -10806,7 +11170,15 @@ bool ReplicatedPG::agent_maybe_flush(ObjectContextRef& obc)
   ctx->at_version = get_next_version();
   ctx->on_finish = new C_AgentFlushStartStop(this, obc->obs.oi.soid);
 
-  start_flush(ctx, false);
+  int result = start_flush(ctx, false, NULL);
+  if (result != -EINPROGRESS) {
+    dout(10) << __func__ << " start_flush() failed " << obc->obs.oi
+      << " with " << result << dendl;
+    osd->logger->inc(l_osd_agent_skip);
+    if (result != -ECANCELED)
+      close_op_ctx(ctx, result);
+    return false;
+  }
 
   osd->logger->inc(l_osd_agent_flush);
   return true;
@@ -10884,8 +11256,10 @@ bool ReplicatedPG::agent_maybe_evict(ObjectContextRef& obc)
   ctx->at_version = get_next_version();
   assert(ctx->new_obs.exists);
   int r = _delete_oid(ctx, true);
+  if (obc->obs.oi.is_omap())
+    ctx->delta_stats.num_objects_omap--;
   assert(r == 0);
-  finish_ctx(ctx, pg_log_entry_t::DELETE);
+  finish_ctx(ctx, pg_log_entry_t::DELETE, false);
   simple_repop_submit(repop);
   osd->logger->inc(l_osd_tier_evict);
   osd->logger->inc(l_osd_agent_evict);
@@ -10906,15 +11280,45 @@ void ReplicatedPG::agent_choose_mode()
 {
   uint64_t divisor = pool.info.get_pg_num_divisor(info.pgid.pgid);
 
-  // adjust (effective) user objects down based on the (max) number
+  uint64_t num_user_objects = info.stats.stats.sum.num_objects;
+
+  // adjust (effective) user objects down based on the number
   // of HitSet objects, which should not count toward our total since
   // they cannot be flushed.
-  uint64_t num_user_objects = info.stats.stats.sum.num_objects;
-  if (num_user_objects > pool.info.hit_set_count)
-    num_user_objects -= pool.info.hit_set_count;
+  uint64_t unflushable = info.stats.stats.sum.num_objects_hit_set_archive;
+
+  // also exclude omap objects if ec backing pool
+  const pg_pool_t *base_pool = get_osdmap()->get_pg_pool(pool.info.tier_of);
+  assert(base_pool);
+  if (base_pool->is_erasure())
+    unflushable += info.stats.stats.sum.num_objects_omap;
+
+
+  if (num_user_objects > unflushable)
+    num_user_objects -= unflushable;
   else
     num_user_objects = 0;
 
+  // also reduce the num_dirty by num_objects_omap
+  int64_t num_dirty = info.stats.stats.sum.num_objects_dirty;
+  if (base_pool->is_erasure()) {
+    if (num_dirty > info.stats.stats.sum.num_objects_omap)
+      num_dirty -= info.stats.stats.sum.num_objects_omap;
+    else
+      num_dirty = 0;
+  }
+
+  dout(10) << __func__ << ": "
+	   << " num_objects: " << info.stats.stats.sum.num_objects
+	   << " num_bytes: " << info.stats.stats.sum.num_bytes
+	   << " num_objects_dirty: " << info.stats.stats.sum.num_objects_dirty
+	   << " num_objects_omap: " << info.stats.stats.sum.num_objects_omap
+	   << " num_dirty: " << num_dirty
+	   << " num_user_objects: " << num_user_objects
+	   << " pool.info.target_max_bytes: " << pool.info.target_max_bytes
+	   << " pool.info.target_max_objects: " << pool.info.target_max_objects
+	   << dendl;
+
   // get dirty, full ratios
   uint64_t dirty_micro = 0;
   uint64_t full_micro = 0;
@@ -10922,15 +11326,15 @@ void ReplicatedPG::agent_choose_mode()
     uint64_t avg_size = info.stats.stats.sum.num_bytes /
       info.stats.stats.sum.num_objects;
     dirty_micro =
-      info.stats.stats.sum.num_objects_dirty * avg_size * 1000000 /
+      num_dirty * avg_size * 1000000 /
       (pool.info.target_max_bytes / divisor);
     full_micro =
-      info.stats.stats.sum.num_bytes * 1000000 /
+      num_user_objects * avg_size * 1000000 /
       (pool.info.target_max_bytes / divisor);
   }
   if (pool.info.target_max_objects) {
     uint64_t dirty_objects_micro =
-      info.stats.stats.sum.num_objects_dirty * 1000000 /
+      num_dirty * 1000000 /
       (pool.info.target_max_objects / divisor);
     if (dirty_objects_micro > dirty_micro)
       dirty_micro = dirty_objects_micro;
@@ -10951,8 +11355,13 @@ void ReplicatedPG::agent_choose_mode()
     flush_target += flush_slop;
   else
     flush_target -= MIN(flush_target, flush_slop);
-  if (dirty_micro > flush_target)
+
+  if (info.stats.stats_invalid) {
+    // idle; stats can't be trusted until we scrub.
+    dout(20) << __func__ << " stats invalid (post-split), idle" << dendl;
+  } else if (dirty_micro > flush_target) {
     flush_mode = TierAgentState::FLUSH_MODE_ACTIVE;
+  }
 
   // evict mode
   TierAgentState::evict_mode_t evict_mode = TierAgentState::EVICT_MODE_IDLE;
@@ -10964,7 +11373,9 @@ void ReplicatedPG::agent_choose_mode()
   else
     evict_target -= MIN(evict_target, evict_slop);
 
-  if (full_micro > 1000000) {
+  if (info.stats.stats_invalid) {
+    // idle; stats can't be trusted until we scrub.
+  } else if (full_micro > 1000000) {
     // evict anything clean
     evict_mode = TierAgentState::EVICT_MODE_FULL;
     evict_effort = 1000000;
@@ -11092,6 +11503,9 @@ void ReplicatedPG::_scrub(ScrubMap& scrubmap)
     if (soid.snap != CEPH_SNAPDIR)
       stat.num_objects++;
 
+    if (soid.nspace == cct->_conf->osd_hit_set_namespace)
+      stat.num_objects_hit_set_archive++;
+
     // new snapset?
     if (soid.snap == CEPH_SNAPDIR ||
 	soid.snap == CEPH_NOSNAP) {
@@ -11123,17 +11537,6 @@ void ReplicatedPG::_scrub(ScrubMap& scrubmap)
 	next_clone = hobject_t();
 	dout(20) << "  snapset " << snapset << dendl;
       }
-
-      // subtract off any clone overlap
-      for (map<snapid_t,interval_set<uint64_t> >::iterator q = snapset.clone_overlap.begin();
-	   q != snapset.clone_overlap.end();
-	   ++q) {
-	for (interval_set<uint64_t>::const_iterator r = q->second.begin();
-	     r != q->second.end();
-	     ++r) {
-	  stat.num_bytes -= r.get_len();
-	}	  
-      }
     }
 
     // basic checks.
@@ -11159,13 +11562,19 @@ void ReplicatedPG::_scrub(ScrubMap& scrubmap)
 
     dout(20) << mode << "  " << soid << " " << oi << dendl;
 
-    stat.num_bytes += oi.size;
+    if (soid.is_snap()) {
+      stat.num_bytes += snapset.get_clone_bytes(soid.snap);
+    } else {
+      stat.num_bytes += oi.size;
+    }
 
     if (!soid.is_snapdir()) {
       if (oi.is_dirty())
 	++stat.num_objects_dirty;
       if (oi.is_whiteout())
 	++stat.num_whiteouts;
+      if (oi.is_omap())
+	++stat.num_objects_omap;
     }
 
     //bufferlist data;
@@ -11285,12 +11694,17 @@ void ReplicatedPG::_scrub_finish()
   if (info.stats.stats_invalid) {
     info.stats.stats = scrub_cstat;
     info.stats.stats_invalid = false;
+
+    if (agent_state)
+      agent_choose_mode();
   }
 
   dout(10) << mode << " got "
 	   << scrub_cstat.sum.num_objects << "/" << info.stats.stats.sum.num_objects << " objects, "
 	   << scrub_cstat.sum.num_object_clones << "/" << info.stats.stats.sum.num_object_clones << " clones, "
 	   << scrub_cstat.sum.num_objects_dirty << "/" << info.stats.stats.sum.num_objects_dirty << " dirty, "
+	   << scrub_cstat.sum.num_objects_omap << "/" << info.stats.stats.sum.num_objects_omap << " omap, "
+	   << scrub_cstat.sum.num_objects_hit_set_archive << "/" << info.stats.stats.sum.num_objects_hit_set_archive << " hit_set_archive, "
 	   << scrub_cstat.sum.num_bytes << "/" << info.stats.stats.sum.num_bytes << " bytes."
 	   << dendl;
 
@@ -11298,6 +11712,10 @@ void ReplicatedPG::_scrub_finish()
       scrub_cstat.sum.num_object_clones != info.stats.stats.sum.num_object_clones ||
       (scrub_cstat.sum.num_objects_dirty != info.stats.stats.sum.num_objects_dirty &&
        !info.stats.dirty_stats_invalid) ||
+      (scrub_cstat.sum.num_objects_omap != info.stats.stats.sum.num_objects_omap &&
+       !info.stats.omap_stats_invalid) ||
+      (scrub_cstat.sum.num_objects_hit_set_archive != info.stats.stats.sum.num_objects_hit_set_archive &&
+       !info.stats.hitset_stats_invalid) ||
       scrub_cstat.sum.num_whiteouts != info.stats.stats.sum.num_whiteouts ||
       scrub_cstat.sum.num_bytes != info.stats.stats.sum.num_bytes) {
     osd->clog.error() << info.pgid << " " << mode
@@ -11305,6 +11723,8 @@ void ReplicatedPG::_scrub_finish()
 		      << scrub_cstat.sum.num_objects << "/" << info.stats.stats.sum.num_objects << " objects, "
 		      << scrub_cstat.sum.num_object_clones << "/" << info.stats.stats.sum.num_object_clones << " clones, "
 		      << scrub_cstat.sum.num_objects_dirty << "/" << info.stats.stats.sum.num_objects_dirty << " dirty, "
+		      << scrub_cstat.sum.num_objects_omap << "/" << info.stats.stats.sum.num_objects_omap << " omap, "
+		      << scrub_cstat.sum.num_objects_hit_set_archive << "/" << info.stats.stats.sum.num_objects_hit_set_archive << " hit_set_archive, "
 		      << scrub_cstat.sum.num_whiteouts << "/" << info.stats.stats.sum.num_whiteouts << " whiteouts, "
 		      << scrub_cstat.sum.num_bytes << "/" << info.stats.stats.sum.num_bytes << " bytes.\n";
     ++scrubber.shallow_errors;
@@ -11313,6 +11733,8 @@ void ReplicatedPG::_scrub_finish()
       ++scrubber.fixed;
       info.stats.stats = scrub_cstat;
       info.stats.dirty_stats_invalid = false;
+      info.stats.omap_stats_invalid = false;
+      info.stats.hitset_stats_invalid = false;
       publish_stats_to_osd();
       share_pg_info();
     }
@@ -11418,6 +11840,7 @@ boost::statechart::result ReplicatedPG::TrimmingObjects::react(const SnapTrim&)
   dout(10) << "TrimmingObjects: trimming snap " << snap_to_trim << dendl;
 
   // Get next
+  hobject_t old_pos = pos;
   int r = pg->snap_mapper.get_next_object_to_trim(snap_to_trim, &pos);
   if (r != 0 && r != -ENOENT) {
     derr << __func__ << ": get_next returned " << cpp_strerror(r) << dendl;
@@ -11431,6 +11854,12 @@ boost::statechart::result ReplicatedPG::TrimmingObjects::react(const SnapTrim&)
 
   dout(10) << "TrimmingObjects react trimming " << pos << dendl;
   RepGather *repop = pg->trim_object(pos);
+  if (!repop) {
+    dout(10) << __func__ << " could not get write lock on obj "
+	     << pos << dendl;
+    pos = old_pos;
+    return discard_event();
+  }
   assert(repop);
   repop->queue_snap_trimmer = true;
 
diff --git a/src/osd/ReplicatedPG.h b/src/osd/ReplicatedPG.h
index 4d6000d..38bdfbe 100644
--- a/src/osd/ReplicatedPG.h
+++ b/src/osd/ReplicatedPG.h
@@ -123,9 +123,11 @@ public:
     librados::snap_set_t snapset; ///< src snapset (if head)
     bool mirror_snapset;
     map<string, bufferlist> attrs; ///< src user attrs
+    bool has_omap;
     CopyResults() : object_size(0), started_temp_obj(false),
 		    final_tx(NULL), user_version(0), 
-		    should_requeue(false), mirror_snapset(false) {}
+		    should_requeue(false), mirror_snapset(false),
+		    has_omap(false) {}
   };
 
   struct CopyOp {
@@ -339,9 +341,14 @@ public:
   }
   void log_operation(
     vector<pg_log_entry_t> &logv,
+    boost::optional<pg_hit_set_history_t> &hset_history,
     const eversion_t &trim_to,
     bool transaction_applied,
     ObjectStore::Transaction *t) {
+    if (hset_history) {
+      info.hit_set = *hset_history;
+      dirty_info = true;
+    }
     append_log(logv, trim_to, *t, transaction_applied);
   }
 
@@ -451,6 +458,7 @@ public:
 
     PGBackend::PGTransaction *op_t;
     vector<pg_log_entry_t> log;
+    boost::optional<pg_hit_set_history_t> updated_hset_history;
 
     interval_set<uint64_t> modified_ranges;
     ObjectContextRef obc;
@@ -645,38 +653,45 @@ protected:
    * @return true on success, false if we are queued
    */
   bool get_rw_locks(OpContext *ctx) {
-    if (ctx->op->may_write() || ctx->op->may_cache()) {
-      /* If snapset_obc, !obc->obs->exists and we need to
-       * get a write lock on the snapdir as well as the
-       * head.  Fortunately, we are guarranteed to get a
-       * write lock on the head if !obc->obs->exists
-       */
-      if (ctx->snapset_obc) {
-	assert(!ctx->obc->obs.exists);
+    /* If snapset_obc, !obc->obs->exists and we will always take the
+     * snapdir lock *before* the head lock.  Since all callers will do
+     * this (read or write) if we get the first we will be guaranteed
+     * to get the second.
+     */
+    if (ctx->snapset_obc) {
+      assert(!ctx->obc->obs.exists);
+      if (ctx->op->may_write() || ctx->op->may_cache()) {
 	if (ctx->snapset_obc->get_write(ctx->op)) {
 	  ctx->release_snapset_obc = true;
 	  ctx->lock_to_release = OpContext::W_LOCK;
 	} else {
 	  return false;
 	}
-	// we are creating it and have the only ref
-	bool got = ctx->obc->get_write(ctx->op);
-	assert(got);
-	return true;
       } else {
-	if (ctx->obc->get_write(ctx->op)) {
-	  ctx->lock_to_release = OpContext::W_LOCK;
-	  return true;
+	assert(ctx->op->may_read());
+	if (ctx->snapset_obc->get_read(ctx->op)) {
+	  ctx->release_snapset_obc = true;
+	  ctx->lock_to_release = OpContext::R_LOCK;
 	} else {
 	  return false;
 	}
       }
+    }
+    if (ctx->op->may_write() || ctx->op->may_cache()) {
+      if (ctx->obc->get_write(ctx->op)) {
+	ctx->lock_to_release = OpContext::W_LOCK;
+	return true;
+      } else {
+	assert(!ctx->snapset_obc);
+	return false;
+      }
     } else {
       assert(ctx->op->may_read());
       if (ctx->obc->get_read(ctx->op)) {
 	ctx->lock_to_release = OpContext::R_LOCK;
 	return true;
       } else {
+	assert(!ctx->snapset_obc);
 	return false;
       }
     }
@@ -705,17 +720,33 @@ protected:
     bool requeue_recovery = false;
     bool requeue_recovery_clone = false;
     bool requeue_recovery_snapset = false;
-    if (ctx->snapset_obc && ctx->release_snapset_obc) {
-      ctx->snapset_obc->put_write(&to_req, &requeue_recovery_snapset);
-      ctx->release_snapset_obc = false;
-    }
+    bool requeue_snaptrimmer = false;
+    bool requeue_snaptrimmer_clone = false;
+    bool requeue_snaptrimmer_snapset = false;
     switch (ctx->lock_to_release) {
     case OpContext::W_LOCK:
-      ctx->obc->put_write(&to_req, &requeue_recovery);
+      if (ctx->snapset_obc && ctx->release_snapset_obc) {
+	ctx->snapset_obc->put_write(
+	  &to_req,
+	  &requeue_recovery_snapset,
+	  &requeue_snaptrimmer_snapset);
+	ctx->release_snapset_obc = false;
+      }
+      ctx->obc->put_write(
+	&to_req,
+	&requeue_recovery,
+	&requeue_snaptrimmer);
       if (ctx->clone_obc)
-	ctx->clone_obc->put_write(&to_req, &requeue_recovery_clone);
+	ctx->clone_obc->put_write(
+	  &to_req,
+	  &requeue_recovery_clone,
+	  &requeue_snaptrimmer_clone);
       break;
     case OpContext::R_LOCK:
+      if (ctx->snapset_obc && ctx->release_snapset_obc) {
+	ctx->snapset_obc->put_read(&to_req);
+	ctx->release_snapset_obc = false;
+      }
       ctx->obc->put_read(&to_req);
       break;
     case OpContext::NONE:
@@ -723,9 +754,14 @@ protected:
     default:
       assert(0);
     };
+    assert(ctx->release_snapset_obc == false);
     ctx->lock_to_release = OpContext::NONE;
     if (requeue_recovery || requeue_recovery_clone || requeue_recovery_snapset)
       osd->recovery_wq.queue(this);
+    if (requeue_snaptrimmer ||
+	requeue_snaptrimmer_clone ||
+	requeue_snaptrimmer_snapset)
+      queue_snap_trim();
     requeue_ops(to_req);
   }
 
@@ -864,6 +900,7 @@ protected:
   int find_object_context(const hobject_t& oid,
 			  ObjectContextRef *pobc,
 			  bool can_create,
+			  bool map_snapid_to_clone=false,
 			  hobject_t *missing_oid=NULL);
 
   void add_object_context_to_pg_stat(ObjectContextRef obc, pg_stat_t *stat);
@@ -990,7 +1027,7 @@ protected:
     const hobject_t& head, const hobject_t& coid,
     object_info_t *poi);
   void execute_ctx(OpContext *ctx);
-  void finish_ctx(OpContext *ctx, int log_op_type);
+  void finish_ctx(OpContext *ctx, int log_op_type, bool maintain_ssc=true);
   void reply_ctx(OpContext *ctx, int err);
   void reply_ctx(OpContext *ctx, int err, eversion_t v, version_t uv);
   void make_writeable(OpContext *ctx);
@@ -1184,12 +1221,15 @@ protected:
   // -- flush --
   map<hobject_t, FlushOpRef> flush_ops;
 
-  int start_flush(OpContext *ctx, bool blocking);
+  int start_flush(OpContext *ctx, bool blocking, hobject_t *pmissing);
   void finish_flush(hobject_t oid, ceph_tid_t tid, int r);
   int try_flush_mark_clean(FlushOpRef fop);
   void cancel_flush(FlushOpRef fop, bool requeue);
   void cancel_flush_ops(bool requeue);
 
+  /// @return false if clone has been evicted
+  bool is_present_clone(hobject_t coid);
+
   friend struct C_Flush;
 
   // -- scrub --
@@ -1322,10 +1362,6 @@ private:
   int _delete_oid(OpContext *ctx, bool no_whiteout);
   int _rollback_to(OpContext *ctx, ceph_osd_op& op);
 public:
-  bool same_for_read_since(epoch_t e);
-  bool same_for_modify_since(epoch_t e);
-  bool same_for_rep_modify_since(epoch_t e);
-
   bool is_missing_object(const hobject_t& oid) const;
   bool is_unreadable_object(const hobject_t &oid) const {
     return is_missing_object(oid) ||
@@ -1337,6 +1373,7 @@ public:
   bool is_degraded_object(const hobject_t& oid);
   void wait_for_degraded_object(const hobject_t& oid, OpRequestRef op);
 
+  bool maybe_await_blocked_snapset(const hobject_t &soid, OpRequestRef op);
   void wait_for_blocked_object(const hobject_t& soid, OpRequestRef op);
   void kick_object_context_blocked(ObjectContextRef obc);
 
diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc
index c183dca..f2cf9cd 100644
--- a/src/osd/osd_types.cc
+++ b/src/osd/osd_types.cc
@@ -46,6 +46,7 @@ const char *ceph_osd_flag_name(unsigned flag)
   case CEPH_OSD_FLAG_SKIPRWLOCKS: return "skiprwlocks";
   case CEPH_OSD_FLAG_IGNORE_OVERLAY: return "ignore_overlay";
   case CEPH_OSD_FLAG_FLUSH: return "flush";
+  case CEPH_OSD_FLAG_MAP_SNAP_CLONE: return "map_snap_clone";
   default: return "???";
   }
 }
@@ -82,7 +83,11 @@ void pg_shard_t::decode(bufferlist::iterator &bl)
 
 ostream &operator<<(ostream &lhs, const pg_shard_t &rhs)
 {
-  return lhs << '(' << rhs.osd << ',' << (unsigned)(rhs.shard) << ')';
+  if (rhs.is_undefined())
+    return lhs << "?";
+  if (rhs.shard == ghobject_t::NO_SHARD)
+    return lhs << rhs.osd;
+  return lhs << rhs.osd << '(' << (unsigned)(rhs.shard) << ')';
 }
 
 // -- osd_reqid_t --
@@ -389,6 +394,15 @@ ostream& operator<<(ostream& out, const spg_t &pg)
   return out;
 }
 
+pg_t pg_t::get_ancestor(unsigned old_pg_num) const
+{
+  int old_bits = pg_pool_t::calc_bits_of(old_pg_num);
+  int old_mask = (1 << old_bits) - 1;
+  pg_t ret = *this;
+  ret.m_seed = ceph_stable_mod(m_seed, old_pg_num, old_mask);
+  return ret;
+}
+
 bool pg_t::is_split(unsigned old_pg_num, unsigned new_pg_num, set<pg_t> *children) const
 {
   assert(m_seed < old_pg_num);
@@ -1306,11 +1320,13 @@ void object_stat_sum_t::dump(Formatter *f) const
   f->dump_int("num_objects_recovered", num_objects_recovered);
   f->dump_int("num_bytes_recovered", num_bytes_recovered);
   f->dump_int("num_keys_recovered", num_keys_recovered);
+  f->dump_int("num_objects_omap", num_objects_omap);
+  f->dump_int("num_objects_hit_set_archive", num_objects_hit_set_archive);
 }
 
 void object_stat_sum_t::encode(bufferlist& bl) const
 {
-  ENCODE_START(7, 3, bl);
+  ENCODE_START(9, 3, bl);
   ::encode(num_bytes, bl);
   ::encode(num_objects, bl);
   ::encode(num_object_clones, bl);
@@ -1330,12 +1346,14 @@ void object_stat_sum_t::encode(bufferlist& bl) const
   ::encode(num_deep_scrub_errors, bl);
   ::encode(num_objects_dirty, bl);
   ::encode(num_whiteouts, bl);
+  ::encode(num_objects_omap, bl);
+  ::encode(num_objects_hit_set_archive, bl);
   ENCODE_FINISH(bl);
 }
 
 void object_stat_sum_t::decode(bufferlist::iterator& bl)
 {
-  DECODE_START_LEGACY_COMPAT_LEN(7, 3, 3, bl);
+  DECODE_START_LEGACY_COMPAT_LEN(9, 3, 3, bl);
   ::decode(num_bytes, bl);
   if (struct_v < 3) {
     uint64_t num_kb;
@@ -1379,6 +1397,16 @@ void object_stat_sum_t::decode(bufferlist::iterator& bl)
     num_objects_dirty = 0;
     num_whiteouts = 0;
   }
+  if (struct_v >= 8) {
+    ::decode(num_objects_omap, bl);
+  } else {
+    num_objects_omap = 0;
+  }
+  if (struct_v >= 9) {
+    ::decode(num_objects_hit_set_archive, bl);
+  } else {
+    num_objects_hit_set_archive = 0;
+  }
   DECODE_FINISH(bl);
 }
 
@@ -1428,6 +1456,8 @@ void object_stat_sum_t::add(const object_stat_sum_t& o)
   num_keys_recovered += o.num_keys_recovered;
   num_objects_dirty += o.num_objects_dirty;
   num_whiteouts += o.num_whiteouts;
+  num_objects_omap += o.num_objects_omap;
+  num_objects_hit_set_archive += o.num_objects_hit_set_archive;
 }
 
 void object_stat_sum_t::sub(const object_stat_sum_t& o)
@@ -1451,6 +1481,8 @@ void object_stat_sum_t::sub(const object_stat_sum_t& o)
   num_keys_recovered -= o.num_keys_recovered;
   num_objects_dirty -= o.num_objects_dirty;
   num_whiteouts -= o.num_whiteouts;
+  num_objects_omap -= o.num_objects_omap;
+  num_objects_hit_set_archive -= o.num_objects_hit_set_archive;
 }
 
 
@@ -1560,7 +1592,7 @@ void pg_stat_t::dump_brief(Formatter *f) const
 
 void pg_stat_t::encode(bufferlist &bl) const
 {
-  ENCODE_START(15, 8, bl);
+  ENCODE_START(17, 8, bl);
   ::encode(version, bl);
   ::encode(reported_seq, bl);
   ::encode(reported_epoch, bl);
@@ -1592,12 +1624,14 @@ void pg_stat_t::encode(bufferlist &bl) const
   ::encode(dirty_stats_invalid, bl);
   ::encode(up_primary, bl);
   ::encode(acting_primary, bl);
+  ::encode(omap_stats_invalid, bl);
+  ::encode(hitset_stats_invalid, bl);
   ENCODE_FINISH(bl);
 }
 
 void pg_stat_t::decode(bufferlist::iterator &bl)
 {
-  DECODE_START_LEGACY_COMPAT_LEN(15, 8, 8, bl);
+  DECODE_START_LEGACY_COMPAT_LEN(17, 8, 8, bl);
   ::decode(version, bl);
   ::decode(reported_seq, bl);
   ::decode(reported_epoch, bl);
@@ -1691,6 +1725,20 @@ void pg_stat_t::decode(bufferlist::iterator &bl)
     up_primary = up.size() ? up[0] : -1;
     acting_primary = acting.size() ? acting[0] : -1;
   }
+  if (struct_v >= 16) {
+    ::decode(omap_stats_invalid, bl);
+  } else {
+    // if we are decoding an old encoding of this object, then the
+    // encoder may not have supported num_objects_omap accounting.
+    omap_stats_invalid = true;
+  }
+  if (struct_v >= 17) {
+    ::decode(hitset_stats_invalid, bl);
+  } else {
+    // if we are decoding an old encoding of this object, then the
+    // encoder may not have supported num_objects_hit_set_archive accounting.
+    hitset_stats_invalid = true;
+  }
   DECODE_FINISH(bl);
 }
 
@@ -2074,19 +2122,20 @@ ostream &operator<<(ostream &lhs, const pg_notify_t &notify)
 
 void pg_interval_t::encode(bufferlist& bl) const
 {
-  ENCODE_START(3, 2, bl);
+  ENCODE_START(4, 2, bl);
   ::encode(first, bl);
   ::encode(last, bl);
   ::encode(up, bl);
   ::encode(acting, bl);
   ::encode(maybe_went_rw, bl);
   ::encode(primary, bl);
+  ::encode(up_primary, bl);
   ENCODE_FINISH(bl);
 }
 
 void pg_interval_t::decode(bufferlist::iterator& bl)
 {
-  DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl);
+  DECODE_START_LEGACY_COMPAT_LEN(4, 2, 2, bl);
   ::decode(first, bl);
   ::decode(last, bl);
   ::decode(up, bl);
@@ -2098,6 +2147,12 @@ void pg_interval_t::decode(bufferlist::iterator& bl)
     if (acting.size())
       primary = acting[0];
   }
+  if (struct_v >= 4) {
+    ::decode(up_primary, bl);
+  } else {
+    if (up.size())
+      up_primary = up[0];
+  }
   DECODE_FINISH(bl);
 }
 
@@ -2113,6 +2168,8 @@ void pg_interval_t::dump(Formatter *f) const
   f->open_array_section("acting");
   for (vector<int>::const_iterator p = acting.begin(); p != acting.end(); ++p)
     f->dump_int("osd", *p);
+  f->dump_int("primary", primary);
+  f->dump_int("up_primary", up_primary);
   f->close_section();
 }
 
@@ -2129,10 +2186,12 @@ void pg_interval_t::generate_test_instances(list<pg_interval_t*>& o)
 }
 
 bool pg_interval_t::check_new_interval(
-  int old_primary,
-  int new_primary,
+  int old_acting_primary,
+  int new_acting_primary,
   const vector<int> &old_acting,
   const vector<int> &new_acting,
+  int old_up_primary,
+  int new_up_primary,
   const vector<int> &old_up,
   const vector<int> &new_up,
   epoch_t same_interval_since,
@@ -2145,8 +2204,13 @@ bool pg_interval_t::check_new_interval(
   std::ostream *out)
 {
   // remember past interval
-  if (old_primary != new_primary ||
-      new_acting != old_acting || new_up != old_up ||
+  //  NOTE: a change in the up set primary triggers an interval
+  //  change, even though the interval members in the pg_interval_t
+  //  do not change.
+  if (old_acting_primary != new_acting_primary ||
+      new_acting != old_acting ||
+      old_up_primary != new_up_primary ||
+      new_up != old_up ||
       (!(lastmap->get_pools().count(pool_id))) ||
       (lastmap->get_pools().find(pool_id)->second.min_size !=
        osdmap->get_pools().find(pool_id)->second.min_size)  ||
@@ -2157,7 +2221,8 @@ bool pg_interval_t::check_new_interval(
     i.last = osdmap->get_epoch() - 1;
     i.acting = old_acting;
     i.up = old_up;
-    i.primary = old_primary;
+    i.primary = old_acting_primary;
+    i.up_primary = old_up_primary;
 
     if (!i.acting.empty() && i.primary != -1 &&
 	i.acting.size() >=
@@ -2215,7 +2280,9 @@ bool pg_interval_t::check_new_interval(
 
 ostream& operator<<(ostream& out, const pg_interval_t& i)
 {
-  out << "interval(" << i.first << "-" << i.last << " " << i.up << "/" << i.acting;
+  out << "interval(" << i.first << "-" << i.last
+      << " up " << i.up << "(" << i.up_primary << ")"
+      << " acting " << i.acting << "(" << i.primary << ")";
   if (i.maybe_went_rw)
     out << " maybe_went_rw";
   out << ")";
@@ -3448,6 +3515,21 @@ void SnapSet::from_snap_set(const librados::snap_set_t& ss)
     snaps.push_back(*p);
 }
 
+uint64_t SnapSet::get_clone_bytes(snapid_t clone) const
+{
+  assert(clone_size.count(clone));
+  uint64_t size = clone_size.find(clone)->second;
+  assert(clone_overlap.count(clone));
+  const interval_set<uint64_t> &overlap = clone_overlap.find(clone)->second;
+  for (interval_set<uint64_t>::const_iterator i = overlap.begin();
+       i != overlap.end();
+       ++i) {
+    assert(size >= i.get_len());
+    size -= i.get_len();
+  }
+  return size;
+}
+
 // -- watch_info_t --
 
 void watch_info_t::encode(bufferlist& bl) const
diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h
index e055281..092d6cc 100644
--- a/src/osd/osd_types.h
+++ b/src/osd/osd_types.h
@@ -311,6 +311,7 @@ struct pg_t {
   }
 
   pg_t get_parent() const;
+  pg_t get_ancestor(unsigned old_pg_num) const;
 
   int print(char *o, int maxlen) const;
   bool parse(const char *s);
@@ -1124,6 +1125,8 @@ struct object_stat_sum_t {
   int64_t num_keys_recovered;
   int64_t num_objects_dirty;
   int64_t num_whiteouts;
+  int64_t num_objects_omap;
+  int64_t num_objects_hit_set_archive;
 
   object_stat_sum_t()
     : num_bytes(0),
@@ -1136,7 +1139,9 @@ struct object_stat_sum_t {
       num_bytes_recovered(0),
       num_keys_recovered(0),
       num_objects_dirty(0),
-      num_whiteouts(0)
+      num_whiteouts(0),
+      num_objects_omap(0),
+      num_objects_hit_set_archive(0)
   {}
 
   void floor(int64_t f) {
@@ -1160,9 +1165,44 @@ struct object_stat_sum_t {
     FLOOR(num_keys_recovered);
     FLOOR(num_objects_dirty);
     FLOOR(num_whiteouts);
+    FLOOR(num_objects_omap);
+    FLOOR(num_objects_hit_set_archive);
 #undef FLOOR
   }
 
+  void split(vector<object_stat_sum_t> &out) const {
+#define SPLIT(PARAM)                            \
+    for (unsigned i = 0; i < out.size(); ++i) { \
+      out[i].PARAM = PARAM / out.size();        \
+      if (i < (PARAM % out.size())) {           \
+	out[i].PARAM++;                         \
+      }                                         \
+    }                                           \
+
+    SPLIT(num_bytes);
+    SPLIT(num_objects);
+    SPLIT(num_object_clones);
+    SPLIT(num_object_copies);
+    SPLIT(num_objects_missing_on_primary);
+    SPLIT(num_objects_degraded);
+    SPLIT(num_objects_unfound);
+    SPLIT(num_rd);
+    SPLIT(num_rd_kb);
+    SPLIT(num_wr);
+    SPLIT(num_wr_kb);
+    SPLIT(num_scrub_errors);
+    SPLIT(num_shallow_scrub_errors);
+    SPLIT(num_deep_scrub_errors);
+    SPLIT(num_objects_recovered);
+    SPLIT(num_bytes_recovered);
+    SPLIT(num_keys_recovered);
+    SPLIT(num_objects_dirty);
+    SPLIT(num_whiteouts);
+    SPLIT(num_objects_omap);
+    SPLIT(num_objects_hit_set_archive);
+#undef SPLIT
+  }
+
   void clear() {
     memset(this, 0, sizeof(*this));
   }
@@ -1290,6 +1330,8 @@ struct pg_stat_t {
   /// true if num_objects_dirty is not accurate (because it was not
   /// maintained starting from pool creation)
   bool dirty_stats_invalid;
+  bool omap_stats_invalid;
+  bool hitset_stats_invalid;
 
   /// up, acting primaries
   int up_primary;
@@ -1305,6 +1347,8 @@ struct pg_stat_t {
       log_size(0), ondisk_log_size(0),
       mapping_epoch(0),
       dirty_stats_invalid(false),
+      omap_stats_invalid(false),
+      hitset_stats_invalid(false),
       up_primary(-1),
       acting_primary(-1)
   { }
@@ -1645,8 +1689,14 @@ struct pg_interval_t {
   epoch_t first, last;
   bool maybe_went_rw;
   int primary;
+  int up_primary;
 
-  pg_interval_t() : first(0), last(0), maybe_went_rw(false), primary(-1) {}
+  pg_interval_t()
+    : first(0), last(0),
+      maybe_went_rw(false),
+      primary(-1),
+      up_primary(-1)
+  {}
 
   void encode(bufferlist& bl) const;
   void decode(bufferlist::iterator& bl);
@@ -1658,10 +1708,12 @@ struct pg_interval_t {
    * if an interval was closed out.
    */
   static bool check_new_interval(
-    int old_primary,                            ///< [in] primary as of lastmap
-    int new_primary,                            ///< [in] primary as of lastmap
+    int old_acting_primary,                     ///< [in] primary as of lastmap
+    int new_acting_primary,                     ///< [in] primary as of osdmap
     const vector<int> &old_acting,              ///< [in] acting as of lastmap
     const vector<int> &new_acting,              ///< [in] acting as of osdmap
+    int old_up_primary,                         ///< [in] up primary of lastmap
+    int new_up_primary,                         ///< [in] up primary of osdmap
     const vector<int> &old_up,                  ///< [in] up as of lastmap
     const vector<int> &new_up,                  ///< [in] up as of osdmap
     epoch_t same_interval_since,                ///< [in] as of osdmap
@@ -2428,6 +2480,9 @@ struct SnapSet {
 
   /// populate SnapSet from a librados::snap_set_t
   void from_snap_set(const librados::snap_set_t& ss);
+
+  /// get space accounted to clone
+  uint64_t get_clone_bytes(snapid_t clone) const;
     
   void encode(bufferlist& bl) const;
   void decode(bufferlist::iterator& bl);
@@ -2553,6 +2608,9 @@ struct object_info_t {
   bool is_dirty() const {
     return test_flag(FLAG_DIRTY);
   }
+  bool is_omap() const {
+    return test_flag(FLAG_OMAP);
+  }
 
   void encode(bufferlist& bl) const;
   void decode(bufferlist::iterator& bl);
@@ -2595,8 +2653,10 @@ struct SnapSetContext {
   int ref;
   bool registered;
   SnapSet snapset;
+  bool exists;
 
-  SnapSetContext(const hobject_t& o) : oid(o), ref(0), registered(false) { }
+  SnapSetContext(const hobject_t& o) :
+    oid(o), ref(0), registered(false), exists(true) { }
 };
 
 
@@ -2659,7 +2719,15 @@ public:
     /// if set, restart backfill when we can get a read lock
     bool backfill_read_marker;
 
-    RWState() : state(RWNONE), count(0), backfill_read_marker(false) {}
+    /// if set, requeue snaptrim on lock release
+    bool snaptrimmer_write_marker;
+
+    RWState()
+      : state(RWNONE),
+	count(0),
+	backfill_read_marker(false),
+	snaptrimmer_write_marker(false)
+    {}
     bool get_read(OpRequestRef op) {
       if (get_read_lock()) {
 	return true;
@@ -2752,6 +2820,14 @@ public:
   bool get_write(OpRequestRef op) {
     return rwstate.get_write(op);
   }
+  bool get_snaptrimmer_write() {
+    if (rwstate.get_write_lock()) {
+      return true;
+    } else {
+      rwstate.snaptrimmer_write_marker = true;
+      return false;
+    }
+  }
   bool get_backfill_read() {
     rwstate.backfill_read_marker = true;
     if (rwstate.get_read_lock()) {
@@ -2768,12 +2844,17 @@ public:
     rwstate.put_read(to_wake);
   }
   void put_write(list<OpRequestRef> *to_wake,
-		 bool *requeue_recovery) {
+		 bool *requeue_recovery,
+		 bool *requeue_snaptrimmer) {
     rwstate.put_write(to_wake);
     if (rwstate.empty() && rwstate.backfill_read_marker) {
       rwstate.backfill_read_marker = false;
       *requeue_recovery = true;
     }
+    if (rwstate.empty() && rwstate.snaptrimmer_write_marker) {
+      rwstate.snaptrimmer_write_marker = false;
+      *requeue_snaptrimmer = true;
+    }
   }
 
   ObjectContext()
diff --git a/src/osdc/ObjectCacher.cc b/src/osdc/ObjectCacher.cc
index 6ecfe82..3e41a5c 100644
--- a/src/osdc/ObjectCacher.cc
+++ b/src/osdc/ObjectCacher.cc
@@ -443,6 +443,7 @@ void ObjectCacher::Object::truncate(loff_t s)
 
     // remove bh entirely
     assert(bh->start() >= s);
+    assert(bh->waitfor_read.empty());
     oc->bh_remove(this, bh);
     delete bh;
   }
@@ -482,6 +483,7 @@ void ObjectCacher::Object::discard(loff_t off, loff_t len)
 
     ++p;
     ldout(oc->cct, 10) << "discard " << *this << " bh " << *bh << dendl;
+    assert(bh->waitfor_read.empty());
     oc->bh_remove(this, bh);
     delete bh;
   }
@@ -1487,6 +1489,18 @@ void ObjectCacher::flusher_entry()
 
 // -------------------------------------------------
 
+bool ObjectCacher::set_is_empty(ObjectSet *oset)
+{
+  assert(lock.is_locked());
+  if (oset->objects.empty())
+    return true;
+
+  for (xlist<Object*>::iterator p = oset->objects.begin(); !p.end(); ++p)
+    if (!(*p)->is_empty())
+      return false;
+
+  return true;
+}
 
 bool ObjectCacher::set_is_cached(ObjectSet *oset)
 {
diff --git a/src/osdc/ObjectCacher.h b/src/osdc/ObjectCacher.h
index 01d18ce..d2aebe9 100644
--- a/src/osdc/ObjectCacher.h
+++ b/src/osdc/ObjectCacher.h
@@ -591,6 +591,7 @@ private:
   bool _flush_set_finish(C_GatherBuilder *gather, Context *onfinish);
 
 public:
+  bool set_is_empty(ObjectSet *oset);
   bool set_is_cached(ObjectSet *oset);
   bool set_is_dirty_or_committing(ObjectSet *oset);
 
diff --git a/src/osdc/Objecter.cc b/src/osdc/Objecter.cc
index f876262..9da65b0 100644
--- a/src/osdc/Objecter.cc
+++ b/src/osdc/Objecter.cc
@@ -45,6 +45,7 @@
 #include "common/config.h"
 #include "common/perf_counters.h"
 #include "include/str_list.h"
+#include "common/errno.h"
 
 
 #define dout_subsys ceph_subsys_objecter
@@ -249,7 +250,7 @@ void Objecter::init_unlocked()
 					   "show in-progress osd requests");
   if (ret < 0) {
     lderr(cct) << "error registering admin socket command: "
-	       << cpp_strerror(-ret) << dendl;
+	       << cpp_strerror(ret) << dendl;
   }
 }
 
@@ -305,7 +306,8 @@ void Objecter::send_linger(LingerOp *info)
   vector<OSDOp> opv = info->ops; // need to pass a copy to ops
   Context *onack = (!info->registered && info->on_reg_ack) ? new C_Linger_Ack(this, info) : NULL;
   Context *oncommit = new C_Linger_Commit(this, info);
-  Op *o = new Op(info->oid, info->oloc, opv, info->flags | CEPH_OSD_FLAG_READ,
+  Op *o = new Op(info->target.base_oid, info->target.base_oloc,
+		 opv, info->target.flags | CEPH_OSD_FLAG_READ,
 		 onack, oncommit,
 		 info->pobjver);
   o->snapid = info->snap;
@@ -391,13 +393,13 @@ ceph_tid_t Objecter::linger_mutate(const object_t& oid, const object_locator_t&
 			      version_t *objver)
 {
   LingerOp *info = new LingerOp;
-  info->oid = oid;
-  info->oloc = oloc;
-  if (info->oloc.key == oid)
-    info->oloc.key.clear();
+  info->target.base_oid = oid;
+  info->target.base_oloc = oloc;
+  if (info->target.base_oloc.key == oid)
+    info->target.base_oloc.key.clear();
   info->snapc = snapc;
   info->mtime = mtime;
-  info->flags = flags | CEPH_OSD_FLAG_WRITE;
+  info->target.flags = flags | CEPH_OSD_FLAG_WRITE;
   info->ops = op.ops;
   info->inbl = inbl;
   info->poutbl = NULL;
@@ -422,12 +424,12 @@ ceph_tid_t Objecter::linger_read(const object_t& oid, const object_locator_t& ol
 			    version_t *objver)
 {
   LingerOp *info = new LingerOp;
-  info->oid = oid;
-  info->oloc = oloc;
-  if (info->oloc.key == oid)
-    info->oloc.key.clear();
+  info->target.base_oid = oid;
+  info->target.base_oloc = oloc;
+  if (info->target.base_oloc.key == oid)
+    info->target.base_oloc.key.clear();
   info->snap = snap;
-  info->flags = flags;
+  info->target.flags = flags;
   info->ops = op.ops;
   info->inbl = inbl;
   info->poutbl = poutbl;
@@ -515,7 +517,7 @@ void Objecter::scan_requests(bool force_resend,
     switch (r) {
     case RECALC_OP_TARGET_NO_ACTION:
       if (!force_resend &&
-	  (!force_resend_writes || !(op->flags & CEPH_OSD_FLAG_WRITE)))
+	  (!force_resend_writes || !(op->target.flags & CEPH_OSD_FLAG_WRITE)))
 	break;
       // -- fall-thru --
     case RECALC_OP_TARGET_NEED_RESEND:
@@ -665,7 +667,7 @@ void Objecter::handle_osd_map(MOSDMap *m)
   for (map<ceph_tid_t, Op*>::iterator p = need_resend.begin(); p != need_resend.end(); ++p) {
     Op *op = p->second;
     if (op->should_resend) {
-      if (op->session && !op->paused) {
+      if (op->session && !op->target.paused) {
 	logger->inc(l_osdc_op_resend);
 	send_op(op);
       }
@@ -750,7 +752,7 @@ void Objecter::check_op_pool_dne(Op *op)
     if (osdmap->get_epoch() >= op->map_dne_bound) {
       // we had a new enough map
       ldout(cct, 10) << "check_op_pool_dne tid " << op->tid
-		     << " concluding pool " << op->pgid.pool() << " dne"
+		     << " concluding pool " << op->target.base_pgid.pool() << " dne"
 		     << dendl;
       if (op->onack) {
 	op->onack->complete(-ENOENT);
@@ -1049,7 +1051,7 @@ void Objecter::kick_requests(OSDSession *session)
     ++p;
     logger->inc(l_osdc_op_resend);
     if (op->should_resend) {
-      if (!op->paused)
+      if (!op->target.paused)
 	resend[op->tid] = op;
     } else {
       cancel_linger_op(op);
@@ -1267,14 +1269,14 @@ ceph_tid_t Objecter::_op_submit(Op *op)
   logger->set(l_osdc_op_active, ops.size());
 
   logger->inc(l_osdc_op);
-  if ((op->flags & (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE)) == (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE))
+  if ((op->target.flags & (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE)) == (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE))
     logger->inc(l_osdc_op_rmw);
-  else if (op->flags & CEPH_OSD_FLAG_WRITE)
+  else if (op->target.flags & CEPH_OSD_FLAG_WRITE)
     logger->inc(l_osdc_op_w);
-  else if (op->flags & CEPH_OSD_FLAG_READ)
+  else if (op->target.flags & CEPH_OSD_FLAG_READ)
     logger->inc(l_osdc_op_r);
 
-  if (op->flags & CEPH_OSD_FLAG_PGOP)
+  if (op->target.flags & CEPH_OSD_FLAG_PGOP)
     logger->inc(l_osdc_op_pg);
 
   for (vector<OSDOp>::iterator p = op->ops.begin(); p != op->ops.end(); ++p) {
@@ -1310,27 +1312,27 @@ ceph_tid_t Objecter::_op_submit(Op *op)
   }
 
   // send?
-  ldout(cct, 10) << "op_submit oid " << op->base_oid
-           << " " << op->base_oloc << " " << op->target_oloc
+  ldout(cct, 10) << "op_submit oid " << op->target.base_oid
+           << " " << op->target.base_oloc << " " << op->target.target_oloc
 	   << " " << op->ops << " tid " << op->tid
            << " osd." << (op->session ? op->session->osd : -1)
            << dendl;
 
-  assert(op->flags & (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE));
+  assert(op->target.flags & (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE));
 
-  if ((op->flags & CEPH_OSD_FLAG_WRITE) &&
+  if ((op->target.flags & CEPH_OSD_FLAG_WRITE) &&
       osdmap->test_flag(CEPH_OSDMAP_PAUSEWR)) {
     ldout(cct, 10) << " paused modify " << op << " tid " << last_tid << dendl;
-    op->paused = true;
+    op->target.paused = true;
     maybe_request_map();
-  } else if ((op->flags & CEPH_OSD_FLAG_READ) &&
+  } else if ((op->target.flags & CEPH_OSD_FLAG_READ) &&
 	     osdmap->test_flag(CEPH_OSDMAP_PAUSERD)) {
     ldout(cct, 10) << " paused read " << op << " tid " << last_tid << dendl;
-    op->paused = true;
+    op->target.paused = true;
     maybe_request_map();
-  } else if ((op->flags & CEPH_OSD_FLAG_WRITE) && osdmap_full_flag()) {
+  } else if ((op->target.flags & CEPH_OSD_FLAG_WRITE) && osdmap_full_flag()) {
     ldout(cct, 0) << " FULL, paused modify " << op << " tid " << last_tid << dendl;
-    op->paused = true;
+    op->target.paused = true;
     maybe_request_map();
   } else if (op->session) {
     send_op(op);
@@ -1391,13 +1393,13 @@ bool Objecter::is_pg_changed(
   return false;      // same primary (tho replicas may have changed)
 }
 
-bool Objecter::op_should_be_paused(Op *op)
+bool Objecter::target_should_be_paused(op_target_t *t)
 {
   bool pauserd = osdmap->test_flag(CEPH_OSDMAP_PAUSERD);
   bool pausewr = osdmap->test_flag(CEPH_OSDMAP_PAUSEWR) || osdmap_full_flag();
 
-  return (op->flags & CEPH_OSD_FLAG_READ && pauserd) ||
-         (op->flags & CEPH_OSD_FLAG_WRITE && pausewr);
+  return (t->flags & CEPH_OSD_FLAG_READ && pauserd) ||
+         (t->flags & CEPH_OSD_FLAG_WRITE && pausewr);
 }
 
 
@@ -1430,42 +1432,42 @@ int64_t Objecter::get_object_pg_hash_position(int64_t pool, const string& key,
   return p->raw_hash_to_pg(p->hash_key(key, ns));
 }
 
-int Objecter::recalc_op_target(Op *op)
+int Objecter::calc_target(op_target_t *t)
 {
-  bool is_read = op->flags & CEPH_OSD_FLAG_READ;
-  bool is_write = op->flags & CEPH_OSD_FLAG_WRITE;
+  bool is_read = t->flags & CEPH_OSD_FLAG_READ;
+  bool is_write = t->flags & CEPH_OSD_FLAG_WRITE;
 
   bool need_check_tiering = false;
-  if (op->target_oid.name.empty()) {
-    op->target_oid = op->base_oid;
+  if (t->target_oid.name.empty()) {
+    t->target_oid = t->base_oid;
     need_check_tiering = true;
   }
-  if (op->target_oloc.empty()) {
-    op->target_oloc = op->base_oloc;
+  if (t->target_oloc.empty()) {
+    t->target_oloc = t->base_oloc;
     need_check_tiering = true;
   }
   
   if (need_check_tiering &&
-      (op->flags & CEPH_OSD_FLAG_IGNORE_OVERLAY) == 0) {
-    const pg_pool_t *pi = osdmap->get_pg_pool(op->base_oloc.pool);
+      (t->flags & CEPH_OSD_FLAG_IGNORE_OVERLAY) == 0) {
+    const pg_pool_t *pi = osdmap->get_pg_pool(t->base_oloc.pool);
     if (pi) {
       if (is_read && pi->has_read_tier())
-	op->target_oloc.pool = pi->read_tier;
+	t->target_oloc.pool = pi->read_tier;
       if (is_write && pi->has_write_tier())
-	op->target_oloc.pool = pi->write_tier;
+	t->target_oloc.pool = pi->write_tier;
     }
   }
 
   pg_t pgid;
-  if (op->precalc_pgid) {
-    assert(op->base_oid.name.empty()); // make sure this is a listing op
-    ldout(cct, 10) << "recalc_op_target have " << op->base_pgid << " pool "
-		   << osdmap->have_pg_pool(op->base_pgid.pool()) << dendl;
-    if (!osdmap->have_pg_pool(op->base_pgid.pool()))
+  if (t->precalc_pgid) {
+    assert(t->base_oid.name.empty()); // make sure this is a listing op
+    ldout(cct, 10) << __func__ << " have " << t->base_pgid << " pool "
+		   << osdmap->have_pg_pool(t->base_pgid.pool()) << dendl;
+    if (!osdmap->have_pg_pool(t->base_pgid.pool()))
       return RECALC_OP_TARGET_POOL_DNE;
-    pgid = osdmap->raw_pg_to_pg(op->base_pgid);
+    pgid = osdmap->raw_pg_to_pg(t->base_pgid);
   } else {
-    int ret = osdmap->object_locator_to_pg(op->target_oid, op->target_oloc,
+    int ret = osdmap->object_locator_to_pg(t->target_oid, t->target_oloc,
 					   pgid);
     if (ret == -ENOENT)
       return RECALC_OP_TARGET_POOL_DNE;
@@ -1476,33 +1478,32 @@ int Objecter::recalc_op_target(Op *op)
 
   bool need_resend = false;
 
-  bool paused = op_should_be_paused(op);
-  if (!paused && paused != op->paused) {
-    op->paused = false;
+  bool paused = target_should_be_paused(t);
+  if (!paused && paused != t->paused) {
+    t->paused = false;
     need_resend = true;
   }
 
-  if (op->pgid != pgid ||
-      is_pg_changed(
-	op->primary, op->acting, primary, acting, op->used_replica)) {
-    op->pgid = pgid;
-    op->acting = acting;
-    op->primary = primary;
-    ldout(cct, 10) << "recalc_op_target tid " << op->tid
-	     << " pgid " << pgid << " acting " << acting << dendl;
-
-    OSDSession *s = NULL;
-    op->used_replica = false;
-    if (primary != -1) {
+  if (t->pgid != pgid ||
+      is_pg_changed(t->primary, t->acting, primary, acting, t->used_replica)) {
+    t->pgid = pgid;
+    t->acting = acting;
+    t->primary = primary;
+    ldout(cct, 10) << __func__ << " pgid " << pgid
+		   << " acting " << acting << dendl;
+    t->used_replica = false;
+    if (primary == -1) {
+      t->osd = -1;
+    } else {
       int osd;
       bool read = is_read && !is_write;
-      if (read && (op->flags & CEPH_OSD_FLAG_BALANCE_READS)) {
+      if (read && (t->flags & CEPH_OSD_FLAG_BALANCE_READS)) {
 	int p = rand() % acting.size();
 	if (p)
-	  op->used_replica = true;
+	  t->used_replica = true;
 	osd = acting[p];
 	ldout(cct, 10) << " chose random osd." << osd << " of " << acting << dendl;
-      } else if (read && (op->flags & CEPH_OSD_FLAG_LOCALIZE_READS) &&
+      } else if (read && (t->flags & CEPH_OSD_FLAG_LOCALIZE_READS) &&
 		 acting.size() > 1) {
 	// look for a local replica.  prefer the primary if the
 	// distance is the same.
@@ -1521,7 +1522,7 @@ int Objecter::recalc_op_target(Op *op)
 	    best = i;
 	    best_locality = locality;
 	    if (i)
-	      op->used_replica = true;
+	      t->used_replica = true;
 	  }
 	}
 	assert(best >= 0);
@@ -1529,9 +1530,23 @@ int Objecter::recalc_op_target(Op *op)
       } else {
 	osd = primary;
       }
-      s = get_session(osd);
+      t->osd = osd;
     }
+    need_resend = true;
+  }
+  if (need_resend) {
+    return RECALC_OP_TARGET_NEED_RESEND;
+  }
+  return RECALC_OP_TARGET_NO_ACTION;
+}
 
+int Objecter::recalc_op_target(Op *op)
+{
+  int r = calc_target(&op->target);
+  if (r == RECALC_OP_TARGET_NEED_RESEND) {
+    OSDSession *s = NULL;
+    if (op->target.osd >= 0)
+      s = get_session(op->target.osd);
     if (op->session != s) {
       if (!op->session)
 	num_homeless_ops--;
@@ -1542,44 +1557,28 @@ int Objecter::recalc_op_target(Op *op)
       else
 	num_homeless_ops++;
     }
-    need_resend = true;
-  }
-  if (need_resend) {
-    return RECALC_OP_TARGET_NEED_RESEND;
   }
-  return RECALC_OP_TARGET_NO_ACTION;
+  return r;
 }
 
 bool Objecter::recalc_linger_op_target(LingerOp *linger_op)
 {
-  int primary;
-  vector<int> acting;
-  pg_t pgid;
-  int ret = osdmap->object_locator_to_pg(linger_op->oid, linger_op->oloc, pgid);
-  if (ret == -ENOENT) {
-    return RECALC_OP_TARGET_POOL_DNE;
-  }
-  osdmap->pg_to_acting_osds(pgid, &acting, &primary);
-
-  if (pgid != linger_op->pgid ||
-      is_pg_changed(
-        linger_op->primary, linger_op->acting, primary, acting, true)) {
-    linger_op->pgid = pgid;
-    linger_op->acting = acting;
-    linger_op->primary = primary;
+  int r = calc_target(&linger_op->target);
+  if (r == RECALC_OP_TARGET_NEED_RESEND) {
     ldout(cct, 10) << "recalc_linger_op_target tid " << linger_op->linger_id
-	     << " pgid " << pgid << " acting " << acting << dendl;
+		   << " pgid " << linger_op->target.pgid
+		   << " acting " << linger_op->target.acting << dendl;
     
-    OSDSession *s = primary != -1 ? get_session(primary) : NULL;
+    OSDSession *s = linger_op->target.osd != -1 ?
+      get_session(linger_op->target.osd) : NULL;
     if (linger_op->session != s) {
       linger_op->session_item.remove_myself();
       linger_op->session = s;
       if (s)
 	s->linger_ops.push_back(&linger_op->session_item);
     }
-    return RECALC_OP_TARGET_NEED_RESEND;
   }
-  return RECALC_OP_TARGET_NO_ACTION;
+  return r;
 }
 
 void Objecter::cancel_linger_op(Op *op)
@@ -1615,7 +1614,7 @@ void Objecter::send_op(Op *op)
 {
   ldout(cct, 15) << "send_op " << op->tid << " to osd." << op->session->osd << dendl;
 
-  int flags = op->flags;
+  int flags = op->target.flags;
   if (op->oncommit)
     flags |= CEPH_OSD_FLAG_ONDISK;
   if (op->onack)
@@ -1634,12 +1633,13 @@ void Objecter::send_op(Op *op)
     op->con->post_rx_buffer(op->tid, *op->outbl);
   }
 
-  op->paused = false;
+  op->target.paused = false;
   op->incarnation = op->session->incarnation;
   op->stamp = ceph_clock_now(cct);
 
   MOSDOp *m = new MOSDOp(client_inc, op->tid, 
-			 op->target_oid, op->target_oloc, op->pgid,
+			 op->target.target_oid, op->target.target_oloc,
+			 op->target.pgid,
 			 osdmap->get_epoch(),
 			 flags);
 
@@ -1759,7 +1759,8 @@ void Objecter::handle_osd_op_reply(MOSDOpReply *m)
   if (m->is_redirect_reply()) {
     ldout(cct, 5) << " got redirect reply; redirecting" << dendl;
     unregister_op(op);
-    m->get_redirect().combine_with_locator(op->target_oloc, op->target_oid.name);
+    m->get_redirect().combine_with_locator(op->target.target_oloc,
+					   op->target.target_oid.name);
     _op_submit(op);
     m->put();
     return;
@@ -2519,13 +2520,28 @@ void Objecter::ms_handle_remote_reset(Connection *con)
 }
 
 
+void Objecter::op_target_t::dump(Formatter *f) const
+{
+  f->dump_stream("pg") << pgid;
+  f->dump_int("osd", osd);
+  f->dump_stream("object_id") << base_oid;
+  f->dump_stream("object_locator") << base_oloc;
+  f->dump_stream("target_object_id") << target_oid;
+  f->dump_stream("target_object_locator") << target_oloc;
+  f->dump_int("paused", (int)paused);
+  f->dump_int("used_replica", (int)used_replica);
+  f->dump_int("precalc_pgid", (int)precalc_pgid);
+}
+
 void Objecter::dump_active()
 {
   ldout(cct, 20) << "dump_active .. " << num_homeless_ops << " homeless" << dendl;
   for (map<ceph_tid_t,Op*>::iterator p = ops.begin(); p != ops.end(); ++p) {
     Op *op = p->second;
-    ldout(cct, 20) << op->tid << "\t" << op->pgid << "\tosd." << (op->session ? op->session->osd : -1)
-	    << "\t" << op->base_oid << "\t" << op->ops << dendl;
+    ldout(cct, 20) << op->tid << "\t" << op->target.pgid
+		   << "\tosd." << (op->session ? op->session->osd : -1)
+		   << "\t" << op->target.base_oid
+		   << "\t" << op->ops << dendl;
   }
 }
 
@@ -2552,13 +2568,9 @@ void Objecter::dump_ops(Formatter *fmt) const
     Op *op = p->second;
     fmt->open_object_section("op");
     fmt->dump_unsigned("tid", op->tid);
-    fmt->dump_stream("pg") << op->pgid;
-    fmt->dump_int("osd", op->session ? op->session->osd : -1);
+    op->target.dump(fmt);
     fmt->dump_stream("last_sent") << op->stamp;
     fmt->dump_int("attempts", op->attempts);
-    fmt->dump_stream("object_id") << op->base_oid;
-    fmt->dump_stream("object_locator") << op->base_oloc;
-    fmt->dump_stream("target_object_locator") << op->target_oloc;
     fmt->dump_stream("snapid") << op->snapid;
     fmt->dump_stream("snap_context") << op->snapc;
     fmt->dump_stream("mtime") << op->mtime;
@@ -2585,10 +2597,7 @@ void Objecter::dump_linger_ops(Formatter *fmt) const
     LingerOp *op = p->second;
     fmt->open_object_section("linger_op");
     fmt->dump_unsigned("linger_id", op->linger_id);
-    fmt->dump_stream("pg") << op->pgid;
-    fmt->dump_int("osd", op->session ? op->session->osd : -1);
-    fmt->dump_stream("object_id") << op->oid;
-    fmt->dump_stream("object_locator") << op->oloc;
+    op->target.dump(fmt);
     fmt->dump_stream("snapid") << op->snap;
     fmt->dump_stream("registered") << op->registered;
     fmt->close_section(); // linger_op object
diff --git a/src/osdc/Objecter.h b/src/osdc/Objecter.h
index d18e653..1e6fcf3 100644
--- a/src/osdc/Objecter.h
+++ b/src/osdc/Objecter.h
@@ -1061,11 +1061,8 @@ public:
 
   struct OSDSession;
 
-  struct Op {
-    OSDSession *session;
-    xlist<Op*>::item session_item;
-    int incarnation;
-    
+  struct op_target_t {
+    int flags;
     object_t base_oid;
     object_locator_t base_oloc;
     object_t target_oid;
@@ -1077,7 +1074,32 @@ public:
     pg_t pgid;           ///< last pg we mapped to
     vector<int> acting;  ///< acting for last pg we mapped to
     int primary;         ///< primary for last pg we mapped to
+
     bool used_replica;
+    bool paused;
+
+    int osd;      ///< the final target osd, or -1
+
+    op_target_t(object_t oid, object_locator_t oloc, int flags)
+      : flags(flags),
+	base_oid(oid),
+	base_oloc(oloc),
+	precalc_pgid(false),
+	primary(-1),
+	used_replica(false),
+	paused(false),
+	osd(-1)
+    {}
+
+    void dump(Formatter *f) const;
+  };
+
+  struct Op {
+    OSDSession *session;
+    xlist<Op*>::item session_item;
+    int incarnation;
+
+    op_target_t target;
 
     ConnectionRef con;  // for rx buffer only
 
@@ -1092,15 +1114,13 @@ public:
     vector<Context*> out_handler;
     vector<int*> out_rval;
 
-    int flags, priority;
+    int priority;
     Context *onack, *oncommit, *ontimeout;
 
     ceph_tid_t tid;
     eversion_t replay_version;        // for op replay
     int attempts;
 
-    bool paused;
-
     version_t *objver;
     epoch_t *reply_epoch;
 
@@ -1116,16 +1136,14 @@ public:
     Op(const object_t& o, const object_locator_t& ol, vector<OSDOp>& op,
        int f, Context *ac, Context *co, version_t *ov) :
       session(NULL), session_item(this), incarnation(0),
-      base_oid(o), base_oloc(ol),
-      precalc_pgid(false),
-      primary(-1),
-      used_replica(false), con(NULL),
+      target(o, ol, f),
+      con(NULL),
       snapid(CEPH_NOSNAP),
       outbl(NULL),
-      flags(f), priority(0), onack(ac), oncommit(co),
+      priority(0), onack(ac), oncommit(co),
       ontimeout(NULL),
       tid(0), attempts(0),
-      paused(false), objver(ov), reply_epoch(NULL),
+      objver(ov), reply_epoch(NULL),
       map_dne_bound(0),
       budgeted(false),
       should_resend(true) {
@@ -1141,8 +1159,8 @@ public:
 	out_rval[i] = NULL;
       }
 
-      if (base_oloc.key == o)
-	base_oloc.key.clear();
+      if (target.base_oloc.key == o)
+	target.base_oloc.key.clear();
     }
     ~Op() {
       while (!out_handler.empty()) {
@@ -1334,18 +1352,13 @@ public:
 
   struct LingerOp : public RefCountedObject {
     uint64_t linger_id;
-    object_t oid;
-    object_locator_t oloc;
 
-    pg_t pgid;
-    vector<int> acting;
-    int primary;
+    op_target_t target;
 
     snapid_t snap;
     SnapContext snapc;
     utime_t mtime;
 
-    int flags;
     vector<OSDOp> ops;
     bufferlist inbl;
     bufferlist *poutbl;
@@ -1360,8 +1373,9 @@ public:
     ceph_tid_t register_tid;
     epoch_t map_dne_bound;
 
-    LingerOp() : linger_id(0), primary(-1),
-		 snap(CEPH_NOSNAP), flags(0),
+    LingerOp() : linger_id(0),
+		 target(object_t(), object_locator_t(), 0),
+		 snap(CEPH_NOSNAP),
 		 poutbl(NULL), pobjver(NULL),
 		 registered(false),
 		 on_reg_ack(NULL), on_reg_commit(NULL),
@@ -1464,7 +1478,9 @@ public:
     RECALC_OP_TARGET_OSD_DOWN,
   };
   bool osdmap_full_flag() const;
-  bool op_should_be_paused(Op *op);
+  bool target_should_be_paused(op_target_t *op);
+
+  int calc_target(op_target_t *t);
   int recalc_op_target(Op *op);
   bool recalc_linger_op_target(LingerOp *op);
 
@@ -1695,8 +1711,8 @@ public:
     Op *o = new Op(object_t(), oloc,
 		   op.ops, flags | global_op_flags | CEPH_OSD_FLAG_READ,
 		   onack, NULL, NULL);
-    o->precalc_pgid = true;
-    o->base_pgid = pg_t(hash, oloc.pool);
+    o->target.precalc_pgid = true;
+    o->target.base_pgid = pg_t(hash, oloc.pool);
     o->priority = op.priority;
     o->snapid = CEPH_NOSNAP;
     o->outbl = pbl;
diff --git a/src/pybind/rados.py b/src/pybind/rados.py
index 098e200..e5da077 100644
--- a/src/pybind/rados.py
+++ b/src/pybind/rados.py
@@ -383,7 +383,7 @@ Rados object in state %s." % (self.state))
       Ping a monitor to assess liveness
 
       May be used as a simply way to assess liveness, or to obtain
-      informations about the monitor in a simple way even in the
+      information about the monitor in a simple way even in the
       absence of quorum.
 
       :param mon_id: the ID portion of the monitor's name (i.e., mon.<ID>)
@@ -1205,29 +1205,26 @@ class Ioctx(object):
         :type offset: int
 
         :raises: :class:`TypeError`
-        :raises: :class:`IncompleteWriteError`
         :raises: :class:`LogicError`
         :returns: int - number of bytes written 
         """
         self.require_ioctx_open()
+        if not isinstance(key, str):
+            raise TypeError('key must be a string')
         if not isinstance(data, str):
             raise TypeError('data must be a string')
         length = len(data)
         ret = run_in_thread(self.librados.rados_write,
                             (self.io, c_char_p(key), c_char_p(data),
                             c_size_t(length), c_uint64(offset)))
-        if ret == length:
+        if ret == 0:
             return ret
         elif ret < 0:
             raise make_ex(ret, "Ioctx.write(%s): failed to write %s" % \
                 (self.name, key))
-        elif ret < length:
-            raise IncompleteWriteError("Wrote only %d out of %d bytes" % \
-                (ret, length))
         else:
             raise LogicError("Ioctx.write(%s): rados_write \
-returned %d, but %d was the maximum number of bytes it could have \
-written." % (self.name, ret, length))
+returned %d, but should return zero on success." % (self.name, ret))
 
     def write_full(self, key, data):
         """
@@ -1256,9 +1253,43 @@ written." % (self.name, ret, length))
                             c_size_t(length)))
         if ret == 0:
             return ret
+        elif ret < 0:
+            raise make_ex(ret, "Ioctx.write_full(%s): failed to write %s" % \
+                (self.name, key))
         else:
-            raise make_ex(ret, "Ioctx.write(%s): failed to write_full %s" % \
+            raise LogicError("Ioctx.write_full(%s): rados_write_full \
+returned %d, but should return zero on success." % (self.name, ret))
+
+    def append(self, key, data):
+        """
+        Append data to an object synchronously
+
+        :param key: name of the object
+        :type key: str
+        :param data: data to write
+        :type data: str
+
+        :raises: :class:`TypeError`
+        :raises: :class:`LogicError`
+        :returns: int - number of bytes written
+        """
+        self.require_ioctx_open()
+        if not isinstance(key, str):
+            raise TypeError('key must be a string')
+        if not isinstance(data, str):
+            raise TypeError('data must be a string')
+        length = len(data)
+        ret = run_in_thread(self.librados.rados_append,
+                            (self.io, c_char_p(key), c_char_p(data),
+                            c_size_t(length)))
+        if ret == 0:
+            return ret
+        elif ret < 0:
+            raise make_ex(ret, "Ioctx.append(%s): failed to append %s" % \
                 (self.name, key))
+        else:
+            raise LogicError("Ioctx.append(%s): rados_append \
+returned %d, but should return zero on success." % (self.name, ret))
 
     def read(self, key, length=8192, offset=0):
         """
diff --git a/src/rgw/rgw_admin.cc b/src/rgw/rgw_admin.cc
index d1b7e96..ec83878 100644
--- a/src/rgw/rgw_admin.cc
+++ b/src/rgw/rgw_admin.cc
@@ -45,7 +45,7 @@ void _usage()
   cerr << "  user info                  get user info\n";
   cerr << "  user rm                    remove user\n";
   cerr << "  user suspend               suspend a user\n";
-  cerr << "  user enable                reenable user after suspension\n";
+  cerr << "  user enable                re-enable user after suspension\n";
   cerr << "  user check                 check user info\n";
   cerr << "  user stats                 show user stats as accounted by quota subsystem\n";
   cerr << "  caps add                   add user capabilities\n";
diff --git a/src/rgw/rgw_user.cc b/src/rgw/rgw_user.cc
index f620f9b..23575d8 100644
--- a/src/rgw/rgw_user.cc
+++ b/src/rgw/rgw_user.cc
@@ -740,8 +740,16 @@ int RGWAccessKeyPool::check_op(RGWUserAdminOpState& op_state,
   std::string access_key = op_state.get_access_key();
   std::string secret_key = op_state.get_secret_key();
 
+  int32_t key_type = op_state.get_key_type();
+
+  // if a key type wasn't specified set it to s3
+  if (key_type < 0)
+    key_type = KEY_TYPE_S3;
+
+  op_state.set_key_type(key_type);
+
   /* see if the access key or secret key was specified */
-  if (!op_state.will_gen_access() && access_key.empty()) {
+  if (key_type == KEY_TYPE_S3 && !op_state.will_gen_access() && access_key.empty()) {
     set_err_msg(err_msg, "empty access key");
     return -EINVAL;
   }
@@ -750,10 +758,6 @@ int RGWAccessKeyPool::check_op(RGWUserAdminOpState& op_state,
 
   check_existing_key(op_state);
 
-  // if a key type wasn't specified set it to s3
-  if (op_state.get_key_type() < 0)
-    op_state.set_key_type(KEY_TYPE_S3);
-
   return 0;
 }
 
@@ -874,7 +878,7 @@ int RGWAccessKeyPool::generate_key(RGWUserAdminOpState& op_state, std::string *e
 // modify an existing key
 int RGWAccessKeyPool::modify_key(RGWUserAdminOpState& op_state, std::string *err_msg)
 {
-  std::string id = op_state.get_access_key();
+  std::string id;
   std::string key = op_state.get_secret_key();
   int key_type = op_state.get_key_type();
 
@@ -883,8 +887,23 @@ int RGWAccessKeyPool::modify_key(RGWUserAdminOpState& op_state, std::string *err
   pair<string, RGWAccessKey> key_pair;
   map<std::string, RGWAccessKey>::iterator kiter;
 
-  if (id.empty()) {
-    set_err_msg(err_msg, "no access key specified");
+  switch (key_type) {
+  case KEY_TYPE_S3:
+    id = op_state.get_access_key();
+    if (id.empty()) {
+      set_err_msg(err_msg, "no access key specified");
+      return -EINVAL;
+    }
+    break;
+  case KEY_TYPE_SWIFT:
+    id = op_state.build_default_swift_kid();
+    if (id.empty()) {
+      set_err_msg(err_msg, "no subuser specified");
+      return -EINVAL;
+    }
+    break;
+  default:
+    set_err_msg(err_msg, "invalid key type");
     return -EINVAL;
   }
 
@@ -896,14 +915,13 @@ int RGWAccessKeyPool::modify_key(RGWUserAdminOpState& op_state, std::string *err
   key_pair.first = id;
 
   if (key_type == KEY_TYPE_SWIFT) {
-    kiter = swift_keys->find(id);
-    modify_key = kiter->second;
+    modify_key.id = id;
+    modify_key.subuser = op_state.get_subuser();
   } else if (key_type == KEY_TYPE_S3) {
     kiter = access_keys->find(id);
-    modify_key = kiter->second;
-  } else {
-    set_err_msg(err_msg, "invalid key type");
-    return -EINVAL;
+    if (kiter != access_keys->end()) {
+      modify_key = kiter->second;
+    }
   }
 
   if (op_state.will_gen_secret()) {
@@ -961,8 +979,10 @@ int RGWAccessKeyPool::execute_add(RGWUserAdminOpState& op_state,
     break;
   }
 
-  if (ret < 0)
+  if (ret < 0) {
+    set_err_msg(err_msg, subprocess_msg);
     return ret;
+  }
 
   // store the updated info
   if (!defer_user_update)
@@ -1163,12 +1183,6 @@ int RGWSubUserPool::execute_add(RGWUserAdminOpState& op_state,
 
   subuser_pair.first = subuser_str;
 
-  // no duplicates
-  if (op_state.has_existing_subuser()) {
-    set_err_msg(err_msg, "subuser exists");
-    return -EEXIST;
-  }
-
   // assumes key should be created
   if (op_state.has_key_op()) {
     ret = user->keys.add(op_state, &subprocess_msg, true);
@@ -1214,6 +1228,10 @@ int RGWSubUserPool::add(RGWUserAdminOpState& op_state, std::string *err_msg, boo
     return ret;
   }
 
+  if (op_state.get_secret_key().empty()) {
+    op_state.set_gen_access();
+  }
+
   ret = execute_add(op_state, &subprocess_msg, defer_user_update);
   if (ret < 0) {
     set_err_msg(err_msg, "unable to create subuser, " + subprocess_msg);
diff --git a/src/rgw/rgw_user.h b/src/rgw/rgw_user.h
index 3cd08d3..af99ed5 100644
--- a/src/rgw/rgw_user.h
+++ b/src/rgw/rgw_user.h
@@ -233,7 +233,6 @@ struct RGWUserAdminOpState {
     }
 
     subuser_specified = true;
-    gen_access = true;
   }
   void set_caps(std::string& _caps) {
     if (_caps.empty())
diff --git a/src/test/admin_socket.cc b/src/test/admin_socket.cc
index 8d58dfd..ab52cec 100644
--- a/src/test/admin_socket.cc
+++ b/src/test/admin_socket.cc
@@ -282,6 +282,7 @@ TEST(AdminSocket, bind_and_listen) {
     int fd = 0;
     string message;
     message = asoct.bind_and_listen(path, &fd);
+    std::cout << "message: " << message << std::endl;
     EXPECT_NE(std::string::npos, message.find("File exists"));
     ASSERT_TRUE(asoct.shutdown());
   }
diff --git a/src/test/cli/monmaptool/print-nonexistent.t b/src/test/cli/monmaptool/print-nonexistent.t
index 7c5d746..ae366c1 100644
--- a/src/test/cli/monmaptool/print-nonexistent.t
+++ b/src/test/cli/monmaptool/print-nonexistent.t
@@ -1,4 +1,4 @@
   $ monmaptool --print nonexistent
   monmaptool: monmap file nonexistent
-  monmaptool: couldn't open nonexistent: No such file or directory
+  monmaptool: couldn't open nonexistent: (2) No such file or directory
   [255]
diff --git a/src/test/cli/radosgw-admin/help.t b/src/test/cli/radosgw-admin/help.t
index ca3e747..a46d240 100644
--- a/src/test/cli/radosgw-admin/help.t
+++ b/src/test/cli/radosgw-admin/help.t
@@ -6,7 +6,7 @@
     user info                  get user info
     user rm                    remove user
     user suspend               suspend a user
-    user enable                reenable user after suspension
+    user enable                re-enable user after suspension
     user check                 check user info
     user stats                 show user stats as accounted by quota subsystem
     caps add                   add user capabilities
diff --git a/src/test/cls_rbd/test_cls_rbd.cc b/src/test/cls_rbd/test_cls_rbd.cc
index efa44fc..48cfb33 100644
--- a/src/test/cls_rbd/test_cls_rbd.cc
+++ b/src/test/cls_rbd/test_cls_rbd.cc
@@ -869,7 +869,7 @@ TEST(cls_rbd, snapid_race)
   bl.append(bp);
 
   string oid = "foo";
-  ASSERT_EQ(4096, ioctx.write(oid, bl, 4096, 0));
+  ASSERT_EQ(0, ioctx.write(oid, bl, 4096, 0));
   ASSERT_EQ(0, old_snapshot_add(&ioctx, oid, 1, "test1"));
   ASSERT_EQ(0, old_snapshot_add(&ioctx, oid, 3, "test3"));
   ASSERT_EQ(-ESTALE, old_snapshot_add(&ioctx, oid, 2, "test2"));
diff --git a/src/test/librados/TestCase.cc b/src/test/librados/TestCase.cc
index d9e5e49..9f68af1 100644
--- a/src/test/librados/TestCase.cc
+++ b/src/test/librados/TestCase.cc
@@ -94,6 +94,87 @@ void RadosTestPP::cleanup_default_namespace(librados::IoCtx ioctx)
   }
 }
 
+std::string RadosTestParamPP::pool_name;
+std::string RadosTestParamPP::cache_pool_name;
+Rados RadosTestParamPP::s_cluster;
+
+void RadosTestParamPP::SetUpTestCase()
+{
+  pool_name = get_temp_pool_name();
+  ASSERT_EQ("", create_one_pool_pp(pool_name, s_cluster));
+}
+
+void RadosTestParamPP::TearDownTestCase()
+{
+  if (cache_pool_name.length()) {
+    // tear down tiers
+    bufferlist inbl;
+    ASSERT_EQ(0, s_cluster.mon_command(
+      "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
+      "\"}",
+      inbl, NULL, NULL));
+    ASSERT_EQ(0, s_cluster.mon_command(
+      "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
+      "\", \"tierpool\": \"" + cache_pool_name + "\"}",
+      inbl, NULL, NULL));
+    ASSERT_EQ(0, s_cluster.mon_command(
+      "{\"prefix\": \"osd pool delete\", \"pool\": \"" + cache_pool_name +
+      "\", \"pool2\": \"" + cache_pool_name + "\", \"sure\": \"--yes-i-really-really-mean-it\"}",
+      inbl, NULL, NULL));
+    cache_pool_name = "";
+  }
+  ASSERT_EQ(0, destroy_one_pool_pp(pool_name, s_cluster));
+}
+
+void RadosTestParamPP::SetUp()
+{
+  if (strcmp(GetParam(), "cache") == 0 && cache_pool_name.empty()) {
+    cache_pool_name = get_temp_pool_name();
+    bufferlist inbl;
+    ASSERT_EQ(0, cluster.mon_command(
+      "{\"prefix\": \"osd pool create\", \"pool\": \"" + cache_pool_name +
+      "\", \"pg_num\": 4}",
+      inbl, NULL, NULL));
+    ASSERT_EQ(0, cluster.mon_command(
+      "{\"prefix\": \"osd tier add\", \"pool\": \"" + pool_name +
+      "\", \"tierpool\": \"" + cache_pool_name +
+      "\", \"force_nonempty\": \"--force-nonempty\" }",
+      inbl, NULL, NULL));
+    ASSERT_EQ(0, cluster.mon_command(
+      "{\"prefix\": \"osd tier set-overlay\", \"pool\": \"" + pool_name +
+      "\", \"overlaypool\": \"" + cache_pool_name + "\"}",
+      inbl, NULL, NULL));
+    ASSERT_EQ(0, cluster.mon_command(
+      "{\"prefix\": \"osd tier cache-mode\", \"pool\": \"" + cache_pool_name +
+      "\", \"mode\": \"writeback\"}",
+      inbl, NULL, NULL));
+    cluster.wait_for_latest_osdmap();
+  }
+
+  ASSERT_EQ(0, cluster.ioctx_create(pool_name.c_str(), ioctx));
+  ns = get_temp_pool_name();
+  ioctx.set_namespace(ns);
+  ASSERT_FALSE(ioctx.pool_requires_alignment());
+}
+
+void RadosTestParamPP::TearDown()
+{
+  cleanup_default_namespace(ioctx);
+  ioctx.close();
+}
+
+void RadosTestParamPP::cleanup_default_namespace(librados::IoCtx ioctx)
+{
+  // remove all objects from the default namespace to avoid polluting
+  // other tests
+  ioctx.set_namespace("");
+  for (ObjectIterator it = ioctx.objects_begin();
+       it != ioctx.objects_end(); ++it) {
+    ioctx.locator_set_key(it->second);
+    ASSERT_EQ(0, ioctx.remove(it->first));
+  }
+}
+
 std::string RadosTestEC::pool_name;
 rados_t RadosTestEC::s_cluster = NULL;
 
diff --git a/src/test/librados/TestCase.h b/src/test/librados/TestCase.h
index ccc0359..5bd084f 100644
--- a/src/test/librados/TestCase.h
+++ b/src/test/librados/TestCase.h
@@ -53,6 +53,25 @@ protected:
   std::string ns;
 };
 
+class RadosTestParamPP : public ::testing::TestWithParam<const char*> {
+public:
+  RadosTestParamPP() : cluster(s_cluster) {}
+  virtual ~RadosTestParamPP() {}
+  static void SetUpTestCase();
+  static void TearDownTestCase();
+protected:
+  static void cleanup_default_namespace(librados::IoCtx ioctx);
+  static librados::Rados s_cluster;
+  static std::string pool_name;
+  static std::string cache_pool_name;
+
+  virtual void SetUp();
+  virtual void TearDown();
+  librados::Rados &cluster;
+  librados::IoCtx ioctx;
+  std::string ns;
+};
+
 class RadosTestEC : public ::testing::Test {
 public:
   RadosTestEC() {}
diff --git a/src/test/librados/aio.cc b/src/test/librados/aio.cc
index bb83f3e..218da3b 100644
--- a/src/test/librados/aio.cc
+++ b/src/test/librados/aio.cc
@@ -147,6 +147,20 @@ void set_completion_safe(rados_completion_t cb, void *arg)
   sem_post(&test->m_sem);
 }
 
+void set_completion_completePP(rados_completion_t cb, void *arg)
+{
+  AioTestDataPP *test = static_cast<AioTestDataPP*>(arg);
+  test->m_complete = true;
+  sem_post(&test->m_sem);
+}
+
+void set_completion_safePP(rados_completion_t cb, void *arg)
+{
+  AioTestDataPP *test = static_cast<AioTestDataPP*>(arg);
+  test->m_safe = true;
+  sem_post(&test->m_sem);
+}
+
 TEST(LibRadosAio, SimpleWrite) {
   AioTestData test_data;
   rados_completion_t my_completion;
@@ -162,6 +176,7 @@ TEST(LibRadosAio, SimpleWrite) {
     sem_wait(&test_data.m_sem);
     sem_wait(&test_data.m_sem);
   }
+  ASSERT_EQ(0, rados_aio_get_return_value(my_completion));
 
   rados_ioctx_set_namespace(test_data.m_ioctx, "nspace");
   ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
@@ -173,6 +188,7 @@ TEST(LibRadosAio, SimpleWrite) {
     sem_wait(&test_data.m_sem);
     sem_wait(&test_data.m_sem);
   }
+  ASSERT_EQ(0, rados_aio_get_return_value(my_completion));
   rados_aio_release(my_completion);
 }
 
@@ -185,7 +201,7 @@ TEST(LibRadosAio, SimpleWritePP) {
   AioTestDataPP test_data;
   ASSERT_EQ("", test_data.init());
   AioCompletion *my_completion = test_data.m_cluster.aio_create_completion(
-	  (void*)&test_data, set_completion_complete, set_completion_safe);
+	  (void*)&test_data, set_completion_completePP, set_completion_safePP);
   AioCompletion *my_completion_null = NULL;
   ASSERT_NE(my_completion, my_completion_null);
   ASSERT_EQ(0, test_data.m_ioctx.aio_write("foo",
@@ -195,6 +211,7 @@ TEST(LibRadosAio, SimpleWritePP) {
     sem_wait(&test_data.m_sem);
     sem_wait(&test_data.m_sem);
   }
+  ASSERT_EQ(0, my_completion->get_return_value());
   delete my_completion;
   }
 
@@ -203,7 +220,7 @@ TEST(LibRadosAio, SimpleWritePP) {
   ASSERT_EQ("", test_data.init());
   test_data.m_ioctx.set_namespace("nspace");
   AioCompletion *my_completion = test_data.m_cluster.aio_create_completion(
-	  (void*)&test_data, set_completion_complete, set_completion_safe);
+	  (void*)&test_data, set_completion_completePP, set_completion_safePP);
   ASSERT_EQ(0, test_data.m_ioctx.aio_write("foo",
 			       my_completion, bl1, sizeof(buf), 0));
   {
@@ -211,6 +228,7 @@ TEST(LibRadosAio, SimpleWritePP) {
     sem_wait(&test_data.m_sem);
     sem_wait(&test_data.m_sem);
   }
+  ASSERT_EQ(0, my_completion->get_return_value());
   delete my_completion;
   }
 }
@@ -227,6 +245,7 @@ TEST(LibRadosAio, WaitForSafe) {
 			       my_completion, buf, sizeof(buf), 0));
   TestAlarm alarm;
   ASSERT_EQ(0, rados_aio_wait_for_safe(my_completion));
+  ASSERT_EQ(0, rados_aio_get_return_value(my_completion));
   rados_aio_release(my_completion);
 }
 
@@ -234,7 +253,7 @@ TEST(LibRadosAio, WaitForSafePP) {
   AioTestDataPP test_data;
   ASSERT_EQ("", test_data.init());
   AioCompletion *my_completion = test_data.m_cluster.aio_create_completion(
-	  (void*)&test_data, set_completion_complete, set_completion_safe);
+	  (void*)&test_data, set_completion_completePP, set_completion_safePP);
   AioCompletion *my_completion_null = NULL;
   ASSERT_NE(my_completion, my_completion_null);
   char buf[128];
@@ -245,6 +264,7 @@ TEST(LibRadosAio, WaitForSafePP) {
 			       my_completion, bl1, sizeof(buf), 0));
   TestAlarm alarm;
   ASSERT_EQ(0, my_completion->wait_for_safe());
+  ASSERT_EQ(0, my_completion->get_return_value());
   delete my_completion;
 }
 
@@ -263,7 +283,8 @@ TEST(LibRadosAio, RoundTrip) {
     sem_wait(&test_data.m_sem);
     sem_wait(&test_data.m_sem);
   }
-  char buf2[128];
+  ASSERT_EQ(0, rados_aio_get_return_value(my_completion));
+  char buf2[256];
   memset(buf2, 0, sizeof(buf2));
   rados_completion_t my_completion2;
   ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
@@ -274,6 +295,7 @@ TEST(LibRadosAio, RoundTrip) {
     TestAlarm alarm;
     ASSERT_EQ(0, rados_aio_wait_for_complete(my_completion2));
   }
+  ASSERT_EQ((int)sizeof(buf), rados_aio_get_return_value(my_completion2));
   ASSERT_EQ(0, memcmp(buf, buf2, sizeof(buf)));
   rados_aio_release(my_completion);
   rados_aio_release(my_completion2);
@@ -294,6 +316,7 @@ TEST(LibRadosAio, RoundTrip2) {
     sem_wait(&test_data.m_sem);
     sem_wait(&test_data.m_sem);
   }
+  ASSERT_EQ(0, rados_aio_get_return_value(my_completion));
   char buf2[128];
   memset(buf2, 0, sizeof(buf2));
   rados_completion_t my_completion2;
@@ -303,8 +326,9 @@ TEST(LibRadosAio, RoundTrip2) {
 			      my_completion2, buf2, sizeof(buf2), 0));
   {
     TestAlarm alarm;
-    ASSERT_EQ(0, rados_aio_wait_for_safe(my_completion2));
+    ASSERT_EQ(0, rados_aio_wait_for_complete(my_completion2));
   }
+  ASSERT_EQ((int)sizeof(buf), rados_aio_get_return_value(my_completion2));
   ASSERT_EQ(0, memcmp(buf, buf2, sizeof(buf)));
   rados_aio_release(my_completion);
   rados_aio_release(my_completion2);
@@ -314,7 +338,7 @@ TEST(LibRadosAio, RoundTripPP) {
   AioTestDataPP test_data;
   ASSERT_EQ("", test_data.init());
   AioCompletion *my_completion = test_data.m_cluster.aio_create_completion(
-	  (void*)&test_data, set_completion_complete, set_completion_safe);
+	  (void*)&test_data, set_completion_completePP, set_completion_safePP);
   AioCompletion *my_completion_null = NULL;
   ASSERT_NE(my_completion, my_completion_null);
   char buf[128];
@@ -328,9 +352,10 @@ TEST(LibRadosAio, RoundTripPP) {
     sem_wait(&test_data.m_sem);
     sem_wait(&test_data.m_sem);
   }
+  ASSERT_EQ(0, my_completion->get_return_value());
   bufferlist bl2;
   AioCompletion *my_completion2 = test_data.m_cluster.aio_create_completion(
-	  (void*)&test_data, set_completion_complete, set_completion_safe);
+	  (void*)&test_data, set_completion_completePP, set_completion_safePP);
   ASSERT_NE(my_completion2, my_completion_null);
   ASSERT_EQ(0, test_data.m_ioctx.aio_read("foo",
 			      my_completion2, &bl2, sizeof(buf), 0));
@@ -338,6 +363,8 @@ TEST(LibRadosAio, RoundTripPP) {
     TestAlarm alarm;
     ASSERT_EQ(0, my_completion2->wait_for_complete());
   }
+  ASSERT_EQ((int)sizeof(buf), my_completion2->get_return_value());
+  ASSERT_EQ(sizeof(buf), bl2.length());
   ASSERT_EQ(0, memcmp(buf, bl2.c_str(), sizeof(buf)));
   delete my_completion;
   delete my_completion2;
@@ -347,7 +374,7 @@ TEST(LibRadosAio, RoundTripPP2) {
   AioTestDataPP test_data;
   ASSERT_EQ("", test_data.init());
   AioCompletion *my_completion = test_data.m_cluster.aio_create_completion(
-	  (void*)&test_data, set_completion_complete, set_completion_safe);
+	  (void*)&test_data, set_completion_completePP, set_completion_safePP);
   AioCompletion *my_completion_null = NULL;
   ASSERT_NE(my_completion, my_completion_null);
   char buf[128];
@@ -361,16 +388,20 @@ TEST(LibRadosAio, RoundTripPP2) {
     sem_wait(&test_data.m_sem);
     sem_wait(&test_data.m_sem);
   }
+  ASSERT_EQ(0, my_completion->get_return_value());
   bufferlist bl2;
   AioCompletion *my_completion2 = test_data.m_cluster.aio_create_completion(
-	  (void*)&test_data, set_completion_complete, set_completion_safe);
+	  (void*)&test_data, set_completion_completePP, set_completion_safePP);
   ASSERT_NE(my_completion2, my_completion_null);
   ASSERT_EQ(0, test_data.m_ioctx.aio_read("foo",
 			      my_completion2, &bl2, sizeof(buf), 0));
   {
     TestAlarm alarm;
     ASSERT_EQ(0, my_completion2->wait_for_safe());
+    ASSERT_EQ(0, my_completion2->wait_for_complete());
   }
+  ASSERT_EQ((int)sizeof(buf), my_completion2->get_return_value());
+  ASSERT_EQ(sizeof(buf), bl2.length());
   ASSERT_EQ(0, memcmp(buf, bl2.c_str(), sizeof(buf)));
   delete my_completion;
   delete my_completion2;
@@ -390,16 +421,18 @@ TEST(LibRadosAio, RoundTripAppend) {
     TestAlarm alarm;
     ASSERT_EQ(0, rados_aio_wait_for_complete(my_completion));
   }
+  ASSERT_EQ(0, rados_aio_get_return_value(my_completion));
   char buf2[128];
   memset(buf2, 0xdd, sizeof(buf2));
   ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
 	      set_completion_complete, set_completion_safe, &my_completion2));
   ASSERT_EQ(0, rados_aio_append(test_data.m_ioctx, "foo",
-			       my_completion2, buf2, sizeof(buf)));
+			       my_completion2, buf2, sizeof(buf2)));
   {
     TestAlarm alarm;
     ASSERT_EQ(0, rados_aio_wait_for_complete(my_completion2));
   }
+  ASSERT_EQ(0, rados_aio_get_return_value(my_completion2));
   char buf3[sizeof(buf) + sizeof(buf2)];
   memset(buf3, 0, sizeof(buf3));
   ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
@@ -410,6 +443,7 @@ TEST(LibRadosAio, RoundTripAppend) {
     TestAlarm alarm;
     ASSERT_EQ(0, rados_aio_wait_for_complete(my_completion3));
   }
+  ASSERT_EQ((int)sizeof(buf3), rados_aio_get_return_value(my_completion3));
   ASSERT_EQ(0, memcmp(buf3, buf, sizeof(buf)));
   ASSERT_EQ(0, memcmp(buf3 + sizeof(buf), buf2, sizeof(buf2)));
   rados_aio_release(my_completion);
@@ -434,6 +468,7 @@ TEST(LibRadosAio, RoundTripAppendPP) {
     TestAlarm alarm;
     ASSERT_EQ(0, my_completion->wait_for_complete());
   }
+  ASSERT_EQ(0, my_completion->get_return_value());
   char buf2[128];
   memset(buf2, 0xdd, sizeof(buf2));
   bufferlist bl2;
@@ -447,6 +482,7 @@ TEST(LibRadosAio, RoundTripAppendPP) {
     TestAlarm alarm;
     ASSERT_EQ(0, my_completion2->wait_for_complete());
   }
+  ASSERT_EQ(0, my_completion2->get_return_value());
   bufferlist bl3;
   AioCompletion *my_completion3 = test_data.m_cluster.aio_create_completion(
 	  (void*)&test_data, set_completion_complete, set_completion_safe);
@@ -457,6 +493,8 @@ TEST(LibRadosAio, RoundTripAppendPP) {
     TestAlarm alarm;
     ASSERT_EQ(0, my_completion3->wait_for_complete());
   }
+  ASSERT_EQ((int)(sizeof(buf) * 2), my_completion3->get_return_value());
+  ASSERT_EQ(sizeof(buf) * 2, bl3.length());
   ASSERT_EQ(0, memcmp(bl3.c_str(), buf, sizeof(buf)));
   ASSERT_EQ(0, memcmp(bl3.c_str() + sizeof(buf), buf2, sizeof(buf2)));
   delete my_completion;
@@ -479,6 +517,7 @@ TEST(LibRadosAio, IsComplete) {
     sem_wait(&test_data.m_sem);
     sem_wait(&test_data.m_sem);
   }
+  ASSERT_EQ(0, rados_aio_get_return_value(my_completion));
   char buf2[128];
   memset(buf2, 0, sizeof(buf2));
   rados_completion_t my_completion2;
@@ -497,6 +536,7 @@ TEST(LibRadosAio, IsComplete) {
 	break;
     }
   }
+  ASSERT_EQ((int)sizeof(buf), rados_aio_get_return_value(my_completion2));
   ASSERT_EQ(0, memcmp(buf, buf2, sizeof(buf)));
   rados_aio_release(my_completion);
   rados_aio_release(my_completion2);
@@ -520,6 +560,7 @@ TEST(LibRadosAio, IsCompletePP) {
     sem_wait(&test_data.m_sem);
     sem_wait(&test_data.m_sem);
   }
+  ASSERT_EQ(0, my_completion->get_return_value());
   bufferlist bl2;
   AioCompletion *my_completion2 = test_data.m_cluster.aio_create_completion(
 	  (void*)&test_data, set_completion_complete, set_completion_safe);
@@ -530,13 +571,15 @@ TEST(LibRadosAio, IsCompletePP) {
     TestAlarm alarm;
 
     // Busy-wait until the AIO completes.
-    // Normally we wouldn't do this, but we want to test rados_aio_is_complete.
+    // Normally we wouldn't do this, but we want to test is_complete.
     while (true) {
       int is_complete = my_completion2->is_complete();
       if (is_complete)
 	break;
     }
   }
+  ASSERT_EQ((int)sizeof(buf), my_completion2->get_return_value());
+  ASSERT_EQ(sizeof(buf), bl2.length());
   ASSERT_EQ(0, memcmp(buf, bl2.c_str(), sizeof(buf)));
   delete my_completion;
   delete my_completion2;
@@ -563,6 +606,7 @@ TEST(LibRadosAio, IsSafe) {
 	break;
     }
   }
+  ASSERT_EQ(0, rados_aio_get_return_value(my_completion));
   char buf2[128];
   memset(buf2, 0, sizeof(buf2));
   rados_completion_t my_completion2;
@@ -574,6 +618,7 @@ TEST(LibRadosAio, IsSafe) {
     TestAlarm alarm;
     ASSERT_EQ(0, rados_aio_wait_for_complete(my_completion2));
   }
+  ASSERT_EQ((int)sizeof(buf), rados_aio_get_return_value(my_completion2));
   ASSERT_EQ(0, memcmp(buf, buf2, sizeof(buf)));
   rados_aio_release(my_completion);
   rados_aio_release(my_completion2);
@@ -603,6 +648,7 @@ TEST(LibRadosAio, IsSafePP) {
 	break;
     }
   }
+  ASSERT_EQ(0, my_completion->get_return_value());
   AioCompletion *my_completion2 = test_data.m_cluster.aio_create_completion(
 	  (void*)&test_data, set_completion_complete, set_completion_safe);
   bufferlist bl2;
@@ -613,6 +659,8 @@ TEST(LibRadosAio, IsSafePP) {
     TestAlarm alarm;
     ASSERT_EQ(0, my_completion2->wait_for_complete());
   }
+  ASSERT_EQ((int)sizeof(buf), my_completion2->get_return_value());
+  ASSERT_EQ(sizeof(buf), bl2.length());
   ASSERT_EQ(0, memcmp(buf, bl2.c_str(), sizeof(buf)));
   delete my_completion;
   delete my_completion2;
@@ -665,6 +713,7 @@ TEST(LibRadosAio, Flush) {
   ASSERT_EQ(0, rados_aio_write(test_data.m_ioctx, "foo",
 			       my_completion, buf, sizeof(buf), 0));
   ASSERT_EQ(0, rados_aio_flush(test_data.m_ioctx));
+  ASSERT_EQ(0, rados_aio_get_return_value(my_completion));
   char buf2[128];
   memset(buf2, 0, sizeof(buf2));
   rados_completion_t my_completion2;
@@ -676,6 +725,7 @@ TEST(LibRadosAio, Flush) {
     TestAlarm alarm;
     ASSERT_EQ(0, rados_aio_wait_for_complete(my_completion2));
   }
+  ASSERT_EQ((int)sizeof(buf2), rados_aio_get_return_value(my_completion2));
   ASSERT_EQ(0, memcmp(buf, buf2, sizeof(buf)));
   rados_aio_release(my_completion);
   rados_aio_release(my_completion2);
@@ -695,6 +745,7 @@ TEST(LibRadosAio, FlushPP) {
   ASSERT_EQ(0, test_data.m_ioctx.aio_write("foo", my_completion,
 					   bl1, sizeof(buf), 0));
   ASSERT_EQ(0, test_data.m_ioctx.aio_flush());
+  ASSERT_EQ(0, my_completion->get_return_value());
   bufferlist bl2;
   AioCompletion *my_completion2 = test_data.m_cluster.aio_create_completion(
 	  (void*)&test_data, set_completion_complete, set_completion_safe);
@@ -705,6 +756,8 @@ TEST(LibRadosAio, FlushPP) {
     TestAlarm alarm;
     ASSERT_EQ(0, my_completion2->wait_for_complete());
   }
+  ASSERT_EQ((int)sizeof(buf), my_completion2->get_return_value());
+  ASSERT_EQ(sizeof(buf), bl2.length());
   ASSERT_EQ(0, memcmp(buf, bl2.c_str(), sizeof(buf)));
   delete my_completion;
   delete my_completion2;
@@ -732,6 +785,7 @@ TEST(LibRadosAio, FlushAsync) {
   ASSERT_EQ(1, rados_aio_is_safe(my_completion));
   ASSERT_EQ(1, rados_aio_is_complete(flush_completion));
   ASSERT_EQ(1, rados_aio_is_safe(flush_completion));
+  ASSERT_EQ(0, rados_aio_get_return_value(my_completion));
   char buf2[128];
   memset(buf2, 0, sizeof(buf2));
   rados_completion_t my_completion2;
@@ -743,6 +797,7 @@ TEST(LibRadosAio, FlushAsync) {
     TestAlarm alarm;
     ASSERT_EQ(0, rados_aio_wait_for_complete(my_completion2));
   }
+  ASSERT_EQ((int)sizeof(buf2), rados_aio_get_return_value(my_completion2));
   ASSERT_EQ(0, memcmp(buf, buf2, sizeof(buf)));
   rados_aio_release(my_completion);
   rados_aio_release(my_completion2);
@@ -774,6 +829,7 @@ TEST(LibRadosAio, FlushAsyncPP) {
   ASSERT_EQ(1, my_completion->is_safe());
   ASSERT_EQ(1, flush_completion->is_complete());
   ASSERT_EQ(1, flush_completion->is_safe());
+  ASSERT_EQ(0, my_completion->get_return_value());
   bufferlist bl2;
   AioCompletion *my_completion2 = test_data.m_cluster.aio_create_completion(
 	  (void*)&test_data, set_completion_complete, set_completion_safe);
@@ -784,6 +840,8 @@ TEST(LibRadosAio, FlushAsyncPP) {
     TestAlarm alarm;
     ASSERT_EQ(0, my_completion2->wait_for_complete());
   }
+  ASSERT_EQ((int)sizeof(buf), my_completion2->get_return_value());
+  ASSERT_EQ(sizeof(buf), bl2.length());
   ASSERT_EQ(0, memcmp(buf, bl2.c_str(), sizeof(buf)));
   delete my_completion;
   delete my_completion2;
@@ -804,6 +862,7 @@ TEST(LibRadosAio, RoundTripWriteFull) {
     TestAlarm alarm;
     ASSERT_EQ(0, rados_aio_wait_for_complete(my_completion));
   }
+  ASSERT_EQ(0, rados_aio_get_return_value(my_completion));
   char buf2[64];
   memset(buf2, 0xdd, sizeof(buf2));
   ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
@@ -814,6 +873,7 @@ TEST(LibRadosAio, RoundTripWriteFull) {
     TestAlarm alarm;
     ASSERT_EQ(0, rados_aio_wait_for_complete(my_completion2));
   }
+  ASSERT_EQ(0, rados_aio_get_return_value(my_completion2));
   char buf3[sizeof(buf) + sizeof(buf2)];
   memset(buf3, 0, sizeof(buf3));
   ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
@@ -824,6 +884,7 @@ TEST(LibRadosAio, RoundTripWriteFull) {
     TestAlarm alarm;
     ASSERT_EQ(0, rados_aio_wait_for_complete(my_completion3));
   }
+  ASSERT_EQ((int)sizeof(buf2), rados_aio_get_return_value(my_completion3));
   ASSERT_EQ(0, memcmp(buf3, buf2, sizeof(buf2)));
   rados_aio_release(my_completion);
   rados_aio_release(my_completion2);
@@ -847,6 +908,7 @@ TEST(LibRadosAio, RoundTripWriteFullPP) {
     TestAlarm alarm;
     ASSERT_EQ(0, my_completion->wait_for_complete());
   }
+  ASSERT_EQ(0, my_completion->get_return_value());
   char buf2[64];
   memset(buf2, 0xdd, sizeof(buf2));
   bufferlist bl2;
@@ -859,6 +921,7 @@ TEST(LibRadosAio, RoundTripWriteFullPP) {
     TestAlarm alarm;
     ASSERT_EQ(0, my_completion2->wait_for_complete());
   }
+  ASSERT_EQ(0, my_completion2->get_return_value());
   bufferlist bl3;
   AioCompletion *my_completion3 = test_data.m_cluster.aio_create_completion(
 	  (void*)&test_data, set_completion_complete, set_completion_safe);
@@ -869,6 +932,8 @@ TEST(LibRadosAio, RoundTripWriteFullPP) {
     TestAlarm alarm;
     ASSERT_EQ(0, my_completion3->wait_for_complete());
   }
+  ASSERT_EQ((int)sizeof(buf2), my_completion3->get_return_value());
+  ASSERT_EQ(sizeof(buf2), bl3.length());
   ASSERT_EQ(0, memcmp(bl3.c_str(), buf2, sizeof(buf2)));
   delete my_completion;
   delete my_completion2;
@@ -891,6 +956,7 @@ TEST(LibRadosAio, SimpleStat) {
     sem_wait(&test_data.m_sem);
     sem_wait(&test_data.m_sem);
   }
+  ASSERT_EQ(0, rados_aio_get_return_value(my_completion));
   uint64_t psize;
   time_t pmtime;
   rados_completion_t my_completion2;
@@ -902,6 +968,7 @@ TEST(LibRadosAio, SimpleStat) {
     TestAlarm alarm;
     ASSERT_EQ(0, rados_aio_wait_for_complete(my_completion2));
   }
+  ASSERT_EQ(0, rados_aio_get_return_value(my_completion2));
   ASSERT_EQ(sizeof(buf), psize);
   rados_aio_release(my_completion);
   rados_aio_release(my_completion2);
@@ -925,6 +992,7 @@ TEST(LibRadosAio, SimpleStatPP) {
     sem_wait(&test_data.m_sem);
     sem_wait(&test_data.m_sem);
   }
+  ASSERT_EQ(0, my_completion->get_return_value());
   uint64_t psize;
   time_t pmtime;
   AioCompletion *my_completion2 = test_data.m_cluster.aio_create_completion(
@@ -936,6 +1004,7 @@ TEST(LibRadosAio, SimpleStatPP) {
     TestAlarm alarm;
     ASSERT_EQ(0, my_completion2->wait_for_complete());
   }
+  ASSERT_EQ(0, my_completion2->get_return_value());
   ASSERT_EQ(sizeof(buf), psize);
   delete my_completion;
   delete my_completion2;
@@ -956,6 +1025,7 @@ TEST(LibRadosAio, SimpleStatNS) {
     sem_wait(&test_data.m_sem);
     sem_wait(&test_data.m_sem);
   }
+  ASSERT_EQ(0, rados_aio_get_return_value(my_completion));
   rados_ioctx_set_namespace(test_data.m_ioctx, "nspace");
   char buf2[64];
   memset(buf2, 0xbb, sizeof(buf2));
@@ -968,6 +1038,7 @@ TEST(LibRadosAio, SimpleStatNS) {
     sem_wait(&test_data.m_sem);
     sem_wait(&test_data.m_sem);
   }
+  ASSERT_EQ(0, rados_aio_get_return_value(my_completion));
   uint64_t psize;
   time_t pmtime;
   rados_completion_t my_completion2;
@@ -980,6 +1051,7 @@ TEST(LibRadosAio, SimpleStatNS) {
     TestAlarm alarm;
     ASSERT_EQ(0, rados_aio_wait_for_complete(my_completion2));
   }
+  ASSERT_EQ(0, rados_aio_get_return_value(my_completion2));
   ASSERT_EQ(sizeof(buf), psize);
 
   rados_ioctx_set_namespace(test_data.m_ioctx, "nspace");
@@ -992,6 +1064,7 @@ TEST(LibRadosAio, SimpleStatNS) {
     TestAlarm alarm;
     ASSERT_EQ(0, rados_aio_wait_for_complete(my_completion3));
   }
+  ASSERT_EQ(0, rados_aio_get_return_value(my_completion3));
   ASSERT_EQ(sizeof(buf2), psize);
 
   rados_aio_release(my_completion);
@@ -1017,6 +1090,7 @@ TEST(LibRadosAio, SimpleStatPPNS) {
     sem_wait(&test_data.m_sem);
     sem_wait(&test_data.m_sem);
   }
+  ASSERT_EQ(0, my_completion->get_return_value());
   uint64_t psize;
   time_t pmtime;
   AioCompletion *my_completion2 = test_data.m_cluster.aio_create_completion(
@@ -1028,6 +1102,7 @@ TEST(LibRadosAio, SimpleStatPPNS) {
     TestAlarm alarm;
     ASSERT_EQ(0, my_completion2->wait_for_complete());
   }
+  ASSERT_EQ(0, my_completion2->get_return_value());
   ASSERT_EQ(sizeof(buf), psize);
   delete my_completion;
   delete my_completion2;
@@ -1048,6 +1123,7 @@ TEST(LibRadosAio, StatRemove) {
     sem_wait(&test_data.m_sem);
     sem_wait(&test_data.m_sem);
   }
+  ASSERT_EQ(0, rados_aio_get_return_value(my_completion));
   uint64_t psize;
   time_t pmtime;
   rados_completion_t my_completion2;
@@ -1059,6 +1135,7 @@ TEST(LibRadosAio, StatRemove) {
     TestAlarm alarm;
     ASSERT_EQ(0, rados_aio_wait_for_complete(my_completion2));
   }
+  ASSERT_EQ(0, rados_aio_get_return_value(my_completion2));
   ASSERT_EQ(sizeof(buf), psize);
   rados_completion_t my_completion3;
   ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
@@ -1068,6 +1145,7 @@ TEST(LibRadosAio, StatRemove) {
     TestAlarm alarm;
     ASSERT_EQ(0, rados_aio_wait_for_complete(my_completion3));
   }
+  ASSERT_EQ(0, rados_aio_get_return_value(my_completion3));
   uint64_t psize2;
   time_t pmtime2;
   rados_completion_t my_completion4;
@@ -1104,6 +1182,7 @@ TEST(LibRadosAio, StatRemovePP) {
     sem_wait(&test_data.m_sem);
     sem_wait(&test_data.m_sem);
   }
+  ASSERT_EQ(0, my_completion->get_return_value());
   uint64_t psize;
   time_t pmtime;
   AioCompletion *my_completion2 = test_data.m_cluster.aio_create_completion(
@@ -1115,6 +1194,7 @@ TEST(LibRadosAio, StatRemovePP) {
     TestAlarm alarm;
     ASSERT_EQ(0, my_completion2->wait_for_complete());
   }
+  ASSERT_EQ(0, my_completion2->get_return_value());
   ASSERT_EQ(sizeof(buf), psize);
   uint64_t psize2;
   time_t pmtime2;
@@ -1126,6 +1206,7 @@ TEST(LibRadosAio, StatRemovePP) {
     TestAlarm alarm;
     ASSERT_EQ(0, my_completion3->wait_for_complete());
   }
+  ASSERT_EQ(0, my_completion3->get_return_value());
 
   AioCompletion *my_completion4 = test_data.m_cluster.aio_create_completion(
 	  (void*)&test_data, set_completion_complete, set_completion_safe);
@@ -1174,6 +1255,7 @@ TEST(LibRadosAio, OmapPP) {
       TestAlarm alarm;
       ASSERT_EQ(0, my_completion->wait_for_complete());
     }
+    EXPECT_EQ(0, my_completion->get_return_value());
   }
 
   {
@@ -1192,6 +1274,7 @@ TEST(LibRadosAio, OmapPP) {
       TestAlarm alarm;
       ASSERT_EQ(0, my_completion->wait_for_complete());
     }
+    EXPECT_EQ(-ECANCELED, my_completion->get_return_value());
     ASSERT_EQ(-ECANCELED, r);
   }
 
@@ -1225,6 +1308,7 @@ TEST(LibRadosAio, OmapPP) {
       TestAlarm alarm;
       ASSERT_EQ(0, my_completion->wait_for_complete());
     }
+    EXPECT_EQ(0, my_completion->get_return_value());
 
     ASSERT_EQ(header.length(), header_to_set.length());
     ASSERT_EQ(set_got.size(), (unsigned)1);
@@ -1249,6 +1333,7 @@ TEST(LibRadosAio, OmapPP) {
       TestAlarm alarm;
       ASSERT_EQ(0, my_completion->wait_for_complete());
     }
+    EXPECT_EQ(0, my_completion->get_return_value());
   }
 
   {
@@ -1262,6 +1347,7 @@ TEST(LibRadosAio, OmapPP) {
       TestAlarm alarm;
       ASSERT_EQ(0, my_completion->wait_for_complete());
     }
+    EXPECT_EQ(0, my_completion->get_return_value());
     ASSERT_EQ(set_got.size(), (unsigned)2);
   }
 
@@ -1274,6 +1360,7 @@ TEST(LibRadosAio, OmapPP) {
       TestAlarm alarm;
       ASSERT_EQ(0, my_completion->wait_for_complete());
     }
+    EXPECT_EQ(0, my_completion->get_return_value());
   }
 
   {
@@ -1287,9 +1374,1492 @@ TEST(LibRadosAio, OmapPP) {
       TestAlarm alarm;
       ASSERT_EQ(0, my_completion->wait_for_complete());
     }
+    EXPECT_EQ(0, my_completion->get_return_value());
     ASSERT_EQ(set_got.size(), (unsigned)0);
   }
 
   ioctx.remove("test_obj");
   destroy_one_pool_pp(pool_name, cluster);
 }
+
+// Issues two back-to-back AIO writes to object "foo" (the second starting
+// where the first ended), then reads with an oversized buffer and checks that
+// the read returns exactly the concatenation of both payloads.
+TEST(LibRadosAio, MultiWrite) {
+  AioTestData test_data;
+  rados_completion_t first_write, second_write, read_back;
+  ASSERT_EQ("", test_data.init());
+  void *cb_arg = static_cast<void*>(&test_data);
+
+  // First payload: 128 bytes of 0xcc at offset 0.
+  ASSERT_EQ(0, rados_aio_create_completion(cb_arg,
+              set_completion_complete, set_completion_safe, &first_write));
+  char head[128];
+  memset(head, 0xcc, sizeof(head));
+  ASSERT_EQ(0, rados_aio_write(test_data.m_ioctx, "foo",
+                               first_write, head, sizeof(head), 0));
+  {
+    TestAlarm alarm;
+    ASSERT_EQ(0, rados_aio_wait_for_complete(first_write));
+  }
+  ASSERT_EQ(0, rados_aio_get_return_value(first_write));
+
+  // Second payload: 64 bytes of 0xdd written at offset sizeof(head).
+  char tail[64];
+  memset(tail, 0xdd, sizeof(tail));
+  ASSERT_EQ(0, rados_aio_create_completion(cb_arg,
+              set_completion_complete, set_completion_safe, &second_write));
+  ASSERT_EQ(0, rados_aio_write(test_data.m_ioctx, "foo",
+                               second_write, tail, sizeof(tail), sizeof(head)));
+  {
+    TestAlarm alarm;
+    ASSERT_EQ(0, rados_aio_wait_for_complete(second_write));
+  }
+  ASSERT_EQ(0, rados_aio_get_return_value(second_write));
+
+  // Request three times the stored size; the return value must still equal
+  // the number of bytes that were actually written.
+  const size_t stored = sizeof(head) + sizeof(tail);
+  char readbuf[stored * 3];
+  memset(readbuf, 0, sizeof(readbuf));
+  ASSERT_EQ(0, rados_aio_create_completion(cb_arg,
+              set_completion_complete, set_completion_safe, &read_back));
+  ASSERT_EQ(0, rados_aio_read(test_data.m_ioctx, "foo",
+                              read_back, readbuf, sizeof(readbuf), 0));
+  {
+    TestAlarm alarm;
+    ASSERT_EQ(0, rados_aio_wait_for_complete(read_back));
+  }
+  ASSERT_EQ(static_cast<int>(stored), rados_aio_get_return_value(read_back));
+  ASSERT_EQ(0, memcmp(readbuf, head, sizeof(head)));
+  ASSERT_EQ(0, memcmp(readbuf + sizeof(head), tail, sizeof(tail)));
+  rados_aio_release(first_write);
+  rados_aio_release(second_write);
+  rados_aio_release(read_back);
+}
+
+// C++ API counterpart of MultiWrite: two sequential aio_write() calls to
+// object "foo" (the second at offset sizeof(buf)), followed by one oversized
+// aio_read() that must return exactly both payloads, in order.
+TEST(LibRadosAio, MultiWritePP) {
+  AioTestDataPP test_data;
+  ASSERT_EQ("", test_data.init());
+  AioCompletion *my_completion = test_data.m_cluster.aio_create_completion(
+	  (void*)&test_data, set_completion_complete, set_completion_safe);
+  AioCompletion *my_completion_null = NULL;
+  ASSERT_NE(my_completion, my_completion_null);
+  // First payload: 128 bytes of 0xcc at offset 0.
+  char buf[128];
+  memset(buf, 0xcc, sizeof(buf));
+  bufferlist bl1;
+  bl1.append(buf, sizeof(buf));
+  ASSERT_EQ(0, test_data.m_ioctx.aio_write("foo", my_completion,
+					   bl1, sizeof(buf), 0));
+  {
+    TestAlarm alarm;
+    ASSERT_EQ(0, my_completion->wait_for_complete());
+  }
+  ASSERT_EQ(0, my_completion->get_return_value());
+
+  // Second payload: 64 bytes of 0xdd placed directly after the first.
+  char buf2[64];
+  memset(buf2, 0xdd, sizeof(buf2));
+  bufferlist bl2;
+  bl2.append(buf2, sizeof(buf2));
+  AioCompletion *my_completion2 = test_data.m_cluster.aio_create_completion(
+	  (void*)&test_data, set_completion_complete, set_completion_safe);
+  ASSERT_NE(my_completion2, my_completion_null);
+  ASSERT_EQ(0, test_data.m_ioctx.aio_write("foo", my_completion2,
+					   bl2, sizeof(buf2), sizeof(buf)));
+  {
+    TestAlarm alarm;
+    ASSERT_EQ(0, my_completion2->wait_for_complete());
+  }
+  ASSERT_EQ(0, my_completion2->get_return_value());
+
+  // Ask for three times the stored size, matching the C variant of this
+  // test; the read must still report only the bytes actually written.
+  bufferlist bl3;
+  AioCompletion *my_completion3 = test_data.m_cluster.aio_create_completion(
+	  (void*)&test_data, set_completion_complete, set_completion_safe);
+  ASSERT_NE(my_completion3, my_completion_null);
+  ASSERT_EQ(0, test_data.m_ioctx.aio_read("foo", my_completion3,
+					  &bl3, (sizeof(buf) + sizeof(buf2)) * 3, 0));
+  {
+    TestAlarm alarm;
+    ASSERT_EQ(0, my_completion3->wait_for_complete());
+  }
+  ASSERT_EQ((int)(sizeof(buf) + sizeof(buf2)), my_completion3->get_return_value());
+  ASSERT_EQ(sizeof(buf) + sizeof(buf2), bl3.length());
+  ASSERT_EQ(0, memcmp(bl3.c_str(), buf, sizeof(buf)));
+  ASSERT_EQ(0, memcmp(bl3.c_str() + sizeof(buf), buf2, sizeof(buf2)));
+  delete my_completion;
+  delete my_completion2;
+  delete my_completion3;
+}
+
+// EC test cases
+// Fixture for the C-API erasure-coded AIO tests.  init() sets up a semaphore,
+// a temporary EC pool and an ioctx on that pool; the destructor tears them
+// down again, but only if init() completed successfully.
+class AioTestDataEC
+{
+public:
+  AioTestDataEC()
+    : m_cluster(NULL),
+      m_ioctx(NULL),
+      m_init(false),
+      m_complete(false),
+      m_safe(false)
+  {
+  }
+
+  ~AioTestDataEC()
+  {
+    if (m_init) {
+      rados_ioctx_destroy(m_ioctx);
+      destroy_one_ec_pool(m_pool_name, &m_cluster);
+      sem_destroy(&m_sem);
+    }
+  }
+
+  // Returns "" on success, otherwise a human-readable description of the
+  // step that failed.  Everything acquired before the failing step is
+  // released before returning.
+  std::string init()
+  {
+    int ret;
+    if (sem_init(&m_sem, 0, 0)) {
+      // sem_init() failed, so m_sem was never initialised and must not be
+      // handed to sem_destroy().
+      int err = errno;
+      ostringstream oss;
+      oss << "sem_init failed: " << cpp_strerror(err);
+      return oss.str();
+    }
+    m_pool_name = get_temp_pool_name();
+    std::string err = create_one_ec_pool(m_pool_name, &m_cluster);
+    if (!err.empty()) {
+      sem_destroy(&m_sem);
+      ostringstream oss;
+      oss << "create_one_ec_pool(" << m_pool_name << ") failed: error " << err;
+      return oss.str();
+    }
+    ret = rados_ioctx_create(m_cluster, m_pool_name.c_str(), &m_ioctx);
+    if (ret) {
+      sem_destroy(&m_sem);
+      destroy_one_ec_pool(m_pool_name, &m_cluster);
+      ostringstream oss;
+      oss << "rados_ioctx_create failed: error " << ret;
+      return oss.str();
+    }
+    m_init = true;
+    return "";
+  }
+
+  sem_t m_sem;             // posted once by each completion callback
+  rados_t m_cluster;
+  rados_ioctx_t m_ioctx;
+  std::string m_pool_name;
+  bool m_init;             // true once init() has fully succeeded
+  bool m_complete;         // set by the "complete" callback
+  bool m_safe;             // set by the "safe" callback
+};
+
+// Fixture for the C++-API erasure-coded AIO tests.  init() sets up a
+// semaphore, a temporary EC pool and an IoCtx on that pool; the destructor
+// tears them down again, but only if init() completed successfully.
+class AioTestDataECPP
+{
+public:
+  AioTestDataECPP()
+    : m_init(false),
+      m_complete(false),
+      m_safe(false)
+  {
+  }
+
+  ~AioTestDataECPP()
+  {
+    if (m_init) {
+      m_ioctx.close();
+      destroy_one_ec_pool_pp(m_pool_name, m_cluster);
+      sem_destroy(&m_sem);
+    }
+  }
+
+  // Returns "" on success, otherwise a human-readable description of the
+  // step that failed.  Everything acquired before the failing step is
+  // released before returning.
+  std::string init()
+  {
+    int ret;
+    if (sem_init(&m_sem, 0, 0)) {
+      // sem_init() failed, so m_sem was never initialised and must not be
+      // handed to sem_destroy().
+      int err = errno;
+      ostringstream oss;
+      oss << "sem_init failed: " << cpp_strerror(err);
+      return oss.str();
+    }
+    m_pool_name = get_temp_pool_name();
+    std::string err = create_one_ec_pool_pp(m_pool_name, m_cluster);
+    if (!err.empty()) {
+      sem_destroy(&m_sem);
+      ostringstream oss;
+      // Name the helper that was actually called.
+      oss << "create_one_ec_pool_pp(" << m_pool_name << ") failed: error " << err;
+      return oss.str();
+    }
+    ret = m_cluster.ioctx_create(m_pool_name.c_str(), m_ioctx);
+    if (ret) {
+      sem_destroy(&m_sem);
+      destroy_one_ec_pool_pp(m_pool_name, m_cluster);
+      ostringstream oss;
+      oss << "ioctx_create failed: error " << ret;
+      return oss.str();
+    }
+    m_init = true;
+    return "";
+  }
+
+  sem_t m_sem;             // posted once by each completion callback
+  Rados m_cluster;
+  IoCtx m_ioctx;
+  std::string m_pool_name;
+  bool m_init;             // true once init() has fully succeeded
+  bool m_complete;         // set by the "complete" callback
+  bool m_safe;             // set by the "safe" callback
+};
+
+// Completion callback for the C EC fixture: marks the operation as complete
+// on the AioTestDataEC passed through `arg` and posts its semaphore.
+void set_completion_completeEC(rados_completion_t cb, void *arg)
+{
+  AioTestDataEC &fixture = *static_cast<AioTestDataEC*>(arg);
+  fixture.m_complete = true;
+  sem_post(&fixture.m_sem);
+}
+
+// Completion callback for the C EC fixture: marks the operation as safe on
+// the AioTestDataEC passed through `arg` and posts its semaphore.
+void set_completion_safeEC(rados_completion_t cb, void *arg)
+{
+  AioTestDataEC &fixture = *static_cast<AioTestDataEC*>(arg);
+  fixture.m_safe = true;
+  sem_post(&fixture.m_sem);
+}
+
+// Completion callback for the C++ EC fixture: marks the operation as complete
+// on the AioTestDataECPP passed through `arg` and posts its semaphore.
+void set_completion_completeECPP(rados_completion_t cb, void *arg)
+{
+  AioTestDataECPP &fixture = *static_cast<AioTestDataECPP*>(arg);
+  fixture.m_complete = true;
+  sem_post(&fixture.m_sem);
+}
+
+// Completion callback for the C++ EC fixture: marks the operation as safe on
+// the AioTestDataECPP passed through `arg` and posts its semaphore.
+void set_completion_safeECPP(rados_completion_t cb, void *arg)
+{
+  AioTestDataECPP &fixture = *static_cast<AioTestDataECPP*>(arg);
+  fixture.m_safe = true;
+  sem_post(&fixture.m_sem);
+}
+
+// Writes the same 128-byte payload to object "foo" twice on one EC-pool
+// ioctx: first in the default namespace, then in namespace "nspace".  After
+// each write the test waits for both callbacks (two semaphore posts) and
+// expects a return value of 0.
+TEST(LibRadosAioEC, SimpleWrite) {
+  AioTestDataEC test_data;
+  rados_completion_t my_completion;
+  ASSERT_EQ("", test_data.init());
+  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
+	      set_completion_completeEC, set_completion_safeEC, &my_completion));
+  char buf[128];
+  memset(buf, 0xcc, sizeof(buf));
+  ASSERT_EQ(0, rados_aio_write(test_data.m_ioctx, "foo",
+			       my_completion, buf, sizeof(buf), 0));
+  {
+    TestAlarm alarm;
+    sem_wait(&test_data.m_sem);
+    sem_wait(&test_data.m_sem);
+  }
+  ASSERT_EQ(0, rados_aio_get_return_value(my_completion));
+  // Release the first completion before the handle is reused below;
+  // otherwise it would be overwritten and leaked.
+  rados_aio_release(my_completion);
+
+  rados_ioctx_set_namespace(test_data.m_ioctx, "nspace");
+  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
+	      set_completion_completeEC, set_completion_safeEC, &my_completion));
+  ASSERT_EQ(0, rados_aio_write(test_data.m_ioctx, "foo",
+			       my_completion, buf, sizeof(buf), 0));
+  {
+    TestAlarm alarm;
+    sem_wait(&test_data.m_sem);
+    sem_wait(&test_data.m_sem);
+  }
+  ASSERT_EQ(0, rados_aio_get_return_value(my_completion));
+  rados_aio_release(my_completion);
+}
+
+// Writes the same 128-byte payload through the C++ AIO API to object "foo"
+// using two separately initialised fixtures: once in the default namespace
+// and once in namespace "nspace".  Each write must report a return value
+// of 0.
+TEST(LibRadosAioEC, SimpleWritePP) {
+  char buf[128];
+  memset(buf, 0xcc, sizeof(buf));
+  bufferlist bl1;
+  bl1.append(buf, sizeof(buf));
+  AioCompletion *my_completion_null = NULL;
+
+  // Default namespace.
+  {
+    AioTestDataECPP test_data;
+    ASSERT_EQ("", test_data.init());
+    AioCompletion *my_completion = test_data.m_cluster.aio_create_completion(
+	    (void*)&test_data, set_completion_completeECPP, set_completion_safeECPP);
+    ASSERT_NE(my_completion, my_completion_null);
+    ASSERT_EQ(0, test_data.m_ioctx.aio_write("foo",
+				 my_completion, bl1, sizeof(buf), 0));
+    {
+      // One semaphore post is expected from each of the two callbacks.
+      TestAlarm alarm;
+      sem_wait(&test_data.m_sem);
+      sem_wait(&test_data.m_sem);
+    }
+    ASSERT_EQ(0, my_completion->get_return_value());
+    delete my_completion;
+  }
+
+  // Namespace "nspace".
+  {
+    AioTestDataECPP test_data;
+    ASSERT_EQ("", test_data.init());
+    test_data.m_ioctx.set_namespace("nspace");
+    AioCompletion *my_completion = test_data.m_cluster.aio_create_completion(
+	    (void*)&test_data, set_completion_completeECPP, set_completion_safeECPP);
+    // Same NULL guard as the first scope, before the pointer is dereferenced.
+    ASSERT_NE(my_completion, my_completion_null);
+    ASSERT_EQ(0, test_data.m_ioctx.aio_write("foo",
+				 my_completion, bl1, sizeof(buf), 0));
+    {
+      TestAlarm alarm;
+      sem_wait(&test_data.m_sem);
+      sem_wait(&test_data.m_sem);
+    }
+    ASSERT_EQ(0, my_completion->get_return_value());
+    delete my_completion;
+  }
+}
+
+// Issues one 128-byte AIO write to "foo" in an EC pool and blocks in
+// rados_aio_wait_for_safe(); both the wait and the write's return value
+// must be 0.
+TEST(LibRadosAioEC, WaitForSafe) {
+  AioTestDataEC test_data;
+  rados_completion_t write_done;
+  ASSERT_EQ("", test_data.init());
+  ASSERT_EQ(0, rados_aio_create_completion(static_cast<void*>(&test_data),
+              set_completion_completeEC, set_completion_safeEC, &write_done));
+  char payload[128];
+  memset(payload, 0xcc, sizeof(payload));
+  ASSERT_EQ(0, rados_aio_write(test_data.m_ioctx, "foo",
+                               write_done, payload, sizeof(payload), 0));
+  // Deliberately unscoped: the alarm lives until the end of the test body.
+  TestAlarm alarm;
+  ASSERT_EQ(0, rados_aio_wait_for_safe(write_done));
+  ASSERT_EQ(0, rados_aio_get_return_value(write_done));
+  rados_aio_release(write_done);
+}
+
+// C++ API variant of WaitForSafe: one 128-byte aio_write() to "foo" in an EC
+// pool, then wait_for_safe(); both the wait and the write's return value
+// must be 0.
+TEST(LibRadosAioEC, WaitForSafePP) {
+  AioTestDataECPP test_data;
+  ASSERT_EQ("", test_data.init());
+  AioCompletion *null_completion = NULL;
+  AioCompletion *write_done = test_data.m_cluster.aio_create_completion(
+      static_cast<void*>(&test_data),
+      set_completion_completeECPP, set_completion_safeECPP);
+  ASSERT_NE(write_done, null_completion);
+  char payload[128];
+  memset(payload, 0xcc, sizeof(payload));
+  bufferlist out_bl;
+  out_bl.append(payload, sizeof(payload));
+  ASSERT_EQ(0, test_data.m_ioctx.aio_write("foo",
+                               write_done, out_bl, sizeof(payload), 0));
+  // Deliberately unscoped: the alarm lives until the end of the test body.
+  TestAlarm alarm;
+  ASSERT_EQ(0, write_done->wait_for_safe());
+  ASSERT_EQ(0, write_done->get_return_value());
+  delete write_done;
+}
+
+// Writes 128 bytes to "foo" in an EC pool, waits on the fixture semaphore
+// for two posts, then reads into a larger (256-byte) buffer and checks that
+// the read returns exactly the 128 bytes that were written.
+TEST(LibRadosAioEC, RoundTrip) {
+  AioTestDataEC test_data;
+  rados_completion_t write_done;
+  ASSERT_EQ("", test_data.init());
+  void *cb_arg = static_cast<void*>(&test_data);
+  ASSERT_EQ(0, rados_aio_create_completion(cb_arg,
+              set_completion_completeEC, set_completion_safeEC, &write_done));
+  char payload[128];
+  memset(payload, 0xcc, sizeof(payload));
+  ASSERT_EQ(0, rados_aio_write(test_data.m_ioctx, "foo",
+                               write_done, payload, sizeof(payload), 0));
+  {
+    // Each of the two registered callbacks posts the semaphore once.
+    TestAlarm alarm;
+    sem_wait(&test_data.m_sem);
+    sem_wait(&test_data.m_sem);
+  }
+  ASSERT_EQ(0, rados_aio_get_return_value(write_done));
+
+  // Read back with a buffer twice the size of the payload.
+  char readbuf[256];
+  memset(readbuf, 0, sizeof(readbuf));
+  rados_completion_t read_done;
+  ASSERT_EQ(0, rados_aio_create_completion(cb_arg,
+              set_completion_completeEC, set_completion_safeEC, &read_done));
+  ASSERT_EQ(0, rados_aio_read(test_data.m_ioctx, "foo",
+                              read_done, readbuf, sizeof(readbuf), 0));
+  {
+    TestAlarm alarm;
+    ASSERT_EQ(0, rados_aio_wait_for_complete(read_done));
+  }
+  ASSERT_EQ(static_cast<int>(sizeof(payload)),
+            rados_aio_get_return_value(read_done));
+  ASSERT_EQ(0, memcmp(payload, readbuf, sizeof(payload)));
+  rados_aio_release(write_done);
+  rados_aio_release(read_done);
+}
+
+// Same round trip as RoundTrip, but the read buffer is exactly as large as
+// the 128-byte payload: write "foo", wait for two semaphore posts, read it
+// back and compare.
+TEST(LibRadosAioEC, RoundTrip2) {
+  AioTestDataEC test_data;
+  rados_completion_t write_done;
+  ASSERT_EQ("", test_data.init());
+  void *cb_arg = static_cast<void*>(&test_data);
+  ASSERT_EQ(0, rados_aio_create_completion(cb_arg,
+              set_completion_completeEC, set_completion_safeEC, &write_done));
+  char payload[128];
+  memset(payload, 0xcc, sizeof(payload));
+  ASSERT_EQ(0, rados_aio_write(test_data.m_ioctx, "foo",
+                               write_done, payload, sizeof(payload), 0));
+  {
+    // Each of the two registered callbacks posts the semaphore once.
+    TestAlarm alarm;
+    sem_wait(&test_data.m_sem);
+    sem_wait(&test_data.m_sem);
+  }
+  ASSERT_EQ(0, rados_aio_get_return_value(write_done));
+
+  // Read back into a buffer of the same size as the payload.
+  char readbuf[128];
+  memset(readbuf, 0, sizeof(readbuf));
+  rados_completion_t read_done;
+  ASSERT_EQ(0, rados_aio_create_completion(cb_arg,
+              set_completion_completeEC, set_completion_safeEC, &read_done));
+  ASSERT_EQ(0, rados_aio_read(test_data.m_ioctx, "foo",
+                              read_done, readbuf, sizeof(readbuf), 0));
+  {
+    TestAlarm alarm;
+    ASSERT_EQ(0, rados_aio_wait_for_complete(read_done));
+  }
+  ASSERT_EQ(static_cast<int>(sizeof(payload)),
+            rados_aio_get_return_value(read_done));
+  ASSERT_EQ(0, memcmp(payload, readbuf, sizeof(payload)));
+  rados_aio_release(write_done);
+  rados_aio_release(read_done);
+}
+
+// C++ API round trip on an EC pool: aio_write() 128 bytes to "foo", wait for
+// two semaphore posts, aio_read() the same length back and verify return
+// value, bufferlist length and contents.
+TEST(LibRadosAioEC, RoundTripPP) {
+  AioTestDataECPP test_data;
+  ASSERT_EQ("", test_data.init());
+  void *cb_arg = static_cast<void*>(&test_data);
+  AioCompletion *null_completion = NULL;
+  AioCompletion *write_done = test_data.m_cluster.aio_create_completion(
+      cb_arg, set_completion_completeECPP, set_completion_safeECPP);
+  ASSERT_NE(write_done, null_completion);
+  char payload[128];
+  memset(payload, 0xcc, sizeof(payload));
+  bufferlist out_bl;
+  out_bl.append(payload, sizeof(payload));
+  ASSERT_EQ(0, test_data.m_ioctx.aio_write("foo", write_done,
+                                           out_bl, sizeof(payload), 0));
+  {
+    // Each of the two registered callbacks posts the semaphore once.
+    TestAlarm alarm;
+    sem_wait(&test_data.m_sem);
+    sem_wait(&test_data.m_sem);
+  }
+  ASSERT_EQ(0, write_done->get_return_value());
+
+  bufferlist in_bl;
+  AioCompletion *read_done = test_data.m_cluster.aio_create_completion(
+      cb_arg, set_completion_completeECPP, set_completion_safeECPP);
+  ASSERT_NE(read_done, null_completion);
+  ASSERT_EQ(0, test_data.m_ioctx.aio_read("foo",
+                              read_done, &in_bl, sizeof(payload), 0));
+  {
+    TestAlarm alarm;
+    ASSERT_EQ(0, read_done->wait_for_complete());
+  }
+  ASSERT_EQ(static_cast<int>(sizeof(payload)), read_done->get_return_value());
+  ASSERT_EQ(sizeof(payload), in_bl.length());
+  ASSERT_EQ(0, memcmp(payload, in_bl.c_str(), sizeof(payload)));
+  delete write_done;
+  delete read_done;
+}
+
+// Like RoundTripPP, except that the read completion is waited on with
+// wait_for_safe() first and wait_for_complete() second before the result is
+// checked.
+TEST(LibRadosAioEC, RoundTripPP2) {
+  AioTestDataECPP test_data;
+  ASSERT_EQ("", test_data.init());
+  void *cb_arg = static_cast<void*>(&test_data);
+  AioCompletion *null_completion = NULL;
+  AioCompletion *write_done = test_data.m_cluster.aio_create_completion(
+      cb_arg, set_completion_completeECPP, set_completion_safeECPP);
+  ASSERT_NE(write_done, null_completion);
+  char payload[128];
+  memset(payload, 0xcc, sizeof(payload));
+  bufferlist out_bl;
+  out_bl.append(payload, sizeof(payload));
+  ASSERT_EQ(0, test_data.m_ioctx.aio_write("foo", write_done,
+                                           out_bl, sizeof(payload), 0));
+  {
+    // Each of the two registered callbacks posts the semaphore once.
+    TestAlarm alarm;
+    sem_wait(&test_data.m_sem);
+    sem_wait(&test_data.m_sem);
+  }
+  ASSERT_EQ(0, write_done->get_return_value());
+
+  bufferlist in_bl;
+  AioCompletion *read_done = test_data.m_cluster.aio_create_completion(
+      cb_arg, set_completion_completeECPP, set_completion_safeECPP);
+  ASSERT_NE(read_done, null_completion);
+  ASSERT_EQ(0, test_data.m_ioctx.aio_read("foo",
+                              read_done, &in_bl, sizeof(payload), 0));
+  {
+    TestAlarm alarm;
+    ASSERT_EQ(0, read_done->wait_for_safe());
+    ASSERT_EQ(0, read_done->wait_for_complete());
+  }
+  ASSERT_EQ(static_cast<int>(sizeof(payload)), read_done->get_return_value());
+  ASSERT_EQ(sizeof(payload), in_bl.length());
+  ASSERT_EQ(0, memcmp(payload, in_bl.c_str(), sizeof(payload)));
+  delete write_done;
+  delete read_done;
+}
+
+// Append behaviour on an EC pool that requires aligned writes:
+//   1. append one full alignment unit          -> return value 0
+//   2. append half an alignment unit           -> return value 0
+//   3. append another half unit                -> expected -EOPNOTSUPP
+//      (presumably because the object size is no longer aligned after
+//      step 2 -- TODO confirm against the EC pool append rules)
+//   4. read with an oversized request and verify that only the bytes from
+//      steps 1 and 2 come back.
+TEST(LibRadosAioEC, RoundTripAppend) {
+  AioTestDataEC test_data;
+  rados_completion_t my_completion, my_completion2, my_completion3, my_completion4;
+  ASSERT_EQ("", test_data.init());
+  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
+	      set_completion_completeEC, set_completion_safeEC, &my_completion));
+  ASSERT_TRUE(rados_ioctx_pool_requires_alignment(test_data.m_ioctx));
+  uint64_t alignment = rados_ioctx_pool_required_alignment(test_data.m_ioctx);
+  ASSERT_NE((unsigned)0, alignment);
+
+  int bsize = alignment;
+  char *buf = (char *)new char[bsize];
+  memset(buf, 0xcc, bsize);
+  ASSERT_EQ(0, rados_aio_append(test_data.m_ioctx, "foo",
+			       my_completion, buf, bsize));
+  {
+    TestAlarm alarm;
+    ASSERT_EQ(0, rados_aio_wait_for_complete(my_completion));
+  }
+  ASSERT_EQ(0, rados_aio_get_return_value(my_completion));
+
+  int hbsize = bsize / 2;
+  char *buf2 = (char *)new char[hbsize];
+  memset(buf2, 0xdd, hbsize);
+  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
+	      set_completion_completeEC, set_completion_safeEC, &my_completion2));
+  ASSERT_EQ(0, rados_aio_append(test_data.m_ioctx, "foo",
+			       my_completion2, buf2, hbsize));
+  {
+    TestAlarm alarm;
+    ASSERT_EQ(0, rados_aio_wait_for_complete(my_completion2));
+  }
+  ASSERT_EQ(0, rados_aio_get_return_value(my_completion2));
+
+  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
+	      set_completion_completeEC, set_completion_safeEC, &my_completion3));
+  ASSERT_EQ(0, rados_aio_append(test_data.m_ioctx, "foo",
+			       my_completion3, buf2, hbsize));
+  {
+    TestAlarm alarm;
+    ASSERT_EQ(0, rados_aio_wait_for_complete(my_completion3));
+  }
+  EXPECT_EQ(-EOPNOTSUPP, rados_aio_get_return_value(my_completion3));
+
+  // tbsize is how much data should be stored; rdsize is how much is asked
+  // for.  The destination buffer must be able to hold the full request, so
+  // it is sized by rdsize rather than by tbsize.
+  int tbsize = bsize + hbsize;
+  int rdsize = bsize * 3;
+  char *buf3 = (char *)new char[rdsize];
+  memset(buf3, 0, rdsize);
+  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
+	      set_completion_completeEC, set_completion_safeEC, &my_completion4));
+  ASSERT_EQ(0, rados_aio_read(test_data.m_ioctx, "foo",
+			      my_completion4, buf3, rdsize, 0));
+  {
+    TestAlarm alarm;
+    ASSERT_EQ(0, rados_aio_wait_for_complete(my_completion4));
+  }
+  ASSERT_EQ(tbsize, rados_aio_get_return_value(my_completion4));
+  ASSERT_EQ(0, memcmp(buf3, buf, bsize));
+  ASSERT_EQ(0, memcmp(buf3 + bsize, buf2, hbsize));
+  rados_aio_release(my_completion);
+  rados_aio_release(my_completion2);
+  rados_aio_release(my_completion3);
+  rados_aio_release(my_completion4);
+  delete[] buf;
+  delete[] buf2;
+  delete[] buf3;
+}
+
+TEST(LibRadosAioEC, RoundTripAppendPP) {  // C++ API: append round trip on a pool that requires alignment
+  AioTestDataECPP test_data;  // NOTE(review): paired below with the ...EC (non-PP) callbacks - confirm intended
+  ASSERT_EQ("", test_data.init());
+  AioCompletion *my_completion = test_data.m_cluster.aio_create_completion(
+	  (void*)&test_data, set_completion_completeEC, set_completion_safeEC);
+  AioCompletion *my_completion_null = NULL;
+  ASSERT_NE(my_completion, my_completion_null);
+  ASSERT_TRUE(test_data.m_ioctx.pool_requires_alignment());
+  uint64_t alignment = test_data.m_ioctx.pool_required_alignment();
+  ASSERT_NE((unsigned)0, alignment);
+  int bsize = alignment;  // first append: exactly one alignment unit
+  char *buf = (char *)new char[bsize];
+  memset(buf, 0xcc, bsize);
+  bufferlist bl1;
+  bl1.append(buf, bsize);
+  ASSERT_EQ(0, test_data.m_ioctx.aio_append("foo", my_completion,
+					    bl1, bsize));
+  {
+    TestAlarm alarm;
+    ASSERT_EQ(0, my_completion->wait_for_complete());
+  }
+  ASSERT_EQ(0, my_completion->get_return_value());
+  int hbsize = bsize / 2;  // second append: half an alignment unit; expected to succeed
+  char *buf2 = (char *)new char[hbsize];
+  memset(buf2, 0xdd, hbsize);
+  bufferlist bl2;
+  bl2.append(buf2, hbsize);
+  AioCompletion *my_completion2 = test_data.m_cluster.aio_create_completion(
+	  (void*)&test_data, set_completion_completeEC, set_completion_safeEC);
+  ASSERT_NE(my_completion2, my_completion_null);
+  ASSERT_EQ(0, test_data.m_ioctx.aio_append("foo", my_completion2,
+					    bl2, hbsize));
+  {
+    TestAlarm alarm;
+    ASSERT_EQ(0, my_completion2->wait_for_complete());
+  }
+  ASSERT_EQ(0, my_completion2->get_return_value());
+  // Third append (same half-unit payload) is submitted fine but must complete with -EOPNOTSUPP.
+  AioCompletion *my_completion3 = test_data.m_cluster.aio_create_completion(
+	  (void*)&test_data, set_completion_completeEC, set_completion_safeEC);
+  ASSERT_NE(my_completion3, my_completion_null);
+  ASSERT_EQ(0, test_data.m_ioctx.aio_append("foo", my_completion3,
+					    bl2, hbsize));
+  {
+    TestAlarm alarm;
+    ASSERT_EQ(0, my_completion3->wait_for_complete());
+  }
+  EXPECT_EQ(-EOPNOTSUPP, my_completion3->get_return_value());
+  // Read back asking for more than was stored; only the two successful appends may come back.
+  bufferlist bl3;
+  AioCompletion *my_completion4 = test_data.m_cluster.aio_create_completion(
+	  (void*)&test_data, set_completion_completeEC, set_completion_safeEC);
+  ASSERT_NE(my_completion4, my_completion_null);
+  ASSERT_EQ(0, test_data.m_ioctx.aio_read("foo",
+			      my_completion4, &bl3, bsize * 3, 0));
+  {
+    TestAlarm alarm;
+    ASSERT_EQ(0, my_completion4->wait_for_complete());
+  }
+  int tbsize = bsize + hbsize;
+  ASSERT_EQ(tbsize, my_completion4->get_return_value());
+  ASSERT_EQ((unsigned)tbsize, bl3.length());
+  ASSERT_EQ(0, memcmp(bl3.c_str(), buf, bsize));
+  ASSERT_EQ(0, memcmp(bl3.c_str() + bsize, buf2, hbsize));
+  delete my_completion;
+  delete my_completion2;
+  delete my_completion3;
+  delete my_completion4;  // previously leaked
+  delete[] buf;
+  delete[] buf2;
+}
+
+TEST(LibRadosAioEC, IsComplete) {  // C API: poll rados_aio_is_complete() until a read finishes
+  AioTestDataEC test_data;
+  rados_completion_t my_completion;
+  ASSERT_EQ("", test_data.init());
+  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
+	      set_completion_completeEC, set_completion_safeEC, &my_completion));
+  char buf[128];
+  memset(buf, 0xcc, sizeof(buf));
+  ASSERT_EQ(0, rados_aio_write(test_data.m_ioctx, "foo",
+			       my_completion, buf, sizeof(buf), 0));
+  {
+    TestAlarm alarm;
+    sem_wait(&test_data.m_sem);  // presumably posted once each by the complete and safe callbacks - confirm
+    sem_wait(&test_data.m_sem);
+  }
+  ASSERT_EQ(0, rados_aio_get_return_value(my_completion));
+  char buf2[128];
+  memset(buf2, 0, sizeof(buf2));
+  rados_completion_t my_completion2;
+  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
+	      set_completion_completeEC, set_completion_safeEC, &my_completion2));
+  ASSERT_EQ(0, rados_aio_read(test_data.m_ioctx, "foo",
+			      my_completion2, buf2, sizeof(buf2), 0));
+  {
+    TestAlarm alarm;
+
+    // Busy-wait until the AIO completes.
+    // Normally we wouldn't do this, but we want to test rados_aio_is_complete.
+    while (true) {
+      int is_complete = rados_aio_is_complete(my_completion2);
+      if (is_complete)
+	break;
+    }
+  }
+  ASSERT_EQ((int)sizeof(buf), rados_aio_get_return_value(my_completion2));  // read returns the byte count
+  ASSERT_EQ(0, memcmp(buf, buf2, sizeof(buf)));
+  rados_aio_release(my_completion);
+  rados_aio_release(my_completion2);
+}
+
+TEST(LibRadosAioEC, IsCompletePP) {  // C++ API: poll AioCompletion::is_complete() until a read finishes
+  AioTestDataECPP test_data;  // NOTE(review): paired below with the ...EC (non-PP) callbacks - confirm intended
+  ASSERT_EQ("", test_data.init());
+  AioCompletion *my_completion = test_data.m_cluster.aio_create_completion(
+	  (void*)&test_data, set_completion_completeEC, set_completion_safeEC);
+  AioCompletion *my_completion_null = NULL;
+  ASSERT_NE(my_completion, my_completion_null);
+  char buf[128];
+  memset(buf, 0xcc, sizeof(buf));
+  bufferlist bl1;
+  bl1.append(buf, sizeof(buf));
+  ASSERT_EQ(0, test_data.m_ioctx.aio_write("foo", my_completion,
+					   bl1, sizeof(buf), 0));
+  {
+    TestAlarm alarm;
+    sem_wait(&test_data.m_sem);  // presumably posted once each by the complete and safe callbacks - confirm
+    sem_wait(&test_data.m_sem);
+  }
+  ASSERT_EQ(0, my_completion->get_return_value());
+  bufferlist bl2;
+  AioCompletion *my_completion2 = test_data.m_cluster.aio_create_completion(
+	  (void*)&test_data, set_completion_completeEC, set_completion_safeEC);
+  ASSERT_NE(my_completion2, my_completion_null);
+  ASSERT_EQ(0, test_data.m_ioctx.aio_read("foo", my_completion2,
+					  &bl2, sizeof(buf), 0));
+  {
+    TestAlarm alarm;
+
+    // Busy-wait until the AIO completes.
+    // Normally we wouldn't do this, but we want to test is_complete.
+    while (true) {
+      int is_complete = my_completion2->is_complete();
+      if (is_complete)
+	break;
+    }
+  }
+  ASSERT_EQ((int)sizeof(buf), my_completion2->get_return_value());  // read returns the byte count
+  ASSERT_EQ(sizeof(buf), bl2.length());
+  ASSERT_EQ(0, memcmp(buf, bl2.c_str(), sizeof(buf)));
+  delete my_completion;
+  delete my_completion2;
+}
+
+TEST(LibRadosAioEC, IsSafe) {  // C API: poll rados_aio_is_safe() until a write is safe, then read it back
+  AioTestDataEC test_data;
+  rados_completion_t my_completion;
+  ASSERT_EQ("", test_data.init());
+  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
+	      set_completion_completeEC, set_completion_safeEC, &my_completion));
+  char buf[128];
+  memset(buf, 0xcc, sizeof(buf));
+  ASSERT_EQ(0, rados_aio_write(test_data.m_ioctx, "foo",
+			       my_completion, buf, sizeof(buf), 0));
+  {
+    TestAlarm alarm;
+
+    // Busy-wait until the AIO completes.
+    // Normally we wouldn't do this, but we want to test rados_aio_is_safe.
+    while (true) {
+      int is_safe = rados_aio_is_safe(my_completion);
+      if (is_safe)
+	break;
+    }
+  }
+  ASSERT_EQ(0, rados_aio_get_return_value(my_completion));
+  char buf2[128];
+  memset(buf2, 0, sizeof(buf2));
+  rados_completion_t my_completion2;
+  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
+	      set_completion_completeEC, set_completion_safeEC, &my_completion2));
+  ASSERT_EQ(0, rados_aio_read(test_data.m_ioctx, "foo",
+			      my_completion2, buf2, sizeof(buf2), 0));
+  {
+    TestAlarm alarm;
+    ASSERT_EQ(0, rados_aio_wait_for_complete(my_completion2));
+  }
+  ASSERT_EQ((int)sizeof(buf), rados_aio_get_return_value(my_completion2));  // read returns the byte count
+  ASSERT_EQ(0, memcmp(buf, buf2, sizeof(buf)));
+  rados_aio_release(my_completion);
+  rados_aio_release(my_completion2);
+}
+
+TEST(LibRadosAioEC, IsSafePP) {  // C++ API: poll AioCompletion::is_safe() until a write is safe, then read it back
+  AioTestDataECPP test_data;  // NOTE(review): paired below with the ...EC (non-PP) callbacks - confirm intended
+  ASSERT_EQ("", test_data.init());
+  AioCompletion *my_completion = test_data.m_cluster.aio_create_completion(
+	  (void*)&test_data, set_completion_completeEC, set_completion_safeEC);
+  AioCompletion *my_completion_null = NULL;
+  ASSERT_NE(my_completion, my_completion_null);
+  char buf[128];
+  memset(buf, 0xcc, sizeof(buf));
+  bufferlist bl1;
+  bl1.append(buf, sizeof(buf));
+  ASSERT_EQ(0, test_data.m_ioctx.aio_write("foo", my_completion,
+					   bl1, sizeof(buf), 0));
+  {
+    TestAlarm alarm;
+
+    // Busy-wait until the write is reported safe.
+    // Normally we wouldn't do this, but we want to test AioCompletion::is_safe.
+    while (true) {
+      int is_safe = my_completion->is_safe();
+      if (is_safe)
+	break;
+    }
+  }
+  ASSERT_EQ(0, my_completion->get_return_value());
+  AioCompletion *my_completion2 = test_data.m_cluster.aio_create_completion(
+	  (void*)&test_data, set_completion_completeEC, set_completion_safeEC);
+  bufferlist bl2;
+  ASSERT_NE(my_completion2, my_completion_null);
+  ASSERT_EQ(0, test_data.m_ioctx.aio_read("foo", my_completion2,
+					  &bl2, sizeof(buf), 0));
+  {
+    TestAlarm alarm;
+    ASSERT_EQ(0, my_completion2->wait_for_complete());
+  }
+  ASSERT_EQ((int)sizeof(buf), my_completion2->get_return_value());  // read returns the byte count
+  ASSERT_EQ(sizeof(buf), bl2.length());
+  ASSERT_EQ(0, memcmp(buf, bl2.c_str(), sizeof(buf)));
+  delete my_completion;
+  delete my_completion2;
+}
+
+TEST(LibRadosAioEC, ReturnValue) {  // C API: an async read of a missing object completes with -ENOENT
+  AioTestDataEC test_data;
+  rados_completion_t my_completion;
+  ASSERT_EQ("", test_data.init());
+  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
+	      set_completion_completeEC, set_completion_safeEC, &my_completion));
+  char buf[128];
+  memset(buf, 0, sizeof(buf));
+  ASSERT_EQ(0, rados_aio_read(test_data.m_ioctx, "nonexistent",
+			       my_completion, buf, sizeof(buf), 0));  // submission itself succeeds
+  {
+    TestAlarm alarm;
+    ASSERT_EQ(0, rados_aio_wait_for_complete(my_completion));
+  }
+  ASSERT_EQ(-ENOENT, rados_aio_get_return_value(my_completion));  // the error surfaces via the completion
+  rados_aio_release(my_completion);
+}
+
+TEST(LibRadosAioEC, ReturnValuePP) {  // C++ API: an async read of a missing object completes with -ENOENT
+  AioTestDataECPP test_data;  // NOTE(review): paired below with the ...EC (non-PP) callbacks - confirm intended
+  ASSERT_EQ("", test_data.init());
+  AioCompletion *my_completion = test_data.m_cluster.aio_create_completion(
+	  (void*)&test_data, set_completion_completeEC, set_completion_safeEC);
+  AioCompletion *my_completion_null = NULL;
+  ASSERT_NE(my_completion, my_completion_null);
+  bufferlist bl1;
+  ASSERT_EQ(0, test_data.m_ioctx.aio_read("nonexistent",
+			       my_completion, &bl1, 128, 0));  // submission itself succeeds
+  {
+    TestAlarm alarm;
+    ASSERT_EQ(0, my_completion->wait_for_complete());
+  }
+  ASSERT_EQ(-ENOENT, my_completion->get_return_value());  // the error surfaces via the completion
+  delete my_completion;
+}
+
+TEST(LibRadosAioEC, Flush) {  // C API: rados_aio_flush() after a write, then read the data back
+  AioTestDataEC test_data;
+  rados_completion_t my_completion;
+  ASSERT_EQ("", test_data.init());
+  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
+	      set_completion_completeEC, set_completion_safeEC, &my_completion));
+  char buf[128];
+  memset(buf, 0xee, sizeof(buf));
+  ASSERT_EQ(0, rados_aio_write(test_data.m_ioctx, "foo",
+			       my_completion, buf, sizeof(buf), 0));
+  ASSERT_EQ(0, rados_aio_flush(test_data.m_ioctx));  // no explicit wait on my_completion; presumably flush blocks until the write is done
+  ASSERT_EQ(0, rados_aio_get_return_value(my_completion));
+  char buf2[128];
+  memset(buf2, 0, sizeof(buf2));
+  rados_completion_t my_completion2;
+  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
+	      set_completion_completeEC, set_completion_safeEC, &my_completion2));
+  ASSERT_EQ(0, rados_aio_read(test_data.m_ioctx, "foo",
+			      my_completion2, buf2, sizeof(buf2), 0));
+  {
+    TestAlarm alarm;
+    ASSERT_EQ(0, rados_aio_wait_for_complete(my_completion2));
+  }
+  ASSERT_EQ((int)sizeof(buf2), rados_aio_get_return_value(my_completion2));  // read returns the byte count
+  ASSERT_EQ(0, memcmp(buf, buf2, sizeof(buf)));
+  rados_aio_release(my_completion);
+  rados_aio_release(my_completion2);
+}
+
+TEST(LibRadosAioEC, FlushPP) {  // C++ API: IoCtx::aio_flush() after a write, then read the data back
+  AioTestDataECPP test_data;  // NOTE(review): paired below with the ...EC (non-PP) callbacks - confirm intended
+  ASSERT_EQ("", test_data.init());
+  AioCompletion *my_completion = test_data.m_cluster.aio_create_completion(
+	  (void*)&test_data, set_completion_completeEC, set_completion_safeEC);
+  AioCompletion *my_completion_null = NULL;
+  ASSERT_NE(my_completion, my_completion_null);
+  char buf[128];
+  memset(buf, 0xee, sizeof(buf));
+  bufferlist bl1;
+  bl1.append(buf, sizeof(buf));
+  ASSERT_EQ(0, test_data.m_ioctx.aio_write("foo", my_completion,
+					   bl1, sizeof(buf), 0));
+  ASSERT_EQ(0, test_data.m_ioctx.aio_flush());  // no explicit wait on my_completion; presumably flush blocks until the write is done
+  ASSERT_EQ(0, my_completion->get_return_value());
+  bufferlist bl2;
+  AioCompletion *my_completion2 = test_data.m_cluster.aio_create_completion(
+	  (void*)&test_data, set_completion_completeEC, set_completion_safeEC);
+  ASSERT_NE(my_completion2, my_completion_null);
+  ASSERT_EQ(0, test_data.m_ioctx.aio_read("foo", my_completion2,
+					  &bl2, sizeof(buf), 0));
+  {
+    TestAlarm alarm;
+    ASSERT_EQ(0, my_completion2->wait_for_complete());
+  }
+  ASSERT_EQ((int)sizeof(buf), my_completion2->get_return_value());  // read returns the byte count
+  ASSERT_EQ(sizeof(buf), bl2.length());
+  ASSERT_EQ(0, memcmp(buf, bl2.c_str(), sizeof(buf)));
+  delete my_completion;
+  delete my_completion2;
+}
+
+TEST(LibRadosAioEC, FlushAsync) {  // C API: once an async flush completes, the earlier write must be complete and safe
+  AioTestDataEC test_data;
+  rados_completion_t my_completion;
+  ASSERT_EQ("", test_data.init());
+  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
+	      set_completion_completeEC, set_completion_safeEC, &my_completion));
+  rados_completion_t flush_completion;
+  ASSERT_EQ(0, rados_aio_create_completion(NULL, NULL, NULL, &flush_completion));  // no callbacks: waited on directly
+  char buf[128];
+  memset(buf, 0xee, sizeof(buf));
+  ASSERT_EQ(0, rados_aio_write(test_data.m_ioctx, "foo",
+			       my_completion, buf, sizeof(buf), 0));
+  ASSERT_EQ(0, rados_aio_flush_async(test_data.m_ioctx, flush_completion));
+  {
+    TestAlarm alarm;
+    ASSERT_EQ(0, rados_aio_wait_for_complete(flush_completion));
+    ASSERT_EQ(0, rados_aio_wait_for_safe(flush_completion));
+  }
+  ASSERT_EQ(1, rados_aio_is_complete(my_completion));  // the write was never waited on explicitly
+  ASSERT_EQ(1, rados_aio_is_safe(my_completion));
+  ASSERT_EQ(1, rados_aio_is_complete(flush_completion));
+  ASSERT_EQ(1, rados_aio_is_safe(flush_completion));
+  ASSERT_EQ(0, rados_aio_get_return_value(my_completion));
+  char buf2[128];
+  memset(buf2, 0, sizeof(buf2));
+  rados_completion_t my_completion2;
+  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
+	      set_completion_completeEC, set_completion_safeEC, &my_completion2));
+  ASSERT_EQ(0, rados_aio_read(test_data.m_ioctx, "foo",
+			      my_completion2, buf2, sizeof(buf2), 0));
+  {
+    TestAlarm alarm;
+    ASSERT_EQ(0, rados_aio_wait_for_complete(my_completion2));
+  }
+  ASSERT_EQ((int)sizeof(buf2), rados_aio_get_return_value(my_completion2));  // read returns the byte count
+  ASSERT_EQ(0, memcmp(buf, buf2, sizeof(buf)));
+  rados_aio_release(my_completion);
+  rados_aio_release(my_completion2);
+  rados_aio_release(flush_completion);
+}
+
+TEST(LibRadosAioEC, FlushAsyncPP) {  // C++ API: once an async flush completes, the earlier write must be complete and safe
+  AioTestDataECPP test_data;  // NOTE(review): paired below with the ...EC (non-PP) callbacks - confirm intended
+  ASSERT_EQ("", test_data.init());
+  AioCompletion *my_completion = test_data.m_cluster.aio_create_completion(
+	  (void*)&test_data, set_completion_completeEC, set_completion_safeEC);
+  AioCompletion *flush_completion =
+      test_data.m_cluster.aio_create_completion(NULL, NULL, NULL);  // no callbacks; NOTE(review): not null-checked
+  AioCompletion *my_completion_null = NULL;
+  ASSERT_NE(my_completion, my_completion_null);
+  char buf[128];
+  memset(buf, 0xee, sizeof(buf));
+  bufferlist bl1;
+  bl1.append(buf, sizeof(buf));
+  ASSERT_EQ(0, test_data.m_ioctx.aio_write("foo", my_completion,
+					   bl1, sizeof(buf), 0));
+  ASSERT_EQ(0, test_data.m_ioctx.aio_flush_async(flush_completion));
+  {
+      TestAlarm alarm;
+      ASSERT_EQ(0, flush_completion->wait_for_complete());
+      ASSERT_EQ(0, flush_completion->wait_for_safe());
+  }
+  ASSERT_EQ(1, my_completion->is_complete());  // the write was never waited on explicitly
+  ASSERT_EQ(1, my_completion->is_safe());
+  ASSERT_EQ(1, flush_completion->is_complete());
+  ASSERT_EQ(1, flush_completion->is_safe());
+  ASSERT_EQ(0, my_completion->get_return_value());
+  bufferlist bl2;
+  AioCompletion *my_completion2 = test_data.m_cluster.aio_create_completion(
+	  (void*)&test_data, set_completion_completeEC, set_completion_safeEC);
+  ASSERT_NE(my_completion2, my_completion_null);
+  ASSERT_EQ(0, test_data.m_ioctx.aio_read("foo", my_completion2,
+					  &bl2, sizeof(buf), 0));
+  {
+    TestAlarm alarm;
+    ASSERT_EQ(0, my_completion2->wait_for_complete());
+  }
+  ASSERT_EQ((int)sizeof(buf), my_completion2->get_return_value());  // read returns the byte count
+  ASSERT_EQ(sizeof(buf), bl2.length());
+  ASSERT_EQ(0, memcmp(buf, bl2.c_str(), sizeof(buf)));
+  delete my_completion;
+  delete my_completion2;
+  delete flush_completion;
+}
+
+TEST(LibRadosAioEC, RoundTripWriteFull) {  // C API: write_full replaces a 128-byte object with 64 bytes
+  AioTestDataEC test_data;
+  rados_completion_t my_completion, my_completion2, my_completion3;
+  ASSERT_EQ("", test_data.init());
+  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
+	      set_completion_completeEC, set_completion_safeEC, &my_completion));
+  char buf[128];
+  memset(buf, 0xcc, sizeof(buf));
+  ASSERT_EQ(0, rados_aio_write(test_data.m_ioctx, "foo",
+			       my_completion, buf, sizeof(buf), 0));
+  {
+    TestAlarm alarm;
+    ASSERT_EQ(0, rados_aio_wait_for_complete(my_completion));
+  }
+  ASSERT_EQ(0, rados_aio_get_return_value(my_completion));
+  char buf2[64];
+  memset(buf2, 0xdd, sizeof(buf2));
+  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
+	      set_completion_completeEC, set_completion_safeEC, &my_completion2));
+  ASSERT_EQ(0, rados_aio_write_full(test_data.m_ioctx, "foo",
+			       my_completion2, buf2, sizeof(buf2)));
+  {
+    TestAlarm alarm;
+    ASSERT_EQ(0, rados_aio_wait_for_complete(my_completion2));
+  }
+  ASSERT_EQ(0, rados_aio_get_return_value(my_completion2));
+  char buf3[sizeof(buf) + sizeof(buf2)];  // larger than either write, so a short read is observable
+  memset(buf3, 0, sizeof(buf3));
+  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
+	      set_completion_completeEC, set_completion_safeEC, &my_completion3));
+  ASSERT_EQ(0, rados_aio_read(test_data.m_ioctx, "foo",
+			      my_completion3, buf3, sizeof(buf3), 0));
+  {
+    TestAlarm alarm;
+    ASSERT_EQ(0, rados_aio_wait_for_complete(my_completion3));
+  }
+  ASSERT_EQ((int)sizeof(buf2), rados_aio_get_return_value(my_completion3));  // only the write_full payload remains
+  ASSERT_EQ(0, memcmp(buf3, buf2, sizeof(buf2)));
+  rados_aio_release(my_completion);
+  rados_aio_release(my_completion2);
+  rados_aio_release(my_completion3);
+}
+
+TEST(LibRadosAioEC, RoundTripWriteFullPP) {  // C++ API: aio_write_full replaces a 128-byte object with 64 bytes
+  AioTestDataECPP test_data;  // NOTE(review): paired below with the ...EC (non-PP) callbacks - confirm intended
+  ASSERT_EQ("", test_data.init());
+  AioCompletion *my_completion = test_data.m_cluster.aio_create_completion(
+	  (void*)&test_data, set_completion_completeEC, set_completion_safeEC);
+  AioCompletion *my_completion_null = NULL;
+  ASSERT_NE(my_completion, my_completion_null);
+  char buf[128];
+  memset(buf, 0xcc, sizeof(buf));
+  bufferlist bl1;
+  bl1.append(buf, sizeof(buf));
+  ASSERT_EQ(0, test_data.m_ioctx.aio_write("foo", my_completion,
+					   bl1, sizeof(buf), 0));
+  {
+    TestAlarm alarm;
+    ASSERT_EQ(0, my_completion->wait_for_complete());
+  }
+  ASSERT_EQ(0, my_completion->get_return_value());
+  char buf2[64];
+  memset(buf2, 0xdd, sizeof(buf2));
+  bufferlist bl2;
+  bl2.append(buf2, sizeof(buf2));
+  AioCompletion *my_completion2 = test_data.m_cluster.aio_create_completion(
+	  (void*)&test_data, set_completion_completeEC, set_completion_safeEC);
+  ASSERT_NE(my_completion2, my_completion_null);
+  ASSERT_EQ(0, test_data.m_ioctx.aio_write_full("foo", my_completion2, bl2));
+  {
+    TestAlarm alarm;
+    ASSERT_EQ(0, my_completion2->wait_for_complete());
+  }
+  ASSERT_EQ(0, my_completion2->get_return_value());
+  bufferlist bl3;
+  AioCompletion *my_completion3 = test_data.m_cluster.aio_create_completion(
+	  (void*)&test_data, set_completion_completeEC, set_completion_safeEC);
+  ASSERT_NE(my_completion3, my_completion_null);
+  ASSERT_EQ(0, test_data.m_ioctx.aio_read("foo", my_completion3,
+					  &bl3, sizeof(buf), 0));  // asks for 128 bytes so a short read is observable
+  {
+    TestAlarm alarm;
+    ASSERT_EQ(0, my_completion3->wait_for_complete());
+  }
+  ASSERT_EQ((int)sizeof(buf2), my_completion3->get_return_value());  // only the write_full payload remains
+  ASSERT_EQ(sizeof(buf2), bl3.length());
+  ASSERT_EQ(0, memcmp(bl3.c_str(), buf2, sizeof(buf2)));
+  delete my_completion;
+  delete my_completion2;
+  delete my_completion3;
+}
+
+
+TEST(LibRadosAioEC, SimpleStat) {  // C API: async stat reports the size of a freshly written object
+  AioTestDataEC test_data;
+  rados_completion_t my_completion;
+  ASSERT_EQ("", test_data.init());
+  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
+	      set_completion_completeEC, set_completion_safeEC, &my_completion));
+  char buf[128];
+  memset(buf, 0xcc, sizeof(buf));
+  ASSERT_EQ(0, rados_aio_write(test_data.m_ioctx, "foo",
+			       my_completion, buf, sizeof(buf), 0));
+  {
+    TestAlarm alarm;
+    sem_wait(&test_data.m_sem);  // presumably posted once each by the complete and safe callbacks - confirm
+    sem_wait(&test_data.m_sem);
+  }
+  ASSERT_EQ(0, rados_aio_get_return_value(my_completion));
+  uint64_t psize;
+  time_t pmtime;  // filled in by the stat but not checked by this test
+  rados_completion_t my_completion2;
+  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
+	      set_completion_completeEC, set_completion_safeEC, &my_completion2));
+  ASSERT_EQ(0, rados_aio_stat(test_data.m_ioctx, "foo",
+			      my_completion2, &psize, &pmtime));
+  {
+    TestAlarm alarm;
+    ASSERT_EQ(0, rados_aio_wait_for_complete(my_completion2));
+  }
+  ASSERT_EQ(0, rados_aio_get_return_value(my_completion2));
+  ASSERT_EQ(sizeof(buf), psize);
+  rados_aio_release(my_completion);
+  rados_aio_release(my_completion2);
+}
+
+TEST(LibRadosAioEC, SimpleStatPP) {  // C++ API: async stat reports the size of a freshly written object
+  AioTestDataECPP test_data;  // NOTE(review): paired below with the ...EC (non-PP) callbacks - confirm intended
+  ASSERT_EQ("", test_data.init());
+  AioCompletion *my_completion = test_data.m_cluster.aio_create_completion(
+	  (void*)&test_data, set_completion_completeEC, set_completion_safeEC);
+  AioCompletion *my_completion_null = NULL;
+  ASSERT_NE(my_completion, my_completion_null);
+  char buf[128];
+  memset(buf, 0xcc, sizeof(buf));
+  bufferlist bl1;
+  bl1.append(buf, sizeof(buf));
+  ASSERT_EQ(0, test_data.m_ioctx.aio_write("foo", my_completion,
+					   bl1, sizeof(buf), 0));
+  {
+    TestAlarm alarm;
+    sem_wait(&test_data.m_sem);  // presumably posted once each by the complete and safe callbacks - confirm
+    sem_wait(&test_data.m_sem);
+  }
+  ASSERT_EQ(0, my_completion->get_return_value());
+  uint64_t psize;
+  time_t pmtime;  // filled in by the stat but not checked by this test
+  AioCompletion *my_completion2 = test_data.m_cluster.aio_create_completion(
+	  (void*)&test_data, set_completion_completeEC, set_completion_safeEC);
+  ASSERT_NE(my_completion2, my_completion_null);
+  ASSERT_EQ(0, test_data.m_ioctx.aio_stat("foo", my_completion2,
+			  		&psize, &pmtime));
+  {
+    TestAlarm alarm;
+    ASSERT_EQ(0, my_completion2->wait_for_complete());
+  }
+  ASSERT_EQ(0, my_completion2->get_return_value());
+  ASSERT_EQ(sizeof(buf), psize);
+  delete my_completion;
+  delete my_completion2;
+}
+
+TEST(LibRadosAioEC, SimpleStatNS) {  // C API: stat sees "foo" separately in the default and "nspace" namespaces
+  AioTestDataEC test_data;
+  rados_completion_t my_completion;
+  ASSERT_EQ("", test_data.init());
+  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
+	      set_completion_completeEC, set_completion_safeEC, &my_completion));
+  char buf[128];
+  memset(buf, 0xcc, sizeof(buf));
+  ASSERT_EQ(0, rados_aio_write(test_data.m_ioctx, "foo",
+			       my_completion, buf, sizeof(buf), 0));
+  {
+    TestAlarm alarm;
+    sem_wait(&test_data.m_sem);  // presumably posted once each by the complete and safe callbacks - confirm
+    sem_wait(&test_data.m_sem);
+  }
+  ASSERT_EQ(0, rados_aio_get_return_value(my_completion));
+  rados_aio_release(my_completion);  // the handle variable is reused below; release first so it is not leaked
+  rados_ioctx_set_namespace(test_data.m_ioctx, "nspace");
+  char buf2[64];
+  memset(buf2, 0xbb, sizeof(buf2));
+  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
+	      set_completion_completeEC, set_completion_safeEC, &my_completion));
+  ASSERT_EQ(0, rados_aio_write(test_data.m_ioctx, "foo",
+			       my_completion, buf2, sizeof(buf2), 0));
+  {
+    TestAlarm alarm;
+    sem_wait(&test_data.m_sem);
+    sem_wait(&test_data.m_sem);
+  }
+  ASSERT_EQ(0, rados_aio_get_return_value(my_completion));
+  uint64_t psize;
+  time_t pmtime;  // filled in by the stats but not checked by this test
+  rados_completion_t my_completion2;
+  rados_ioctx_set_namespace(test_data.m_ioctx, "");  // back to the default namespace
+  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
+	      set_completion_completeEC, set_completion_safeEC, &my_completion2));
+  ASSERT_EQ(0, rados_aio_stat(test_data.m_ioctx, "foo",
+			      my_completion2, &psize, &pmtime));
+  {
+    TestAlarm alarm;
+    ASSERT_EQ(0, rados_aio_wait_for_complete(my_completion2));
+  }
+  ASSERT_EQ(0, rados_aio_get_return_value(my_completion2));
+  ASSERT_EQ(sizeof(buf), psize);
+  // The same object name inside "nspace" must report the size of the second, smaller write.
+  rados_ioctx_set_namespace(test_data.m_ioctx, "nspace");
+  rados_completion_t my_completion3;
+  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
+	      set_completion_completeEC, set_completion_safeEC, &my_completion3));
+  ASSERT_EQ(0, rados_aio_stat(test_data.m_ioctx, "foo",
+			      my_completion3, &psize, &pmtime));
+  {
+    TestAlarm alarm;
+    ASSERT_EQ(0, rados_aio_wait_for_complete(my_completion3));
+  }
+  ASSERT_EQ(0, rados_aio_get_return_value(my_completion3));
+  ASSERT_EQ(sizeof(buf2), psize);
+  rados_aio_release(my_completion);
+  rados_aio_release(my_completion2);
+  rados_aio_release(my_completion3);
+}
+
+TEST(LibRadosAioEC, SimpleStatPPNS) {  // NOTE(review): despite the NS suffix no namespace is ever set here - confirm
+  AioTestDataECPP test_data;  // NOTE(review): paired below with the ...EC (non-PP) callbacks - confirm intended
+  ASSERT_EQ("", test_data.init());
+  AioCompletion *my_completion = test_data.m_cluster.aio_create_completion(
+	  (void*)&test_data, set_completion_completeEC, set_completion_safeEC);
+  AioCompletion *my_completion_null = NULL;
+  ASSERT_NE(my_completion, my_completion_null);
+  char buf[128];
+  memset(buf, 0xcc, sizeof(buf));
+  bufferlist bl1;
+  bl1.append(buf, sizeof(buf));
+  ASSERT_EQ(0, test_data.m_ioctx.aio_write("foo", my_completion,
+					   bl1, sizeof(buf), 0));
+  {
+    TestAlarm alarm;
+    sem_wait(&test_data.m_sem);  // presumably posted once each by the complete and safe callbacks - confirm
+    sem_wait(&test_data.m_sem);
+  }
+  ASSERT_EQ(0, my_completion->get_return_value());
+  uint64_t psize;
+  time_t pmtime;  // filled in by the stat but not checked by this test
+  AioCompletion *my_completion2 = test_data.m_cluster.aio_create_completion(
+	  (void*)&test_data, set_completion_completeEC, set_completion_safeEC);
+  ASSERT_NE(my_completion2, my_completion_null);
+  ASSERT_EQ(0, test_data.m_ioctx.aio_stat("foo", my_completion2,
+			  		&psize, &pmtime));
+  {
+    TestAlarm alarm;
+    ASSERT_EQ(0, my_completion2->wait_for_complete());
+  }
+  ASSERT_EQ(0, my_completion2->get_return_value());
+  ASSERT_EQ(sizeof(buf), psize);  // stat reports the size of the single 128-byte write
+  delete my_completion;
+  delete my_completion2;
+}
+
+TEST(LibRadosAioEC, StatRemove) {  // C API: stat succeeds after a write and yields -ENOENT after an async remove
+  AioTestDataEC test_data;
+  rados_completion_t my_completion;
+  ASSERT_EQ("", test_data.init());
+  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
+	      set_completion_completeEC, set_completion_safeEC, &my_completion));
+  char buf[128];
+  memset(buf, 0xcc, sizeof(buf));
+  ASSERT_EQ(0, rados_aio_write(test_data.m_ioctx, "foo",
+			       my_completion, buf, sizeof(buf), 0));
+  {
+    TestAlarm alarm;
+    sem_wait(&test_data.m_sem);  // presumably posted once each by the complete and safe callbacks - confirm
+    sem_wait(&test_data.m_sem);
+  }
+  ASSERT_EQ(0, rados_aio_get_return_value(my_completion));
+  uint64_t psize;
+  time_t pmtime;  // filled in by the stat but not checked by this test
+  rados_completion_t my_completion2;
+  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
+	      set_completion_completeEC, set_completion_safeEC, &my_completion2));
+  ASSERT_EQ(0, rados_aio_stat(test_data.m_ioctx, "foo",
+			      my_completion2, &psize, &pmtime));
+  {
+    TestAlarm alarm;
+    ASSERT_EQ(0, rados_aio_wait_for_complete(my_completion2));
+  }
+  ASSERT_EQ(0, rados_aio_get_return_value(my_completion2));
+  ASSERT_EQ(sizeof(buf), psize);
+  rados_completion_t my_completion3;
+  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
+	      set_completion_completeEC, set_completion_safeEC, &my_completion3));
+  ASSERT_EQ(0, rados_aio_remove(test_data.m_ioctx, "foo", my_completion3));
+  {
+    TestAlarm alarm;
+    ASSERT_EQ(0, rados_aio_wait_for_complete(my_completion3));
+  }
+  ASSERT_EQ(0, rados_aio_get_return_value(my_completion3));
+  uint64_t psize2;  // separate outputs for the post-remove stat; their values are never inspected
+  time_t pmtime2;
+  rados_completion_t my_completion4;
+  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
+	      set_completion_completeEC, set_completion_safeEC, &my_completion4));
+  ASSERT_EQ(0, rados_aio_stat(test_data.m_ioctx, "foo",
+			      my_completion4, &psize2, &pmtime2));
+  {
+    TestAlarm alarm;
+    ASSERT_EQ(0, rados_aio_wait_for_complete(my_completion4));
+  }
+  ASSERT_EQ(-ENOENT, rados_aio_get_return_value(my_completion4));  // the object is gone
+  rados_aio_release(my_completion);
+  rados_aio_release(my_completion2);
+  rados_aio_release(my_completion3);
+  rados_aio_release(my_completion4);
+}
+
+TEST(LibRadosAioEC, StatRemovePP) {  // C++ API: stat succeeds after a write and yields -ENOENT after an async remove
+  AioTestDataECPP test_data;  // NOTE(review): paired below with the ...EC (non-PP) callbacks - confirm intended
+  ASSERT_EQ("", test_data.init());
+  AioCompletion *my_completion = test_data.m_cluster.aio_create_completion(
+	  (void*)&test_data, set_completion_completeEC, set_completion_safeEC);
+  AioCompletion *my_completion_null = NULL;
+  ASSERT_NE(my_completion, my_completion_null);
+  char buf[128];
+  memset(buf, 0xcc, sizeof(buf));
+  bufferlist bl1;
+  bl1.append(buf, sizeof(buf));
+  ASSERT_EQ(0, test_data.m_ioctx.aio_write("foo", my_completion,
+					   bl1, sizeof(buf), 0));
+  {
+    TestAlarm alarm;
+    sem_wait(&test_data.m_sem);  // presumably posted once each by the complete and safe callbacks - confirm
+    sem_wait(&test_data.m_sem);
+  }
+  ASSERT_EQ(0, my_completion->get_return_value());
+  uint64_t psize;
+  time_t pmtime;  // filled in by the stat but not checked by this test
+  AioCompletion *my_completion2 = test_data.m_cluster.aio_create_completion(
+	  (void*)&test_data, set_completion_completeEC, set_completion_safeEC);
+  ASSERT_NE(my_completion2, my_completion_null);
+  ASSERT_EQ(0, test_data.m_ioctx.aio_stat("foo", my_completion2,
+			  		&psize, &pmtime));
+  {
+    TestAlarm alarm;
+    ASSERT_EQ(0, my_completion2->wait_for_complete());
+  }
+  ASSERT_EQ(0, my_completion2->get_return_value());
+  ASSERT_EQ(sizeof(buf), psize);
+  uint64_t psize2;  // separate outputs for the post-remove stat; their values are never inspected
+  time_t pmtime2;
+  AioCompletion *my_completion3 = test_data.m_cluster.aio_create_completion(
+	  (void*)&test_data, set_completion_completeEC, set_completion_safeEC);
+  ASSERT_NE(my_completion3, my_completion_null);
+  ASSERT_EQ(0, test_data.m_ioctx.aio_remove("foo", my_completion3));
+  {
+    TestAlarm alarm;
+    ASSERT_EQ(0, my_completion3->wait_for_complete());
+  }
+  ASSERT_EQ(0, my_completion3->get_return_value());
+
+  AioCompletion *my_completion4 = test_data.m_cluster.aio_create_completion(
+	  (void*)&test_data, set_completion_completeEC, set_completion_safeEC);
+  ASSERT_NE(my_completion4, my_completion_null);
+  ASSERT_EQ(0, test_data.m_ioctx.aio_stat("foo", my_completion4,
+			  		&psize2, &pmtime2));
+  {
+    TestAlarm alarm;
+    ASSERT_EQ(0, my_completion4->wait_for_complete());
+  }
+  ASSERT_EQ(-ENOENT, my_completion4->get_return_value());  // the object is gone
+  delete my_completion;
+  delete my_completion2;
+  delete my_completion3;
+  delete my_completion4;
+}
+
+TEST(LibRadosAioEC, OmapPP) {  // C++ API: omap writes on an EC pool must complete with -EOPNOTSUPP
+  Rados cluster;
+  std::string pool_name = get_temp_pool_name();
+  ASSERT_EQ("", create_one_ec_pool_pp(pool_name, cluster));
+  IoCtx ioctx;
+  ASSERT_EQ(0, cluster.ioctx_create(pool_name.c_str(), ioctx));  // fail fast instead of using a dead ioctx
+
+  string header_str = "baz";
+  bufferptr bp(header_str.c_str(), header_str.size() + 1);  // +1 keeps the terminating NUL
+  bufferlist header_to_set;
+  header_to_set.push_back(bp);
+  map<string, bufferlist> to_set;
+  {
+    boost::scoped_ptr<AioCompletion> my_completion(cluster.aio_create_completion(0, 0, 0));
+    ObjectWriteOperation op;
+    to_set["foo"] = header_to_set;
+    to_set["foo2"] = header_to_set;
+    to_set["qfoo3"] = header_to_set;
+    op.omap_set(to_set);
+
+    op.omap_set_header(header_to_set);
+    // Submission must succeed; a failed submit would leave the wait below to the TestAlarm.
+    ASSERT_EQ(0, ioctx.aio_operate("test_obj", my_completion.get(), &op));
+    {
+      TestAlarm alarm;
+      ASSERT_EQ(0, my_completion->wait_for_complete());
+    }
+    EXPECT_EQ(-EOPNOTSUPP, my_completion->get_return_value());  // the rejection arrives via the completion
+  }
+  ioctx.remove("test_obj");  // best-effort cleanup; result intentionally ignored
+  destroy_one_pool_pp(pool_name, cluster);
+}
+
+TEST(LibRadosAioEC, MultiWrite) {  // C API: a second write at a non-zero offset is rejected; the first write survives
+  AioTestDataEC test_data;
+  rados_completion_t my_completion, my_completion2, my_completion3;
+  ASSERT_EQ("", test_data.init());
+  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
+	      set_completion_completeEC, set_completion_safeEC, &my_completion));
+  char buf[128];
+  memset(buf, 0xcc, sizeof(buf));
+  ASSERT_EQ(0, rados_aio_write(test_data.m_ioctx, "foo",
+			       my_completion, buf, sizeof(buf), 0));
+  {
+    TestAlarm alarm;
+    ASSERT_EQ(0, rados_aio_wait_for_complete(my_completion));
+  }
+  ASSERT_EQ(0, rados_aio_get_return_value(my_completion));
+
+  char buf2[64];
+  memset(buf2, 0xdd, sizeof(buf2));
+  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
+	      set_completion_completeEC, set_completion_safeEC, &my_completion2));
+  ASSERT_EQ(0, rados_aio_write(test_data.m_ioctx, "foo",
+			       my_completion2, buf2, sizeof(buf2), sizeof(buf)));  // offset = end of the first write
+  {
+    TestAlarm alarm;
+    ASSERT_EQ(0, rados_aio_wait_for_complete(my_completion2));
+  }
+  ASSERT_EQ(-EOPNOTSUPP, rados_aio_get_return_value(my_completion2));
+
+  char buf3[(sizeof(buf) + sizeof(buf2)) * 3];  // deliberately larger than anything written
+  memset(buf3, 0, sizeof(buf3));
+  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
+	      set_completion_completeEC, set_completion_safeEC, &my_completion3));
+  ASSERT_EQ(0, rados_aio_read(test_data.m_ioctx, "foo",
+			      my_completion3, buf3, sizeof(buf3), 0));
+  {
+    TestAlarm alarm;
+    ASSERT_EQ(0, rados_aio_wait_for_complete(my_completion3));
+  }
+  ASSERT_EQ((int)sizeof(buf), rados_aio_get_return_value(my_completion3));  // only the first write is stored
+  ASSERT_EQ(0, memcmp(buf3, buf, sizeof(buf)));
+  rados_aio_release(my_completion);
+  rados_aio_release(my_completion2);
+  rados_aio_release(my_completion3);
+}
+
+TEST(LibRadosAioEC, MultiWritePP) {  // C++ API: a second write at a non-zero offset is rejected; the first write survives
+  AioTestDataECPP test_data;  // NOTE(review): paired below with the ...EC (non-PP) callbacks - confirm intended
+  ASSERT_EQ("", test_data.init());
+  AioCompletion *my_completion = test_data.m_cluster.aio_create_completion(
+	  (void*)&test_data, set_completion_completeEC, set_completion_safeEC);
+  AioCompletion *my_completion_null = NULL;
+  ASSERT_NE(my_completion, my_completion_null);
+  char buf[128];
+  memset(buf, 0xcc, sizeof(buf));
+  bufferlist bl1;
+  bl1.append(buf, sizeof(buf));
+  ASSERT_EQ(0, test_data.m_ioctx.aio_write("foo", my_completion,
+					   bl1, sizeof(buf), 0));
+  {
+    TestAlarm alarm;
+    ASSERT_EQ(0, my_completion->wait_for_complete());
+  }
+  ASSERT_EQ(0, my_completion->get_return_value());
+
+  char buf2[64];
+  memset(buf2, 0xdd, sizeof(buf2));
+  bufferlist bl2;
+  bl2.append(buf2, sizeof(buf2));
+  AioCompletion *my_completion2 = test_data.m_cluster.aio_create_completion(
+	  (void*)&test_data, set_completion_completeEC, set_completion_safeEC);
+  ASSERT_NE(my_completion2, my_completion_null);
+  ASSERT_EQ(0, test_data.m_ioctx.aio_write("foo", my_completion2,
+					   bl2, sizeof(buf2), sizeof(buf)));  // offset = end of the first write
+  {
+    TestAlarm alarm;
+    ASSERT_EQ(0, my_completion2->wait_for_complete());
+  }
+  ASSERT_EQ(-EOPNOTSUPP, my_completion2->get_return_value());
+
+  bufferlist bl3;
+  AioCompletion *my_completion3 = test_data.m_cluster.aio_create_completion(
+	  (void*)&test_data, set_completion_completeEC, set_completion_safeEC);
+  ASSERT_NE(my_completion3, my_completion_null);
+  ASSERT_EQ(0, test_data.m_ioctx.aio_read("foo", my_completion3,
+					  &bl3, (sizeof(buf) + sizeof(buf2)) * 3, 0));  // same over-sized length as the C test
+  {
+    TestAlarm alarm;
+    ASSERT_EQ(0, my_completion3->wait_for_complete());
+  }
+  ASSERT_EQ((int)sizeof(buf), my_completion3->get_return_value());  // only the first write is stored
+  ASSERT_EQ(sizeof(buf), bl3.length());
+  ASSERT_EQ(0, memcmp(bl3.c_str(), buf, sizeof(buf)));
+  delete my_completion;
+  delete my_completion2;
+  delete my_completion3;
+}
diff --git a/src/test/librados/c_read_operations.cc b/src/test/librados/c_read_operations.cc
index 106f6b4..3ca31f4 100644
--- a/src/test/librados/c_read_operations.cc
+++ b/src/test/librados/c_read_operations.cc
@@ -16,7 +16,7 @@ class CReadOpsTest : public RadosTest {
 protected:
   void write_object() {
     // Create an object and write to it
-    ASSERT_EQ(len, rados_write(ioctx, obj, data, len, 0));
+    ASSERT_EQ(0, rados_write(ioctx, obj, data, len, 0));
   }
   void remove_object() {
     ASSERT_EQ(0, rados_remove(ioctx, obj));
diff --git a/src/test/librados/io.cc b/src/test/librados/io.cc
index ebad7c7..5daca3c 100644
--- a/src/test/librados/io.cc
+++ b/src/test/librados/io.cc
@@ -20,9 +20,9 @@ typedef RadosTestECPP LibRadosIoECPP;
 TEST_F(LibRadosIo, SimpleWrite) {
   char buf[128];
   memset(buf, 0xcc, sizeof(buf));
-  ASSERT_EQ((int)sizeof(buf), rados_write(ioctx, "foo", buf, sizeof(buf), 0));
+  ASSERT_EQ(0, rados_write(ioctx, "foo", buf, sizeof(buf), 0));
   rados_ioctx_set_namespace(ioctx, "nspace");
-  ASSERT_EQ((int)sizeof(buf), rados_write(ioctx, "foo", buf, sizeof(buf), 0));
+  ASSERT_EQ(0, rados_write(ioctx, "foo", buf, sizeof(buf), 0));
 }
 
 TEST_F(LibRadosIoPP, SimpleWritePP) {
@@ -30,9 +30,9 @@ TEST_F(LibRadosIoPP, SimpleWritePP) {
   memset(buf, 0xcc, sizeof(buf));
   bufferlist bl;
   bl.append(buf, sizeof(buf));
-  ASSERT_EQ((int)sizeof(buf), ioctx.write("foo", bl, sizeof(buf), 0));
+  ASSERT_EQ(0, ioctx.write("foo", bl, sizeof(buf), 0));
   ioctx.set_namespace("nspace");
-  ASSERT_EQ((int)sizeof(buf), ioctx.write("foo", bl, sizeof(buf), 0));
+  ASSERT_EQ(0, ioctx.write("foo", bl, sizeof(buf), 0));
 }
 
 TEST_F(LibRadosIoPP, ReadOpPP) {
@@ -40,7 +40,7 @@ TEST_F(LibRadosIoPP, ReadOpPP) {
   memset(buf, 0xcc, sizeof(buf));
   bufferlist bl;
   bl.append(buf, sizeof(buf));
-  ASSERT_EQ((int)sizeof(buf), ioctx.write("foo", bl, sizeof(buf), 0));
+  ASSERT_EQ(0, ioctx.write("foo", bl, sizeof(buf), 0));
 
   {
       bufferlist op_bl;
@@ -161,7 +161,7 @@ TEST_F(LibRadosIo, RoundTrip) {
   char buf[128];
   char buf2[128];
   memset(buf, 0xcc, sizeof(buf));
-  ASSERT_EQ((int)sizeof(buf), rados_write(ioctx, "foo", buf, sizeof(buf), 0));
+  ASSERT_EQ(0, rados_write(ioctx, "foo", buf, sizeof(buf), 0));
   memset(buf2, 0, sizeof(buf2));
   ASSERT_EQ((int)sizeof(buf2), rados_read(ioctx, "foo", buf2, sizeof(buf2), 0));
   ASSERT_EQ(0, memcmp(buf, buf2, sizeof(buf)));
@@ -169,14 +169,13 @@ TEST_F(LibRadosIo, RoundTrip) {
 
 TEST_F(LibRadosIoPP, RoundTripPP) {
   char buf[128];
-  char buf2[128];
   Rados cluster;
   memset(buf, 0xcc, sizeof(buf));
   bufferlist bl;
   bl.append(buf, sizeof(buf));
-  ASSERT_EQ((int)sizeof(buf), ioctx.write("foo", bl, sizeof(buf), 0));
+  ASSERT_EQ(0, ioctx.write("foo", bl, sizeof(buf), 0));
   bufferlist cl;
-  ASSERT_EQ((int)sizeof(buf2), ioctx.read("foo", cl, sizeof(buf), 0));
+  ASSERT_EQ((int)sizeof(buf), ioctx.read("foo", cl, sizeof(buf), 0));
   ASSERT_EQ(0, memcmp(buf, cl.c_str(), sizeof(buf)));
 }
 
@@ -185,9 +184,9 @@ TEST_F(LibRadosIo, OverlappingWriteRoundTrip) {
   char buf2[64];
   char buf3[128];
   memset(buf, 0xcc, sizeof(buf));
-  ASSERT_EQ((int)sizeof(buf), rados_write(ioctx, "foo", buf, sizeof(buf), 0));
+  ASSERT_EQ(0, rados_write(ioctx, "foo", buf, sizeof(buf), 0));
   memset(buf2, 0xdd, sizeof(buf2));
-  ASSERT_EQ((int)sizeof(buf2), rados_write(ioctx, "foo", buf2, sizeof(buf2), 0));
+  ASSERT_EQ(0, rados_write(ioctx, "foo", buf2, sizeof(buf2), 0));
   memset(buf3, 0xdd, sizeof(buf3));
   ASSERT_EQ((int)sizeof(buf3), rados_read(ioctx, "foo", buf3, sizeof(buf3), 0));
   ASSERT_EQ(0, memcmp(buf3, buf2, sizeof(buf2)));
@@ -200,11 +199,11 @@ TEST_F(LibRadosIoPP, OverlappingWriteRoundTripPP) {
   memset(buf, 0xcc, sizeof(buf));
   bufferlist bl1;
   bl1.append(buf, sizeof(buf));
-  ASSERT_EQ((int)sizeof(buf), ioctx.write("foo", bl1, sizeof(buf), 0));
+  ASSERT_EQ(0, ioctx.write("foo", bl1, sizeof(buf), 0));
   memset(buf2, 0xdd, sizeof(buf2));
   bufferlist bl2;
   bl2.append(buf2, sizeof(buf2));
-  ASSERT_EQ((int)sizeof(buf2), ioctx.write("foo", bl2, sizeof(buf2), 0));
+  ASSERT_EQ(0, ioctx.write("foo", bl2, sizeof(buf2), 0));
   bufferlist bl3;
   ASSERT_EQ((int)sizeof(buf), ioctx.read("foo", bl3, sizeof(buf), 0));
   ASSERT_EQ(0, memcmp(bl3.c_str(), buf2, sizeof(buf2)));
@@ -216,7 +215,7 @@ TEST_F(LibRadosIo, WriteFullRoundTrip) {
   char buf2[64];
   char buf3[128];
   memset(buf, 0xcc, sizeof(buf));
-  ASSERT_EQ((int)sizeof(buf), rados_write(ioctx, "foo", buf, sizeof(buf), 0));
+  ASSERT_EQ(0, rados_write(ioctx, "foo", buf, sizeof(buf), 0));
   memset(buf2, 0xdd, sizeof(buf2));
   ASSERT_EQ(0, rados_write_full(ioctx, "foo", buf2, sizeof(buf2)));
   memset(buf3, 0xdd, sizeof(buf3));
@@ -230,7 +229,7 @@ TEST_F(LibRadosIoPP, WriteFullRoundTripPP) {
   memset(buf, 0xcc, sizeof(buf));
   bufferlist bl1;
   bl1.append(buf, sizeof(buf));
-  ASSERT_EQ((int)sizeof(buf), ioctx.write("foo", bl1, sizeof(buf), 0));
+  ASSERT_EQ(0, ioctx.write("foo", bl1, sizeof(buf), 0));
   memset(buf2, 0xdd, sizeof(buf2));
   bufferlist bl2;
   bl2.append(buf2, sizeof(buf2));
@@ -245,9 +244,9 @@ TEST_F(LibRadosIo, AppendRoundTrip) {
   char buf2[64];
   char buf3[sizeof(buf) + sizeof(buf2)];
   memset(buf, 0xde, sizeof(buf));
-  ASSERT_EQ((int)sizeof(buf), rados_append(ioctx, "foo", buf, sizeof(buf)));
+  ASSERT_EQ(0, rados_append(ioctx, "foo", buf, sizeof(buf)));
   memset(buf2, 0xad, sizeof(buf2));
-  ASSERT_EQ((int)sizeof(buf2), rados_append(ioctx, "foo", buf2, sizeof(buf2)));
+  ASSERT_EQ(0, rados_append(ioctx, "foo", buf2, sizeof(buf2)));
   memset(buf3, 0, sizeof(buf3));
   ASSERT_EQ((int)sizeof(buf3), rados_read(ioctx, "foo", buf3, sizeof(buf3), 0));
   ASSERT_EQ(0, memcmp(buf3, buf, sizeof(buf)));
@@ -260,11 +259,11 @@ TEST_F(LibRadosIoPP, AppendRoundTripPP) {
   memset(buf, 0xde, sizeof(buf));
   bufferlist bl1;
   bl1.append(buf, sizeof(buf));
-  ASSERT_EQ((int)sizeof(buf), ioctx.append("foo", bl1, sizeof(buf)));
+  ASSERT_EQ(0, ioctx.append("foo", bl1, sizeof(buf)));
   memset(buf2, 0xad, sizeof(buf2));
   bufferlist bl2;
   bl2.append(buf2, sizeof(buf2));
-  ASSERT_EQ((int)sizeof(buf2), ioctx.append("foo", bl2, sizeof(buf2)));
+  ASSERT_EQ(0, ioctx.append("foo", bl2, sizeof(buf2)));
   bufferlist bl3;
   ASSERT_EQ((int)(sizeof(buf) + sizeof(buf2)),
 	    ioctx.read("foo", bl3, (sizeof(buf) + sizeof(buf2)), 0));
@@ -277,7 +276,7 @@ TEST_F(LibRadosIo, TruncTest) {
   char buf[128];
   char buf2[sizeof(buf)];
   memset(buf, 0xaa, sizeof(buf));
-  ASSERT_EQ((int)sizeof(buf), rados_append(ioctx, "foo", buf, sizeof(buf)));
+  ASSERT_EQ(0, rados_append(ioctx, "foo", buf, sizeof(buf)));
   ASSERT_EQ(0, rados_trunc(ioctx, "foo", sizeof(buf) / 2));
   memset(buf2, 0, sizeof(buf2));
   ASSERT_EQ((int)(sizeof(buf)/2), rados_read(ioctx, "foo", buf2, sizeof(buf2), 0));
@@ -289,7 +288,7 @@ TEST_F(LibRadosIoPP, TruncTestPP) {
   memset(buf, 0xaa, sizeof(buf));
   bufferlist bl;
   bl.append(buf, sizeof(buf));
-  ASSERT_EQ((int)sizeof(buf), ioctx.append("foo", bl, sizeof(buf)));
+  ASSERT_EQ(0, ioctx.append("foo", bl, sizeof(buf)));
   ASSERT_EQ(0, ioctx.trunc("foo", sizeof(buf) / 2));
   bufferlist bl2;
   ASSERT_EQ((int)(sizeof(buf)/2), ioctx.read("foo", bl2, sizeof(buf), 0));
@@ -300,7 +299,7 @@ TEST_F(LibRadosIo, RemoveTest) {
   char buf[128];
   char buf2[sizeof(buf)];
   memset(buf, 0xaa, sizeof(buf));
-  ASSERT_EQ((int)sizeof(buf), rados_append(ioctx, "foo", buf, sizeof(buf)));
+  ASSERT_EQ(0, rados_append(ioctx, "foo", buf, sizeof(buf)));
   ASSERT_EQ(0, rados_remove(ioctx, "foo"));
   memset(buf2, 0, sizeof(buf2));
   ASSERT_EQ(-ENOENT, rados_read(ioctx, "foo", buf2, sizeof(buf2), 0));
@@ -311,7 +310,7 @@ TEST_F(LibRadosIoPP, RemoveTestPP) {
   memset(buf, 0xaa, sizeof(buf));
   bufferlist bl1;
   bl1.append(buf, sizeof(buf));
-  ASSERT_EQ((int)sizeof(buf), ioctx.append("foo", bl1, sizeof(buf)));
+  ASSERT_EQ(0, ioctx.append("foo", bl1, sizeof(buf)));
   ASSERT_EQ(0, ioctx.remove("foo"));
   bufferlist bl2;
   ASSERT_EQ(-ENOENT, ioctx.read("foo", bl2, sizeof(buf), 0));
@@ -322,7 +321,7 @@ TEST_F(LibRadosIo, XattrsRoundTrip) {
   char attr1[] = "attr1";
   char attr1_buf[] = "foo bar baz";
   memset(buf, 0xaa, sizeof(buf));
-  ASSERT_EQ((int)sizeof(buf), rados_append(ioctx, "foo", buf, sizeof(buf)));
+  ASSERT_EQ(0, rados_append(ioctx, "foo", buf, sizeof(buf)));
   ASSERT_EQ(-ENODATA, rados_getxattr(ioctx, "foo", attr1, buf, sizeof(buf)));
   ASSERT_EQ(0, rados_setxattr(ioctx, "foo", attr1, attr1_buf, sizeof(attr1_buf)));
   ASSERT_EQ((int)sizeof(attr1_buf),
@@ -337,7 +336,7 @@ TEST_F(LibRadosIoPP, XattrsRoundTripPP) {
   memset(buf, 0xaa, sizeof(buf));
   bufferlist bl1;
   bl1.append(buf, sizeof(buf));
-  ASSERT_EQ((int)sizeof(buf), ioctx.append("foo", bl1, sizeof(buf)));
+  ASSERT_EQ(0, ioctx.append("foo", bl1, sizeof(buf)));
   bufferlist bl2;
   ASSERT_EQ(-ENODATA, ioctx.getxattr("foo", attr1, bl2));
   bufferlist bl3;
@@ -354,7 +353,7 @@ TEST_F(LibRadosIo, RmXattr) {
   char attr1[] = "attr1";
   char attr1_buf[] = "foo bar baz";
   memset(buf, 0xaa, sizeof(buf));
-  ASSERT_EQ((int)sizeof(buf), rados_append(ioctx, "foo", buf, sizeof(buf)));
+  ASSERT_EQ(0, rados_append(ioctx, "foo", buf, sizeof(buf)));
   ASSERT_EQ(0,
       rados_setxattr(ioctx, "foo", attr1, attr1_buf, sizeof(attr1_buf)));
   ASSERT_EQ(0, rados_rmxattr(ioctx, "foo", attr1));
@@ -368,7 +367,7 @@ TEST_F(LibRadosIoPP, RmXattrPP) {
   memset(buf, 0xaa, sizeof(buf));
   bufferlist bl1;
   bl1.append(buf, sizeof(buf));
-  ASSERT_EQ((int)sizeof(buf), ioctx.append("foo", bl1, sizeof(buf)));
+  ASSERT_EQ(0, ioctx.append("foo", bl1, sizeof(buf)));
   bufferlist bl2;
   bl2.append(attr1_buf, sizeof(attr1_buf));
   ASSERT_EQ(0, ioctx.setxattr("foo", attr1, bl2));
@@ -387,7 +386,7 @@ TEST_F(LibRadosIo, XattrIter) {
     attr2_buf[j] = j % 0xff;
   }
   memset(buf, 0xaa, sizeof(buf));
-  ASSERT_EQ((int)sizeof(buf), rados_append(ioctx, "foo", buf, sizeof(buf)));
+  ASSERT_EQ(0, rados_append(ioctx, "foo", buf, sizeof(buf)));
   ASSERT_EQ(0, rados_setxattr(ioctx, "foo", attr1, attr1_buf, sizeof(attr1_buf)));
   ASSERT_EQ(0, rados_setxattr(ioctx, "foo", attr2, attr2_buf, sizeof(attr2_buf)));
   rados_xattrs_iter_t iter;
@@ -429,7 +428,7 @@ TEST_F(LibRadosIoPP, XattrListPP) {
   memset(buf, 0xaa, sizeof(buf));
   bufferlist bl1;
   bl1.append(buf, sizeof(buf));
-  ASSERT_EQ((int)sizeof(buf), ioctx.append("foo", bl1, sizeof(buf)));
+  ASSERT_EQ(0, ioctx.append("foo", bl1, sizeof(buf)));
   bufferlist bl2;
   bl2.append(attr1_buf, sizeof(attr1_buf));
   ASSERT_EQ(0, ioctx.setxattr("foo", attr1, bl2));
@@ -455,9 +454,9 @@ TEST_F(LibRadosIoPP, XattrListPP) {
 TEST_F(LibRadosIoEC, SimpleWrite) {
   char buf[128];
   memset(buf, 0xcc, sizeof(buf));
-  ASSERT_EQ((int)sizeof(buf), rados_write(ioctx, "foo", buf, sizeof(buf), 0));
+  ASSERT_EQ(0, rados_write(ioctx, "foo", buf, sizeof(buf), 0));
   rados_ioctx_set_namespace(ioctx, "nspace");
-  ASSERT_EQ((int)sizeof(buf), rados_write(ioctx, "foo", buf, sizeof(buf), 0));
+  ASSERT_EQ(0, rados_write(ioctx, "foo", buf, sizeof(buf), 0));
 }
 
 TEST_F(LibRadosIoECPP, SimpleWritePP) {
@@ -465,9 +464,9 @@ TEST_F(LibRadosIoECPP, SimpleWritePP) {
   memset(buf, 0xcc, sizeof(buf));
   bufferlist bl;
   bl.append(buf, sizeof(buf));
-  ASSERT_EQ((int)sizeof(buf), ioctx.write("foo", bl, sizeof(buf), 0));
+  ASSERT_EQ(0, ioctx.write("foo", bl, sizeof(buf), 0));
   ioctx.set_namespace("nspace");
-  ASSERT_EQ((int)sizeof(buf), ioctx.write("foo", bl, sizeof(buf), 0));
+  ASSERT_EQ(0, ioctx.write("foo", bl, sizeof(buf), 0));
 }
 
 TEST_F(LibRadosIoECPP, ReadOpPP) {
@@ -475,7 +474,7 @@ TEST_F(LibRadosIoECPP, ReadOpPP) {
   memset(buf, 0xcc, sizeof(buf));
   bufferlist bl;
   bl.append(buf, sizeof(buf));
-  ASSERT_EQ((int)sizeof(buf), ioctx.write("foo", bl, sizeof(buf), 0));
+  ASSERT_EQ(0, ioctx.write("foo", bl, sizeof(buf), 0));
 
   {
       bufferlist op_bl;
@@ -596,7 +595,7 @@ TEST_F(LibRadosIoEC, RoundTrip) {
   char buf[128];
   char buf2[128];
   memset(buf, 0xcc, sizeof(buf));
-  ASSERT_EQ((int)sizeof(buf), rados_write(ioctx, "foo", buf, sizeof(buf), 0));
+  ASSERT_EQ(0, rados_write(ioctx, "foo", buf, sizeof(buf), 0));
   memset(buf2, 0, sizeof(buf2));
   ASSERT_EQ((int)sizeof(buf2), rados_read(ioctx, "foo", buf2, sizeof(buf2), 0));
   ASSERT_EQ(0, memcmp(buf, buf2, sizeof(buf)));
@@ -604,14 +603,13 @@ TEST_F(LibRadosIoEC, RoundTrip) {
 
 TEST_F(LibRadosIoECPP, RoundTripPP) {
   char buf[128];
-  char buf2[128];
   Rados cluster;
   memset(buf, 0xcc, sizeof(buf));
   bufferlist bl;
   bl.append(buf, sizeof(buf));
-  ASSERT_EQ((int)sizeof(buf), ioctx.write("foo", bl, sizeof(buf), 0));
+  ASSERT_EQ(0, ioctx.write("foo", bl, sizeof(buf), 0));
   bufferlist cl;
-  ASSERT_EQ((int)sizeof(buf2), ioctx.read("foo", cl, sizeof(buf), 0));
+  ASSERT_EQ((int)sizeof(buf), ioctx.read("foo", cl, sizeof(buf) * 3, 0));
   ASSERT_EQ(0, memcmp(buf, cl.c_str(), sizeof(buf)));
 }
 
@@ -622,10 +620,10 @@ TEST_F(LibRadosIoEC, OverlappingWriteRoundTrip) {
   char *buf2 = (char *)new char[bsize];
   char *buf3 = (char *)new char[dbsize];
   memset(buf, 0xcc, dbsize);
-  ASSERT_EQ(dbsize, rados_write(ioctx, "foo", buf, dbsize, 0));
-  memset(buf2, 0xdd, sizeof(buf2));
+  ASSERT_EQ(0, rados_write(ioctx, "foo", buf, dbsize, 0));
+  memset(buf2, 0xdd, bsize);
   ASSERT_EQ(-EOPNOTSUPP, rados_write(ioctx, "foo", buf2, bsize, 0));
-  memset(buf3, 0xdd, sizeof(buf3));
+  memset(buf3, 0xdd, dbsize);
   ASSERT_EQ(dbsize, rados_read(ioctx, "foo", buf3, dbsize, 0));
   // Read the same as first write
   ASSERT_EQ(0, memcmp(buf3, buf, dbsize));
@@ -643,7 +641,7 @@ TEST_F(LibRadosIoECPP, OverlappingWriteRoundTripPP) {
   memset(buf, 0xcc, dbsize);
   bufferlist bl1;
   bl1.append(buf, dbsize);
-  ASSERT_EQ(dbsize, ioctx.write("foo", bl1, dbsize, 0));
+  ASSERT_EQ(0, ioctx.write("foo", bl1, dbsize, 0));
   memset(buf2, 0xdd, bsize);
   bufferlist bl2;
   bl2.append(buf2, bsize);
@@ -662,12 +660,12 @@ TEST_F(LibRadosIoEC, WriteFullRoundTrip) {
   char buf2[64];
   char buf3[128];
   memset(buf, 0xcc, sizeof(buf));
-  ASSERT_EQ((int)sizeof(buf), rados_write(ioctx, "foo", buf, sizeof(buf), 0));
+  ASSERT_EQ(0, rados_write(ioctx, "foo", buf, sizeof(buf), 0));
   memset(buf2, 0xdd, sizeof(buf2));
   ASSERT_EQ(0, rados_write_full(ioctx, "foo", buf2, sizeof(buf2)));
-  memset(buf3, 0xdd, sizeof(buf3));
+  memset(buf3, 0xee, sizeof(buf3));
   ASSERT_EQ((int)sizeof(buf2), rados_read(ioctx, "foo", buf3, sizeof(buf3), 0));
-  ASSERT_EQ(0, memcmp(buf2, buf2, sizeof(buf2)));
+  ASSERT_EQ(0, memcmp(buf3, buf2, sizeof(buf2)));
 }
 
 TEST_F(LibRadosIoECPP, WriteFullRoundTripPP) {
@@ -676,7 +674,7 @@ TEST_F(LibRadosIoECPP, WriteFullRoundTripPP) {
   memset(buf, 0xcc, sizeof(buf));
   bufferlist bl1;
   bl1.append(buf, sizeof(buf));
-  ASSERT_EQ((int)sizeof(buf), ioctx.write("foo", bl1, sizeof(buf), 0));
+  ASSERT_EQ(0, ioctx.write("foo", bl1, sizeof(buf), 0));
   memset(buf2, 0xdd, sizeof(buf2));
   bufferlist bl2;
   bl2.append(buf2, sizeof(buf2));
@@ -691,17 +689,23 @@ TEST_F(LibRadosIoEC, AppendRoundTrip) {
   char *buf2 = (char *)new char[alignment];
   char *buf3 = (char *)new char[alignment *2];
   memset(buf, 0xde, alignment);
-  ASSERT_EQ((int)alignment, rados_append(ioctx, "foo", buf, alignment));
+  ASSERT_EQ(0, rados_append(ioctx, "foo", buf, alignment));
   memset(buf2, 0xad, alignment);
-  ASSERT_EQ((int)alignment, rados_append(ioctx, "foo", buf2, alignment));
+  ASSERT_EQ(0, rados_append(ioctx, "foo", buf2, alignment));
   memset(buf3, 0, alignment*2);
   ASSERT_EQ((int)alignment*2, rados_read(ioctx, "foo", buf3, alignment*2, 0));
   ASSERT_EQ(0, memcmp(buf3, buf, alignment));
   ASSERT_EQ(0, memcmp(buf3 + alignment, buf2, alignment));
 
+  int uasize = alignment/2;
+  char *unalignedbuf = (char *)new char[uasize];
+  ASSERT_EQ(0, rados_append(ioctx, "foo", unalignedbuf, uasize));
+  ASSERT_EQ(-EOPNOTSUPP, rados_append(ioctx, "foo", unalignedbuf, uasize));
+
   delete[] buf;
   delete[] buf2;
   delete[] buf3;
+  delete[] unalignedbuf;
 }
 
 TEST_F(LibRadosIoECPP, AppendRoundTripPP) {
@@ -710,14 +714,14 @@ TEST_F(LibRadosIoECPP, AppendRoundTripPP) {
   memset(buf, 0xde, alignment);
   bufferlist bl1;
   bl1.append(buf, alignment);
-  ASSERT_EQ((int)alignment, ioctx.append("foo", bl1, alignment));
+  ASSERT_EQ(0, ioctx.append("foo", bl1, alignment));
   memset(buf2, 0xad, alignment);
   bufferlist bl2;
   bl2.append(buf2, alignment);
-  ASSERT_EQ((int)alignment, ioctx.append("foo", bl2, alignment));
+  ASSERT_EQ(0, ioctx.append("foo", bl2, alignment));
   bufferlist bl3;
   ASSERT_EQ((int)(alignment * 2),
-	    ioctx.read("foo", bl3, (alignment * 2), 0));
+	    ioctx.read("foo", bl3, (alignment * 4), 0));
   const char *bl3_str = bl3.c_str();
   ASSERT_EQ(0, memcmp(bl3_str, buf, alignment));
   ASSERT_EQ(0, memcmp(bl3_str + alignment, buf2, alignment));
@@ -730,7 +734,7 @@ TEST_F(LibRadosIoEC, TruncTest) {
   char buf[128];
   char buf2[sizeof(buf)];
   memset(buf, 0xaa, sizeof(buf));
-  ASSERT_EQ((int)sizeof(buf), rados_append(ioctx, "foo", buf, sizeof(buf)));
+  ASSERT_EQ(0, rados_append(ioctx, "foo", buf, sizeof(buf)));
   ASSERT_EQ(-EOPNOTSUPP, rados_trunc(ioctx, "foo", sizeof(buf) / 2));
   memset(buf2, 0, sizeof(buf2));
   // Same size
@@ -744,7 +748,7 @@ TEST_F(LibRadosIoECPP, TruncTestPP) {
   memset(buf, 0xaa, sizeof(buf));
   bufferlist bl;
   bl.append(buf, sizeof(buf));
-  ASSERT_EQ((int)sizeof(buf), ioctx.append("foo", bl, sizeof(buf)));
+  ASSERT_EQ(0, ioctx.append("foo", bl, sizeof(buf)));
   ASSERT_EQ(-EOPNOTSUPP, ioctx.trunc("foo", sizeof(buf) / 2));
   bufferlist bl2;
   // Same size
@@ -757,7 +761,7 @@ TEST_F(LibRadosIoEC, RemoveTest) {
   char buf[128];
   char buf2[sizeof(buf)];
   memset(buf, 0xaa, sizeof(buf));
-  ASSERT_EQ((int)sizeof(buf), rados_append(ioctx, "foo", buf, sizeof(buf)));
+  ASSERT_EQ(0, rados_append(ioctx, "foo", buf, sizeof(buf)));
   ASSERT_EQ(0, rados_remove(ioctx, "foo"));
   memset(buf2, 0, sizeof(buf2));
   ASSERT_EQ(-ENOENT, rados_read(ioctx, "foo", buf2, sizeof(buf2), 0));
@@ -768,7 +772,7 @@ TEST_F(LibRadosIoECPP, RemoveTestPP) {
   memset(buf, 0xaa, sizeof(buf));
   bufferlist bl1;
   bl1.append(buf, sizeof(buf));
-  ASSERT_EQ((int)sizeof(buf), ioctx.append("foo", bl1, sizeof(buf)));
+  ASSERT_EQ(0, ioctx.append("foo", bl1, sizeof(buf)));
   ASSERT_EQ(0, ioctx.remove("foo"));
   bufferlist bl2;
   ASSERT_EQ(-ENOENT, ioctx.read("foo", bl2, sizeof(buf), 0));
@@ -779,7 +783,7 @@ TEST_F(LibRadosIoEC, XattrsRoundTrip) {
   char attr1[] = "attr1";
   char attr1_buf[] = "foo bar baz";
   memset(buf, 0xaa, sizeof(buf));
-  ASSERT_EQ((int)sizeof(buf), rados_append(ioctx, "foo", buf, sizeof(buf)));
+  ASSERT_EQ(0, rados_append(ioctx, "foo", buf, sizeof(buf)));
   ASSERT_EQ(-ENODATA, rados_getxattr(ioctx, "foo", attr1, buf, sizeof(buf)));
   ASSERT_EQ(0, rados_setxattr(ioctx, "foo", attr1, attr1_buf, sizeof(attr1_buf)));
   ASSERT_EQ((int)sizeof(attr1_buf),
@@ -794,7 +798,7 @@ TEST_F(LibRadosIoECPP, XattrsRoundTripPP) {
   memset(buf, 0xaa, sizeof(buf));
   bufferlist bl1;
   bl1.append(buf, sizeof(buf));
-  ASSERT_EQ((int)sizeof(buf), ioctx.append("foo", bl1, sizeof(buf)));
+  ASSERT_EQ(0, ioctx.append("foo", bl1, sizeof(buf)));
   bufferlist bl2;
   ASSERT_EQ(-ENODATA, ioctx.getxattr("foo", attr1, bl2));
   bufferlist bl3;
@@ -811,7 +815,7 @@ TEST_F(LibRadosIoEC, RmXattr) {
   char attr1[] = "attr1";
   char attr1_buf[] = "foo bar baz";
   memset(buf, 0xaa, sizeof(buf));
-  ASSERT_EQ((int)sizeof(buf), rados_append(ioctx, "foo", buf, sizeof(buf)));
+  ASSERT_EQ(0, rados_append(ioctx, "foo", buf, sizeof(buf)));
   ASSERT_EQ(0,
       rados_setxattr(ioctx, "foo", attr1, attr1_buf, sizeof(attr1_buf)));
   ASSERT_EQ(0, rados_rmxattr(ioctx, "foo", attr1));
@@ -825,7 +829,7 @@ TEST_F(LibRadosIoECPP, RmXattrPP) {
   memset(buf, 0xaa, sizeof(buf));
   bufferlist bl1;
   bl1.append(buf, sizeof(buf));
-  ASSERT_EQ((int)sizeof(buf), ioctx.append("foo", bl1, sizeof(buf)));
+  ASSERT_EQ(0, ioctx.append("foo", bl1, sizeof(buf)));
   bufferlist bl2;
   bl2.append(attr1_buf, sizeof(attr1_buf));
   ASSERT_EQ(0, ioctx.setxattr("foo", attr1, bl2));
@@ -844,7 +848,7 @@ TEST_F(LibRadosIoEC, XattrIter) {
     attr2_buf[j] = j % 0xff;
   }
   memset(buf, 0xaa, sizeof(buf));
-  ASSERT_EQ((int)sizeof(buf), rados_append(ioctx, "foo", buf, sizeof(buf)));
+  ASSERT_EQ(0, rados_append(ioctx, "foo", buf, sizeof(buf)));
   ASSERT_EQ(0, rados_setxattr(ioctx, "foo", attr1, attr1_buf, sizeof(attr1_buf)));
   ASSERT_EQ(0, rados_setxattr(ioctx, "foo", attr2, attr2_buf, sizeof(attr2_buf)));
   rados_xattrs_iter_t iter;
@@ -886,7 +890,7 @@ TEST_F(LibRadosIoECPP, XattrListPP) {
   memset(buf, 0xaa, sizeof(buf));
   bufferlist bl1;
   bl1.append(buf, sizeof(buf));
-  ASSERT_EQ((int)sizeof(buf), ioctx.append("foo", bl1, sizeof(buf)));
+  ASSERT_EQ(0, ioctx.append("foo", bl1, sizeof(buf)));
   bufferlist bl2;
   bl2.append(attr1_buf, sizeof(attr1_buf));
   ASSERT_EQ(0, ioctx.setxattr("foo", attr1, bl2));
diff --git a/src/test/librados/list.cc b/src/test/librados/list.cc
index 2e400d5..e3fe007 100644
--- a/src/test/librados/list.cc
+++ b/src/test/librados/list.cc
@@ -20,7 +20,7 @@ typedef RadosTestECPP LibRadosListECPP;
 TEST_F(LibRadosList, ListObjects) {
   char buf[128];
   memset(buf, 0xcc, sizeof(buf));
-  ASSERT_EQ((int)sizeof(buf), rados_write(ioctx, "foo", buf, sizeof(buf), 0));
+  ASSERT_EQ(0, rados_write(ioctx, "foo", buf, sizeof(buf), 0));
   rados_list_ctx_t ctx;
   ASSERT_EQ(0, rados_objects_list_open(ioctx, &ctx));
   const char *entry;
@@ -38,7 +38,7 @@ TEST_F(LibRadosListPP, ListObjectsPP) {
   memset(buf, 0xcc, sizeof(buf));
   bufferlist bl1;
   bl1.append(buf, sizeof(buf));
-  ASSERT_EQ((int)sizeof(buf), ioctx.write("foo", bl1, sizeof(buf), 0));
+  ASSERT_EQ(0, ioctx.write("foo", bl1, sizeof(buf), 0));
   ObjectIterator iter(ioctx.objects_begin());
   bool foundit = false;
   while (iter != ioctx.objects_end()) {
@@ -54,7 +54,7 @@ TEST_F(LibRadosListPP, ListObjectsTwicePP) {
   memset(buf, 0xcc, sizeof(buf));
   bufferlist bl1;
   bl1.append(buf, sizeof(buf));
-  ASSERT_EQ((int)sizeof(buf), ioctx.write("foo", bl1, sizeof(buf), 0));
+  ASSERT_EQ(0, ioctx.write("foo", bl1, sizeof(buf), 0));
   ObjectIterator iter(ioctx.objects_begin());
   bool foundit = false;
   while (iter != ioctx.objects_end()) {
@@ -80,7 +80,7 @@ TEST_F(LibRadosListPP, ListObjectsCopyIterPP) {
   memset(buf, 0xcc, sizeof(buf));
   bufferlist bl1;
   bl1.append(buf, sizeof(buf));
-  ASSERT_EQ((int)sizeof(buf), ioctx.write("foo", bl1, sizeof(buf), 0));
+  ASSERT_EQ(0, ioctx.write("foo", bl1, sizeof(buf), 0));
 
   // make sure this is still valid after the original iterators are gone
   ObjectIterator iter3;
@@ -112,7 +112,7 @@ TEST_F(LibRadosListPP, ListObjectsEndIter) {
   memset(buf, 0xcc, sizeof(buf));
   bufferlist bl1;
   bl1.append(buf, sizeof(buf));
-  ASSERT_EQ((int)sizeof(buf), ioctx.write("foo", bl1, sizeof(buf), 0));
+  ASSERT_EQ(0, ioctx.write("foo", bl1, sizeof(buf), 0));
 
   ObjectIterator iter(ioctx.objects_begin());
   ObjectIterator iter_end(ioctx.objects_end());
@@ -154,18 +154,18 @@ TEST_F(LibRadosList, ListObjectsNS) {
   memset(buf, 0xcc, sizeof(buf));
   // Create :foo1, :foo2, :foo3, n1:foo1, ns1:foo4, ns1:foo5, ns2:foo6, n2:foo7
   rados_ioctx_set_namespace(ioctx, "");
-  ASSERT_EQ((int)sizeof(buf), rados_write(ioctx, "foo1", buf, sizeof(buf), 0));
+  ASSERT_EQ(0, rados_write(ioctx, "foo1", buf, sizeof(buf), 0));
   rados_ioctx_set_namespace(ioctx, "ns1");
-  ASSERT_EQ((int)sizeof(buf), rados_write(ioctx, "foo1", buf, sizeof(buf), 0));
+  ASSERT_EQ(0, rados_write(ioctx, "foo1", buf, sizeof(buf), 0));
   rados_ioctx_set_namespace(ioctx, "");
-  ASSERT_EQ((int)sizeof(buf), rados_write(ioctx, "foo2", buf, sizeof(buf), 0));
-  ASSERT_EQ((int)sizeof(buf), rados_write(ioctx, "foo3", buf, sizeof(buf), 0));
+  ASSERT_EQ(0, rados_write(ioctx, "foo2", buf, sizeof(buf), 0));
+  ASSERT_EQ(0, rados_write(ioctx, "foo3", buf, sizeof(buf), 0));
   rados_ioctx_set_namespace(ioctx, "ns1");
-  ASSERT_EQ((int)sizeof(buf), rados_write(ioctx, "foo4", buf, sizeof(buf), 0));
-  ASSERT_EQ((int)sizeof(buf), rados_write(ioctx, "foo5", buf, sizeof(buf), 0));
+  ASSERT_EQ(0, rados_write(ioctx, "foo4", buf, sizeof(buf), 0));
+  ASSERT_EQ(0, rados_write(ioctx, "foo5", buf, sizeof(buf), 0));
   rados_ioctx_set_namespace(ioctx, "ns2");
-  ASSERT_EQ((int)sizeof(buf), rados_write(ioctx, "foo6", buf, sizeof(buf), 0));
-  ASSERT_EQ((int)sizeof(buf), rados_write(ioctx, "foo7", buf, sizeof(buf), 0));
+  ASSERT_EQ(0, rados_write(ioctx, "foo6", buf, sizeof(buf), 0));
+  ASSERT_EQ(0, rados_write(ioctx, "foo7", buf, sizeof(buf), 0));
 
   std::set<std::string> def, ns1, ns2;
   def.insert(std::string("foo1"));
@@ -222,18 +222,18 @@ TEST_F(LibRadosListPP, ListObjectsPPNS) {
   bl1.append(buf, sizeof(buf));
   // Create :foo1, :foo2, :foo3, n1:foo1, ns1:foo4, ns1:foo5, ns2:foo6, n2:foo7
   ioctx.set_namespace("");
-  ASSERT_EQ((int)sizeof(buf), ioctx.write("foo1", bl1, sizeof(buf), 0));
+  ASSERT_EQ(0, ioctx.write("foo1", bl1, sizeof(buf), 0));
   ioctx.set_namespace("ns1");
-  ASSERT_EQ((int)sizeof(buf), ioctx.write("foo1", bl1, sizeof(buf), 0));
+  ASSERT_EQ(0, ioctx.write("foo1", bl1, sizeof(buf), 0));
   ioctx.set_namespace("");
-  ASSERT_EQ((int)sizeof(buf), ioctx.write("foo2", bl1, sizeof(buf), 0));
-  ASSERT_EQ((int)sizeof(buf), ioctx.write("foo3", bl1, sizeof(buf), 0));
+  ASSERT_EQ(0, ioctx.write("foo2", bl1, sizeof(buf), 0));
+  ASSERT_EQ(0, ioctx.write("foo3", bl1, sizeof(buf), 0));
   ioctx.set_namespace("ns1");
-  ASSERT_EQ((int)sizeof(buf), ioctx.write("foo4", bl1, sizeof(buf), 0));
-  ASSERT_EQ((int)sizeof(buf), ioctx.write("foo5", bl1, sizeof(buf), 0));
+  ASSERT_EQ(0, ioctx.write("foo4", bl1, sizeof(buf), 0));
+  ASSERT_EQ(0, ioctx.write("foo5", bl1, sizeof(buf), 0));
   ioctx.set_namespace("ns2");
-  ASSERT_EQ((int)sizeof(buf), ioctx.write("foo6", bl1, sizeof(buf), 0));
-  ASSERT_EQ((int)sizeof(buf), ioctx.write("foo7", bl1, sizeof(buf), 0));
+  ASSERT_EQ(0, ioctx.write("foo6", bl1, sizeof(buf), 0));
+  ASSERT_EQ(0, ioctx.write("foo7", bl1, sizeof(buf), 0));
 
   std::set<std::string> def, ns1, ns2;
   def.insert(std::string("foo1"));
@@ -262,7 +262,7 @@ TEST_F(LibRadosListPP, ListObjectsManyPP) {
   bl.append(buf, sizeof(buf));
 
   for (int i=0; i<256; ++i) {
-    ASSERT_EQ((int)sizeof(buf), ioctx.write(stringify(i), bl, bl.length(), 0));
+    ASSERT_EQ(0, ioctx.write(stringify(i), bl, bl.length(), 0));
   }
 
   librados::ObjectIterator it = ioctx.objects_begin();
@@ -287,7 +287,7 @@ TEST_F(LibRadosList, ListObjectsStart) {
 
   for (int i=0; i<16; ++i) {
     string n = stringify(i);
-    ASSERT_EQ((int)sizeof(buf), rados_write(ioctx, n.c_str(), buf, sizeof(buf), 0));
+    ASSERT_EQ(0, rados_write(ioctx, n.c_str(), buf, sizeof(buf), 0));
   }
 
   rados_list_ctx_t ctx;
@@ -321,7 +321,7 @@ TEST_F(LibRadosListPP, ListObjectsStartPP) {
   bl.append(buf, sizeof(buf));
 
   for (int i=0; i<16; ++i) {
-    ASSERT_EQ((int)sizeof(buf), ioctx.write(stringify(i), bl, bl.length(), 0));
+    ASSERT_EQ(0, ioctx.write(stringify(i), bl, bl.length(), 0));
   }
 
   librados::ObjectIterator it = ioctx.objects_begin();
@@ -345,7 +345,7 @@ TEST_F(LibRadosListPP, ListObjectsStartPP) {
 TEST_F(LibRadosListEC, ListObjects) {
   char buf[128];
   memset(buf, 0xcc, sizeof(buf));
-  ASSERT_EQ((int)sizeof(buf), rados_write(ioctx, "foo", buf, sizeof(buf), 0));
+  ASSERT_EQ(0, rados_write(ioctx, "foo", buf, sizeof(buf), 0));
   rados_list_ctx_t ctx;
   ASSERT_EQ(0, rados_objects_list_open(ioctx, &ctx));
   const char *entry;
@@ -363,7 +363,7 @@ TEST_F(LibRadosListECPP, ListObjectsPP) {
   memset(buf, 0xcc, sizeof(buf));
   bufferlist bl1;
   bl1.append(buf, sizeof(buf));
-  ASSERT_EQ((int)sizeof(buf), ioctx.write("foo", bl1, sizeof(buf), 0));
+  ASSERT_EQ(0, ioctx.write("foo", bl1, sizeof(buf), 0));
   ObjectIterator iter(ioctx.objects_begin());
   bool foundit = false;
   while (iter != ioctx.objects_end()) {
@@ -379,7 +379,7 @@ TEST_F(LibRadosListECPP, ListObjectsTwicePP) {
   memset(buf, 0xcc, sizeof(buf));
   bufferlist bl1;
   bl1.append(buf, sizeof(buf));
-  ASSERT_EQ((int)sizeof(buf), ioctx.write("foo", bl1, sizeof(buf), 0));
+  ASSERT_EQ(0, ioctx.write("foo", bl1, sizeof(buf), 0));
   ObjectIterator iter(ioctx.objects_begin());
   bool foundit = false;
   while (iter != ioctx.objects_end()) {
@@ -405,7 +405,7 @@ TEST_F(LibRadosListECPP, ListObjectsCopyIterPP) {
   memset(buf, 0xcc, sizeof(buf));
   bufferlist bl1;
   bl1.append(buf, sizeof(buf));
-  ASSERT_EQ((int)sizeof(buf), ioctx.write("foo", bl1, sizeof(buf), 0));
+  ASSERT_EQ(0, ioctx.write("foo", bl1, sizeof(buf), 0));
 
   // make sure this is still valid after the original iterators are gone
   ObjectIterator iter3;
@@ -437,7 +437,7 @@ TEST_F(LibRadosListECPP, ListObjectsEndIter) {
   memset(buf, 0xcc, sizeof(buf));
   bufferlist bl1;
   bl1.append(buf, sizeof(buf));
-  ASSERT_EQ((int)sizeof(buf), ioctx.write("foo", bl1, sizeof(buf), 0));
+  ASSERT_EQ(0, ioctx.write("foo", bl1, sizeof(buf), 0));
 
   ObjectIterator iter(ioctx.objects_begin());
   ObjectIterator iter_end(ioctx.objects_end());
@@ -462,18 +462,18 @@ TEST_F(LibRadosListEC, ListObjectsNS) {
   memset(buf, 0xcc, sizeof(buf));
   // Create :foo1, :foo2, :foo3, n1:foo1, ns1:foo4, ns1:foo5, ns2:foo6, n2:foo7
   rados_ioctx_set_namespace(ioctx, "");
-  ASSERT_EQ((int)sizeof(buf), rados_write(ioctx, "foo1", buf, sizeof(buf), 0));
+  ASSERT_EQ(0, rados_write(ioctx, "foo1", buf, sizeof(buf), 0));
   rados_ioctx_set_namespace(ioctx, "ns1");
-  ASSERT_EQ((int)sizeof(buf), rados_write(ioctx, "foo1", buf, sizeof(buf), 0));
+  ASSERT_EQ(0, rados_write(ioctx, "foo1", buf, sizeof(buf), 0));
   rados_ioctx_set_namespace(ioctx, "");
-  ASSERT_EQ((int)sizeof(buf), rados_write(ioctx, "foo2", buf, sizeof(buf), 0));
-  ASSERT_EQ((int)sizeof(buf), rados_write(ioctx, "foo3", buf, sizeof(buf), 0));
+  ASSERT_EQ(0, rados_write(ioctx, "foo2", buf, sizeof(buf), 0));
+  ASSERT_EQ(0, rados_write(ioctx, "foo3", buf, sizeof(buf), 0));
   rados_ioctx_set_namespace(ioctx, "ns1");
-  ASSERT_EQ((int)sizeof(buf), rados_write(ioctx, "foo4", buf, sizeof(buf), 0));
-  ASSERT_EQ((int)sizeof(buf), rados_write(ioctx, "foo5", buf, sizeof(buf), 0));
+  ASSERT_EQ(0, rados_write(ioctx, "foo4", buf, sizeof(buf), 0));
+  ASSERT_EQ(0, rados_write(ioctx, "foo5", buf, sizeof(buf), 0));
   rados_ioctx_set_namespace(ioctx, "ns2");
-  ASSERT_EQ((int)sizeof(buf), rados_write(ioctx, "foo6", buf, sizeof(buf), 0));
-  ASSERT_EQ((int)sizeof(buf), rados_write(ioctx, "foo7", buf, sizeof(buf), 0));
+  ASSERT_EQ(0, rados_write(ioctx, "foo6", buf, sizeof(buf), 0));
+  ASSERT_EQ(0, rados_write(ioctx, "foo7", buf, sizeof(buf), 0));
 
   std::set<std::string> def, ns1, ns2;
   def.insert(std::string("foo1"));
@@ -512,18 +512,18 @@ TEST_F(LibRadosListECPP, ListObjectsPPNS) {
   bl1.append(buf, sizeof(buf));
   // Create :foo1, :foo2, :foo3, n1:foo1, ns1:foo4, ns1:foo5, ns2:foo6, n2:foo7
   ioctx.set_namespace("");
-  ASSERT_EQ((int)sizeof(buf), ioctx.write("foo1", bl1, sizeof(buf), 0));
+  ASSERT_EQ(0, ioctx.write("foo1", bl1, sizeof(buf), 0));
   ioctx.set_namespace("ns1");
-  ASSERT_EQ((int)sizeof(buf), ioctx.write("foo1", bl1, sizeof(buf), 0));
+  ASSERT_EQ(0, ioctx.write("foo1", bl1, sizeof(buf), 0));
   ioctx.set_namespace("");
-  ASSERT_EQ((int)sizeof(buf), ioctx.write("foo2", bl1, sizeof(buf), 0));
-  ASSERT_EQ((int)sizeof(buf), ioctx.write("foo3", bl1, sizeof(buf), 0));
+  ASSERT_EQ(0, ioctx.write("foo2", bl1, sizeof(buf), 0));
+  ASSERT_EQ(0, ioctx.write("foo3", bl1, sizeof(buf), 0));
   ioctx.set_namespace("ns1");
-  ASSERT_EQ((int)sizeof(buf), ioctx.write("foo4", bl1, sizeof(buf), 0));
-  ASSERT_EQ((int)sizeof(buf), ioctx.write("foo5", bl1, sizeof(buf), 0));
+  ASSERT_EQ(0, ioctx.write("foo4", bl1, sizeof(buf), 0));
+  ASSERT_EQ(0, ioctx.write("foo5", bl1, sizeof(buf), 0));
   ioctx.set_namespace("ns2");
-  ASSERT_EQ((int)sizeof(buf), ioctx.write("foo6", bl1, sizeof(buf), 0));
-  ASSERT_EQ((int)sizeof(buf), ioctx.write("foo7", bl1, sizeof(buf), 0));
+  ASSERT_EQ(0, ioctx.write("foo6", bl1, sizeof(buf), 0));
+  ASSERT_EQ(0, ioctx.write("foo7", bl1, sizeof(buf), 0));
 
   std::set<std::string> def, ns1, ns2;
   def.insert(std::string("foo1"));
@@ -552,7 +552,7 @@ TEST_F(LibRadosListECPP, ListObjectsManyPP) {
   bl.append(buf, sizeof(buf));
 
   for (int i=0; i<256; ++i) {
-    ASSERT_EQ((int)sizeof(buf), ioctx.write(stringify(i), bl, bl.length(), 0));
+    ASSERT_EQ(0, ioctx.write(stringify(i), bl, bl.length(), 0));
   }
 
   librados::ObjectIterator it = ioctx.objects_begin();
@@ -577,7 +577,7 @@ TEST_F(LibRadosListEC, ListObjectsStart) {
 
   for (int i=0; i<16; ++i) {
     string n = stringify(i);
-    ASSERT_EQ((int)sizeof(buf), rados_write(ioctx, n.c_str(), buf, sizeof(buf), 0));
+    ASSERT_EQ(0, rados_write(ioctx, n.c_str(), buf, sizeof(buf), 0));
   }
 
   rados_list_ctx_t ctx;
@@ -611,7 +611,7 @@ TEST_F(LibRadosListECPP, ListObjectsStartPP) {
   bl.append(buf, sizeof(buf));
 
   for (int i=0; i<16; ++i) {
-    ASSERT_EQ((int)sizeof(buf), ioctx.write(stringify(i), bl, bl.length(), 0));
+    ASSERT_EQ(0, ioctx.write(stringify(i), bl, bl.length(), 0));
   }
 
   librados::ObjectIterator it = ioctx.objects_begin();
diff --git a/src/test/librados/misc.cc b/src/test/librados/misc.cc
index 233cc6b..ea990b5 100644
--- a/src/test/librados/misc.cc
+++ b/src/test/librados/misc.cc
@@ -289,7 +289,7 @@ TEST_F(LibRadosMiscPP, Tmap2OmapPP) {
 TEST_F(LibRadosMisc, Exec) {
   char buf[128];
   memset(buf, 0xcc, sizeof(buf));
-  ASSERT_EQ((int)sizeof(buf), rados_write(ioctx, "foo", buf, sizeof(buf), 0));
+  ASSERT_EQ(0, rados_write(ioctx, "foo", buf, sizeof(buf), 0));
   char buf2[512];
   int res = rados_exec(ioctx, "foo", "rbd", "get_all_features",
 			  NULL, 0, buf2, sizeof(buf2));
@@ -378,7 +378,7 @@ TEST_F(LibRadosMiscPP, Operate2PP) {
 TEST_F(LibRadosMiscPP, BigObjectPP) {
   bufferlist bl;
   bl.append("abcdefg");
-  ASSERT_EQ((int)bl.length(), ioctx.write("foo", bl, bl.length(), 0));
+  ASSERT_EQ(0, ioctx.write("foo", bl, bl.length(), 0));
 
   {
     ObjectWriteOperation o;
@@ -451,7 +451,7 @@ TEST_F(LibRadosMiscPP, CloneRangePP) {
   memset(buf, 0xcc, sizeof(buf));
   bufferlist bl;
   bl.append(buf, sizeof(buf));
-  ASSERT_EQ(sizeof(buf), (size_t)ioctx.write("foo", bl, sizeof(buf), 0));
+  ASSERT_EQ(0, ioctx.write("foo", bl, sizeof(buf), 0));
   ioctx.locator_set_key("foo");
   ASSERT_EQ(0, ioctx.clone_range("bar", 0, "foo", 0, sizeof(buf)));
   bufferlist bl2;
@@ -462,7 +462,7 @@ TEST_F(LibRadosMiscPP, CloneRangePP) {
 TEST_F(LibRadosMisc, CloneRange) {
   char buf[128];
   memset(buf, 0xcc, sizeof(buf));
-  ASSERT_EQ((int)sizeof(buf), rados_write(ioctx, "src", buf, sizeof(buf), 0));
+  ASSERT_EQ(0, rados_write(ioctx, "src", buf, sizeof(buf), 0));
   rados_ioctx_locator_set_key(ioctx, "src");
   ASSERT_EQ(0, rados_clone_range(ioctx, "dst", 0, "src", 0, sizeof(buf)));
   char buf2[sizeof(buf)];
@@ -496,6 +496,7 @@ TEST_F(LibRadosMiscPP, BigAttrPP) {
 
   bufferlist got;
 
+  cout << "osd_max_attr_size = " << g_conf->osd_max_attr_size << std::endl;
   if (g_conf->osd_max_attr_size) {
     bl.clear();
     got.clear();
diff --git a/src/test/librados/snapshots.cc b/src/test/librados/snapshots.cc
index 436b918..020af11 100644
--- a/src/test/librados/snapshots.cc
+++ b/src/test/librados/snapshots.cc
@@ -24,7 +24,7 @@ const int bufsize = 128;
 TEST_F(LibRadosSnapshots, SnapList) {
   char buf[bufsize];
   memset(buf, 0xcc, sizeof(buf));
-  ASSERT_EQ((int)sizeof(buf), rados_write(ioctx, "foo", buf, sizeof(buf), 0));
+  ASSERT_EQ(0, rados_write(ioctx, "foo", buf, sizeof(buf), 0));
   ASSERT_EQ(0, rados_ioctx_snap_create(ioctx, "snap1"));
   rados_snap_t snaps[10];
   EXPECT_EQ(1, rados_ioctx_snap_list(ioctx, snaps,
@@ -40,7 +40,7 @@ TEST_F(LibRadosSnapshotsPP, SnapListPP) {
   memset(buf, 0xcc, sizeof(buf));
   bufferlist bl1;
   bl1.append(buf, sizeof(buf));
-  ASSERT_EQ((int)sizeof(buf), ioctx.write("foo", bl1, sizeof(buf), 0));
+  ASSERT_EQ(0, ioctx.write("foo", bl1, sizeof(buf), 0));
   ASSERT_EQ(0, ioctx.snap_create("snap1"));
   std::vector<snap_t> snaps;
   EXPECT_EQ(0, ioctx.snap_list(&snaps));
@@ -54,7 +54,7 @@ TEST_F(LibRadosSnapshotsPP, SnapListPP) {
 TEST_F(LibRadosSnapshots, SnapRemove) {
   char buf[bufsize];
   memset(buf, 0xcc, sizeof(buf));
-  ASSERT_EQ((int)sizeof(buf), rados_write(ioctx, "foo", buf, sizeof(buf), 0));
+  ASSERT_EQ(0, rados_write(ioctx, "foo", buf, sizeof(buf), 0));
   ASSERT_EQ(0, rados_ioctx_snap_create(ioctx, "snap1"));
   rados_snap_t rid;
   ASSERT_EQ(0, rados_ioctx_snap_lookup(ioctx, "snap1", &rid));
@@ -68,7 +68,7 @@ TEST_F(LibRadosSnapshotsPP, SnapRemovePP) {
   memset(buf, 0xcc, sizeof(buf));
   bufferlist bl1;
   bl1.append(buf, sizeof(buf));
-  ASSERT_EQ((int)sizeof(buf), ioctx.write("foo", bl1, sizeof(buf), 0));
+  ASSERT_EQ(0, ioctx.write("foo", bl1, sizeof(buf), 0));
   ASSERT_EQ(0, ioctx.snap_create("snap1"));
   rados_snap_t rid;
   ASSERT_EQ(0, ioctx.snap_lookup("snap1", &rid));
@@ -79,12 +79,12 @@ TEST_F(LibRadosSnapshotsPP, SnapRemovePP) {
 TEST_F(LibRadosSnapshots, Rollback) {
   char buf[bufsize];
   memset(buf, 0xcc, sizeof(buf));
-  ASSERT_EQ((int)sizeof(buf), rados_write(ioctx, "foo", buf, sizeof(buf), 0));
+  ASSERT_EQ(0, rados_write(ioctx, "foo", buf, sizeof(buf), 0));
   ASSERT_EQ(0, rados_ioctx_snap_create(ioctx, "snap1"));
   char buf2[sizeof(buf)];
   memset(buf2, 0xdd, sizeof(buf2));
   EXPECT_EQ(0, rados_write_full(ioctx, "foo", buf2, sizeof(buf2)));
-  EXPECT_EQ(0, rados_rollback(ioctx, "foo", "snap1"));
+  EXPECT_EQ(0, rados_ioctx_snap_rollback(ioctx, "foo", "snap1"));
   char buf3[sizeof(buf)];
   EXPECT_EQ((int)sizeof(buf3), rados_read(ioctx, "foo", buf3, sizeof(buf3), 0));
   EXPECT_EQ(0, memcmp(buf, buf3, sizeof(buf)));
@@ -96,14 +96,14 @@ TEST_F(LibRadosSnapshotsPP, RollbackPP) {
   memset(buf, 0xcc, sizeof(buf));
   bufferlist bl1;
   bl1.append(buf, sizeof(buf));
-  ASSERT_EQ((int)sizeof(buf), ioctx.write("foo", bl1, sizeof(buf), 0));
+  ASSERT_EQ(0, ioctx.write("foo", bl1, sizeof(buf), 0));
   ASSERT_EQ(0, ioctx.snap_create("snap1"));
   char buf2[sizeof(buf)];
   memset(buf2, 0xdd, sizeof(buf2));
   bufferlist bl2;
   bl2.append(buf2, sizeof(buf2));
   EXPECT_EQ(0, ioctx.write_full("foo", bl2));
-  EXPECT_EQ(0, ioctx.rollback("foo", "snap1"));
+  EXPECT_EQ(0, ioctx.snap_rollback("foo", "snap1"));
   bufferlist bl3;
   EXPECT_EQ((int)sizeof(buf), ioctx.read("foo", bl3, sizeof(buf), 0));
   EXPECT_EQ(0, memcmp(buf, bl3.c_str(), sizeof(buf)));
@@ -113,7 +113,7 @@ TEST_F(LibRadosSnapshotsPP, RollbackPP) {
 TEST_F(LibRadosSnapshots, SnapGetName) {
   char buf[bufsize];
   memset(buf, 0xcc, sizeof(buf));
-  ASSERT_EQ((int)sizeof(buf), rados_write(ioctx, "foo", buf, sizeof(buf), 0));
+  ASSERT_EQ(0, rados_write(ioctx, "foo", buf, sizeof(buf), 0));
   ASSERT_EQ(0, rados_ioctx_snap_create(ioctx, "snapfoo"));
   rados_snap_t rid;
   EXPECT_EQ(0, rados_ioctx_snap_lookup(ioctx, "snapfoo", &rid));
@@ -132,7 +132,7 @@ TEST_F(LibRadosSnapshotsPP, SnapGetNamePP) {
   memset(buf, 0xcc, sizeof(buf));
   bufferlist bl;
   bl.append(buf, sizeof(buf));
-  ASSERT_EQ((int)sizeof(buf), ioctx.write("foo", bl, sizeof(buf), 0));
+  ASSERT_EQ(0, ioctx.write("foo", bl, sizeof(buf), 0));
   ASSERT_EQ(0, ioctx.snap_create("snapfoo"));
   rados_snap_t rid;
   EXPECT_EQ(0, ioctx.snap_lookup("snapfoo", &rid));
@@ -155,7 +155,7 @@ TEST_F(LibRadosSnapshotsSelfManaged, Snap) {
   ::std::reverse(my_snaps.begin(), my_snaps.end());
   char buf[bufsize];
   memset(buf, 0xcc, sizeof(buf));
-  ASSERT_EQ((int)sizeof(buf), rados_write(ioctx, "foo", buf, sizeof(buf), 0));
+  ASSERT_EQ(0, rados_write(ioctx, "foo", buf, sizeof(buf), 0));
 
   my_snaps.push_back(-2);
   ASSERT_EQ(0, rados_ioctx_selfmanaged_snap_create(ioctx, &my_snaps.back()));
@@ -165,7 +165,7 @@ TEST_F(LibRadosSnapshotsSelfManaged, Snap) {
   ::std::reverse(my_snaps.begin(), my_snaps.end());
   char buf2[sizeof(buf)];
   memset(buf2, 0xdd, sizeof(buf2));
-  ASSERT_EQ((int)sizeof(buf2), rados_write(ioctx, "foo", buf2, sizeof(buf2), 0));
+  ASSERT_EQ(0, rados_write(ioctx, "foo", buf2, sizeof(buf2), 0));
   rados_ioctx_snap_set_read(ioctx, my_snaps[1]-1);
   char buf3[sizeof(buf)];
   ASSERT_EQ(-ENOENT, rados_read(ioctx, "foo", buf3, sizeof(buf3), 0));
@@ -192,7 +192,7 @@ TEST_F(LibRadosSnapshotsSelfManaged, Rollback) {
   ::std::reverse(my_snaps.begin(), my_snaps.end());
   char buf[bufsize];
   memset(buf, 0xcc, sizeof(buf));
-  ASSERT_EQ((int)sizeof(buf), rados_write(ioctx, "foo", buf, sizeof(buf), 0));
+  ASSERT_EQ(0, rados_write(ioctx, "foo", buf, sizeof(buf), 0));
 
   my_snaps.push_back(-2);
   ASSERT_EQ(0, rados_ioctx_selfmanaged_snap_create(ioctx, &my_snaps.back()));
@@ -202,7 +202,7 @@ TEST_F(LibRadosSnapshotsSelfManaged, Rollback) {
   ::std::reverse(my_snaps.begin(), my_snaps.end());
   char buf2[sizeof(buf)];
   memset(buf2, 0xdd, sizeof(buf2));
-  ASSERT_EQ((int)sizeof(buf2), rados_write(ioctx, "foo", buf2, sizeof(buf2), 0));
+  ASSERT_EQ(0, rados_write(ioctx, "foo", buf2, sizeof(buf2), 0));
   rados_ioctx_selfmanaged_snap_rollback(ioctx, "foo", my_snaps[1]);
   char buf3[sizeof(buf)];
   ASSERT_EQ((int)sizeof(buf3), rados_read(ioctx, "foo", buf3, sizeof(buf3), 0));
@@ -226,7 +226,7 @@ TEST_F(LibRadosSnapshotsSelfManagedPP, SnapPP) {
   memset(buf, 0xcc, sizeof(buf));
   bufferlist bl1;
   bl1.append(buf, sizeof(buf));
-  ASSERT_EQ((int)sizeof(buf), ioctx.write("foo", bl1, sizeof(buf), 0));
+  ASSERT_EQ(0, ioctx.write("foo", bl1, sizeof(buf), 0));
 
   my_snaps.push_back(-2);
   ASSERT_EQ(0, ioctx.selfmanaged_snap_create(&my_snaps.back()));
@@ -237,7 +237,7 @@ TEST_F(LibRadosSnapshotsSelfManagedPP, SnapPP) {
   memset(buf2, 0xdd, sizeof(buf2));
   bufferlist bl2;
   bl2.append(buf2, sizeof(buf2));
-  ASSERT_EQ((int)sizeof(buf2), ioctx.write("foo", bl2, sizeof(buf2), 0));
+  ASSERT_EQ(0, ioctx.write("foo", bl2, sizeof(buf2), 0));
 
   ioctx.snap_set_read(my_snaps[1]);
   bufferlist bl3;
@@ -269,9 +269,9 @@ TEST_F(LibRadosSnapshotsSelfManagedPP, RollbackPP) {
   bufferlist bl1;
   bl1.append(buf, sizeof(buf));
   //Write 3 consecutive buffers
-  ASSERT_EQ((int)sizeof(buf), ioctx.write("foo", bl1, sizeof(buf), 0));
-  ASSERT_EQ((int)sizeof(buf), ioctx.write("foo", bl1, sizeof(buf), bufsize));
-  ASSERT_EQ((int)sizeof(buf), ioctx.write("foo", bl1, sizeof(buf), bufsize*2));
+  ASSERT_EQ(0, ioctx.write("foo", bl1, sizeof(buf), 0));
+  ASSERT_EQ(0, ioctx.write("foo", bl1, sizeof(buf), bufsize));
+  ASSERT_EQ(0, ioctx.write("foo", bl1, sizeof(buf), bufsize*2));
 
   snap_set_t ss;
 
@@ -293,9 +293,9 @@ TEST_F(LibRadosSnapshotsSelfManagedPP, RollbackPP) {
   bufferlist bl2;
   bl2.append(buf2, sizeof(buf2));
   //Change the middle buffer
-  ASSERT_EQ((int)sizeof(buf2), ioctx.write("foo", bl2, sizeof(buf2), bufsize));
+  ASSERT_EQ(0, ioctx.write("foo", bl2, sizeof(buf2), bufsize));
   //Add another after
-  ASSERT_EQ((int)sizeof(buf2), ioctx.write("foo", bl2, sizeof(buf2), bufsize*3));
+  ASSERT_EQ(0, ioctx.write("foo", bl2, sizeof(buf2), bufsize*3));
 
   ASSERT_EQ(-EINVAL, ioctx.list_snaps("foo", &ss));
   ObjectReadOperation o;
@@ -352,11 +352,11 @@ TEST_F(LibRadosSnapshotsSelfManagedPP, SnapOverlapPP) {
   memset(buf, 0xcc, sizeof(buf));
   bufferlist bl1;
   bl1.append(buf, sizeof(buf));
-  ASSERT_EQ((int)sizeof(buf), ioctx.write("foo", bl1, sizeof(buf), 0));
-  ASSERT_EQ((int)sizeof(buf), ioctx.write("foo", bl1, sizeof(buf), bufsize*2));
-  ASSERT_EQ((int)sizeof(buf), ioctx.write("foo", bl1, sizeof(buf), bufsize*4));
-  ASSERT_EQ((int)sizeof(buf), ioctx.write("foo", bl1, sizeof(buf), bufsize*6));
-  ASSERT_EQ((int)sizeof(buf), ioctx.write("foo", bl1, sizeof(buf), bufsize*8));
+  ASSERT_EQ(0, ioctx.write("foo", bl1, sizeof(buf), 0));
+  ASSERT_EQ(0, ioctx.write("foo", bl1, sizeof(buf), bufsize*2));
+  ASSERT_EQ(0, ioctx.write("foo", bl1, sizeof(buf), bufsize*4));
+  ASSERT_EQ(0, ioctx.write("foo", bl1, sizeof(buf), bufsize*6));
+  ASSERT_EQ(0, ioctx.write("foo", bl1, sizeof(buf), bufsize*8));
 
   snap_set_t ss;
   snap_t head = SNAP_HEAD;
@@ -376,11 +376,11 @@ TEST_F(LibRadosSnapshotsSelfManagedPP, SnapOverlapPP) {
   memset(buf2, 0xdd, sizeof(buf2));
   bufferlist bl2;
   bl2.append(buf2, sizeof(buf2));
-  ASSERT_EQ((int)sizeof(buf2), ioctx.write("foo", bl2, sizeof(buf2), bufsize*1));
-  ASSERT_EQ((int)sizeof(buf2), ioctx.write("foo", bl2, sizeof(buf2), bufsize*3));
-  ASSERT_EQ((int)sizeof(buf2), ioctx.write("foo", bl2, sizeof(buf2), bufsize*5));
-  ASSERT_EQ((int)sizeof(buf2), ioctx.write("foo", bl2, sizeof(buf2), bufsize*7));
-  ASSERT_EQ((int)sizeof(buf2), ioctx.write("foo", bl2, sizeof(buf2), bufsize*9));
+  ASSERT_EQ(0, ioctx.write("foo", bl2, sizeof(buf2), bufsize*1));
+  ASSERT_EQ(0, ioctx.write("foo", bl2, sizeof(buf2), bufsize*3));
+  ASSERT_EQ(0, ioctx.write("foo", bl2, sizeof(buf2), bufsize*5));
+  ASSERT_EQ(0, ioctx.write("foo", bl2, sizeof(buf2), bufsize*7));
+  ASSERT_EQ(0, ioctx.write("foo", bl2, sizeof(buf2), bufsize*9));
 
   ASSERT_EQ(0, readioctx.list_snaps("foo", &ss));
   ASSERT_EQ(2u, ss.clones.size());
@@ -414,10 +414,10 @@ TEST_F(LibRadosSnapshotsSelfManagedPP, SnapOverlapPP) {
   memset(buf3, 0xee, sizeof(buf3));
   bufferlist bl4;
   bl4.append(buf3, sizeof(buf3));
-  ASSERT_EQ((int)sizeof(buf3), ioctx.write("foo", bl2, sizeof(buf3), bufsize*1));
-  ASSERT_EQ((int)sizeof(buf3), ioctx.write("foo", bl2, sizeof(buf3), bufsize*4));
-  ASSERT_EQ((int)sizeof(buf3), ioctx.write("foo", bl2, sizeof(buf3), bufsize*5));
-  ASSERT_EQ((int)sizeof(buf3), ioctx.write("foo", bl2, sizeof(buf3), bufsize*8));
+  ASSERT_EQ(0, ioctx.write("foo", bl2, sizeof(buf3), bufsize*1));
+  ASSERT_EQ(0, ioctx.write("foo", bl2, sizeof(buf3), bufsize*4));
+  ASSERT_EQ(0, ioctx.write("foo", bl2, sizeof(buf3), bufsize*5));
+  ASSERT_EQ(0, ioctx.write("foo", bl2, sizeof(buf3), bufsize*8));
 
   ASSERT_EQ(0, readioctx.list_snaps("foo", &ss));
   ASSERT_EQ(3u, ss.clones.size());
@@ -469,7 +469,7 @@ TEST_F(LibRadosSnapshotsSelfManagedPP, SnapOverlapPP) {
 TEST_F(LibRadosSnapshotsEC, SnapList) {
   char buf[bufsize];
   memset(buf, 0xcc, sizeof(buf));
-  ASSERT_EQ((int)sizeof(buf), rados_write(ioctx, "foo", buf, sizeof(buf), 0));
+  ASSERT_EQ(0, rados_write(ioctx, "foo", buf, sizeof(buf), 0));
   ASSERT_EQ(0, rados_ioctx_snap_create(ioctx, "snap1"));
   rados_snap_t snaps[10];
   EXPECT_EQ(1, rados_ioctx_snap_list(ioctx, snaps,
@@ -485,7 +485,7 @@ TEST_F(LibRadosSnapshotsECPP, SnapListPP) {
   memset(buf, 0xcc, sizeof(buf));
   bufferlist bl1;
   bl1.append(buf, sizeof(buf));
-  ASSERT_EQ((int)sizeof(buf), ioctx.write("foo", bl1, sizeof(buf), 0));
+  ASSERT_EQ(0, ioctx.write("foo", bl1, sizeof(buf), 0));
   ASSERT_EQ(0, ioctx.snap_create("snap1"));
   std::vector<snap_t> snaps;
   EXPECT_EQ(0, ioctx.snap_list(&snaps));
@@ -499,7 +499,7 @@ TEST_F(LibRadosSnapshotsECPP, SnapListPP) {
 TEST_F(LibRadosSnapshotsEC, SnapRemove) {
   char buf[bufsize];
   memset(buf, 0xcc, sizeof(buf));
-  ASSERT_EQ((int)sizeof(buf), rados_write(ioctx, "foo", buf, sizeof(buf), 0));
+  ASSERT_EQ(0, rados_write(ioctx, "foo", buf, sizeof(buf), 0));
   ASSERT_EQ(0, rados_ioctx_snap_create(ioctx, "snap1"));
   rados_snap_t rid;
   ASSERT_EQ(0, rados_ioctx_snap_lookup(ioctx, "snap1", &rid));
@@ -513,7 +513,7 @@ TEST_F(LibRadosSnapshotsECPP, SnapRemovePP) {
   memset(buf, 0xcc, sizeof(buf));
   bufferlist bl1;
   bl1.append(buf, sizeof(buf));
-  ASSERT_EQ((int)sizeof(buf), ioctx.write("foo", bl1, sizeof(buf), 0));
+  ASSERT_EQ(0, ioctx.write("foo", bl1, sizeof(buf), 0));
   ASSERT_EQ(0, ioctx.snap_create("snap1"));
   rados_snap_t rid;
   ASSERT_EQ(0, ioctx.snap_lookup("snap1", &rid));
@@ -524,12 +524,12 @@ TEST_F(LibRadosSnapshotsECPP, SnapRemovePP) {
 TEST_F(LibRadosSnapshotsEC, Rollback) {
   char buf[bufsize];
   memset(buf, 0xcc, sizeof(buf));
-  ASSERT_EQ((int)sizeof(buf), rados_write(ioctx, "foo", buf, sizeof(buf), 0));
+  ASSERT_EQ(0, rados_write(ioctx, "foo", buf, sizeof(buf), 0));
   ASSERT_EQ(0, rados_ioctx_snap_create(ioctx, "snap1"));
   char buf2[sizeof(buf)];
   memset(buf2, 0xdd, sizeof(buf2));
   EXPECT_EQ(0, rados_write_full(ioctx, "foo", buf2, sizeof(buf2)));
-  EXPECT_EQ(0, rados_rollback(ioctx, "foo", "snap1"));
+  EXPECT_EQ(0, rados_ioctx_snap_rollback(ioctx, "foo", "snap1"));
   char buf3[sizeof(buf)];
   EXPECT_EQ((int)sizeof(buf3), rados_read(ioctx, "foo", buf3, sizeof(buf3), 0));
   EXPECT_EQ(0, memcmp(buf, buf3, sizeof(buf)));
@@ -541,14 +541,14 @@ TEST_F(LibRadosSnapshotsECPP, RollbackPP) {
   memset(buf, 0xcc, sizeof(buf));
   bufferlist bl1;
   bl1.append(buf, sizeof(buf));
-  ASSERT_EQ((int)sizeof(buf), ioctx.write("foo", bl1, sizeof(buf), 0));
+  ASSERT_EQ(0, ioctx.write("foo", bl1, sizeof(buf), 0));
   ASSERT_EQ(0, ioctx.snap_create("snap1"));
   char buf2[sizeof(buf)];
   memset(buf2, 0xdd, sizeof(buf2));
   bufferlist bl2;
   bl2.append(buf2, sizeof(buf2));
   EXPECT_EQ(0, ioctx.write_full("foo", bl2));
-  EXPECT_EQ(0, ioctx.rollback("foo", "snap1"));
+  EXPECT_EQ(0, ioctx.snap_rollback("foo", "snap1"));
   bufferlist bl3;
   EXPECT_EQ((int)sizeof(buf), ioctx.read("foo", bl3, sizeof(buf), 0));
   EXPECT_EQ(0, memcmp(buf, bl3.c_str(), sizeof(buf)));
@@ -558,7 +558,7 @@ TEST_F(LibRadosSnapshotsECPP, RollbackPP) {
 TEST_F(LibRadosSnapshotsEC, SnapGetName) {
   char buf[bufsize];
   memset(buf, 0xcc, sizeof(buf));
-  ASSERT_EQ((int)sizeof(buf), rados_write(ioctx, "foo", buf, sizeof(buf), 0));
+  ASSERT_EQ(0, rados_write(ioctx, "foo", buf, sizeof(buf), 0));
   ASSERT_EQ(0, rados_ioctx_snap_create(ioctx, "snapfoo"));
   rados_snap_t rid;
   EXPECT_EQ(0, rados_ioctx_snap_lookup(ioctx, "snapfoo", &rid));
@@ -577,7 +577,7 @@ TEST_F(LibRadosSnapshotsECPP, SnapGetNamePP) {
   memset(buf, 0xcc, sizeof(buf));
   bufferlist bl;
   bl.append(buf, sizeof(buf));
-  ASSERT_EQ((int)sizeof(buf), ioctx.write("foo", bl, sizeof(buf), 0));
+  ASSERT_EQ(0, ioctx.write("foo", bl, sizeof(buf), 0));
   ASSERT_EQ(0, ioctx.snap_create("snapfoo"));
   rados_snap_t rid;
   EXPECT_EQ(0, ioctx.snap_lookup("snapfoo", &rid));
@@ -601,7 +601,7 @@ TEST_F(LibRadosSnapshotsSelfManagedEC, Snap) {
   int bsize = alignment;
   char *buf = (char *)new char[bsize];
   memset(buf, 0xcc, bsize);
-  ASSERT_EQ(bsize, rados_write(ioctx, "foo", buf, bsize, 0));
+  ASSERT_EQ(0, rados_write(ioctx, "foo", buf, bsize, 0));
 
   my_snaps.push_back(-2);
   ASSERT_EQ(0, rados_ioctx_selfmanaged_snap_create(ioctx, &my_snaps.back()));
@@ -610,8 +610,8 @@ TEST_F(LibRadosSnapshotsSelfManagedEC, Snap) {
 					&my_snaps[0], my_snaps.size()));
   ::std::reverse(my_snaps.begin(), my_snaps.end());
   char *buf2 = (char *)new char[bsize];
-  memset(buf2, 0xdd, sizeof(buf2));
-  ASSERT_EQ(bsize, rados_write(ioctx, "foo", buf2, bsize, bsize));
+  memset(buf2, 0xdd, bsize);
+  ASSERT_EQ(0, rados_write(ioctx, "foo", buf2, bsize, bsize));
   rados_ioctx_snap_set_read(ioctx, my_snaps[1]-1);
   char *buf3 = (char *)new char[bsize*2];
   ASSERT_EQ(-ENOENT, rados_read(ioctx, "foo", buf3, bsize*2, 0));
@@ -641,8 +641,8 @@ TEST_F(LibRadosSnapshotsSelfManagedEC, Rollback) {
   ::std::reverse(my_snaps.begin(), my_snaps.end());
   int bsize = alignment;
   char *buf = (char *)new char[bsize];
-  memset(buf, 0xcc, sizeof(buf));
-  ASSERT_EQ(bsize, rados_write(ioctx, "foo", buf, bsize, 0));
+  memset(buf, 0xcc, bsize);
+  ASSERT_EQ(0, rados_write(ioctx, "foo", buf, bsize, 0));
 
   my_snaps.push_back(-2);
   ASSERT_EQ(0, rados_ioctx_selfmanaged_snap_create(ioctx, &my_snaps.back()));
@@ -651,9 +651,9 @@ TEST_F(LibRadosSnapshotsSelfManagedEC, Rollback) {
 					&my_snaps[0], my_snaps.size()));
   ::std::reverse(my_snaps.begin(), my_snaps.end());
   char *buf2 = (char *)new char[bsize];
-  memset(buf2, 0xdd, sizeof(buf2));
+  memset(buf2, 0xdd, bsize);
 
-  ASSERT_EQ(bsize, rados_write(ioctx, "foo", buf2, bsize, bsize));
+  ASSERT_EQ(0, rados_write(ioctx, "foo", buf2, bsize, bsize));
   rados_ioctx_selfmanaged_snap_rollback(ioctx, "foo", my_snaps[1]);
   char *buf3 = (char *)new char[bsize*2];
   ASSERT_EQ(bsize, rados_read(ioctx, "foo", buf3, bsize*2, 0));
@@ -678,10 +678,10 @@ TEST_F(LibRadosSnapshotsSelfManagedECPP, SnapPP) {
   ::std::reverse(my_snaps.begin(), my_snaps.end());
   int bsize = alignment;
   char *buf = (char *)new char[bsize];
-  memset(buf, 0xcc, sizeof(buf));
+  memset(buf, 0xcc, bsize);
   bufferlist bl1;
   bl1.append(buf, bsize);
-  ASSERT_EQ(bsize, ioctx.write("foo", bl1, bsize, 0));
+  ASSERT_EQ(0, ioctx.write("foo", bl1, bsize, 0));
 
   my_snaps.push_back(-2);
   ASSERT_EQ(0, ioctx.selfmanaged_snap_create(&my_snaps.back()));
@@ -693,7 +693,7 @@ TEST_F(LibRadosSnapshotsSelfManagedECPP, SnapPP) {
   bufferlist bl2;
   bl2.append(buf2, bsize);
   // Add another aligned buffer
-  ASSERT_EQ(bsize, ioctx.write("foo", bl2, bsize, bsize));
+  ASSERT_EQ(0, ioctx.write("foo", bl2, bsize, bsize));
 
   ioctx.snap_set_read(my_snaps[1]);
   bufferlist bl3;
@@ -728,9 +728,9 @@ TEST_F(LibRadosSnapshotsSelfManagedECPP, RollbackPP) {
   bufferlist bl1;
   bl1.append(buf, bsize);
   //Write 3 consecutive buffers
-  ASSERT_EQ(bsize, ioctx.write("foo", bl1, bsize, 0));
-  ASSERT_EQ(bsize, ioctx.write("foo", bl1, bsize, bsize));
-  ASSERT_EQ(bsize, ioctx.write("foo", bl1, bsize, bsize*2));
+  ASSERT_EQ(0, ioctx.write("foo", bl1, bsize, 0));
+  ASSERT_EQ(0, ioctx.write("foo", bl1, bsize, bsize));
+  ASSERT_EQ(0, ioctx.write("foo", bl1, bsize, bsize*2));
 
   snap_set_t ss;
 
@@ -752,9 +752,9 @@ TEST_F(LibRadosSnapshotsSelfManagedECPP, RollbackPP) {
   bufferlist bl2;
   bl2.append(buf2, bsize);
   //Change the middle buffer
-  //ASSERT_EQ((int)sizeof(buf2), ioctx.write("foo", bl2, sizeof(buf2), bufsize));
+  //ASSERT_EQ(0, ioctx.write("foo", bl2, sizeof(buf2), bufsize));
   //Add another after
-  ASSERT_EQ(bsize, ioctx.write("foo", bl2, bsize, bsize*3));
+  ASSERT_EQ(0, ioctx.write("foo", bl2, bsize, bsize*3));
 
   ASSERT_EQ(-EINVAL, ioctx.list_snaps("foo", &ss));
   ObjectReadOperation o;
diff --git a/src/test/librados/stat.cc b/src/test/librados/stat.cc
index f15edd3..5757b30 100644
--- a/src/test/librados/stat.cc
+++ b/src/test/librados/stat.cc
@@ -17,7 +17,7 @@ typedef RadosTestECPP LibRadosStatECPP;
 TEST_F(LibRadosStat, Stat) {
   char buf[128];
   memset(buf, 0xcc, sizeof(buf));
-  ASSERT_EQ((int)sizeof(buf), rados_write(ioctx, "foo", buf, sizeof(buf), 0));
+  ASSERT_EQ(0, rados_write(ioctx, "foo", buf, sizeof(buf), 0));
   uint64_t size;
   time_t mtime;
   ASSERT_EQ(0, rados_stat(ioctx, "foo", &size, &mtime));
@@ -30,7 +30,7 @@ TEST_F(LibRadosStatPP, StatPP) {
   memset(buf, 0xcc, sizeof(buf));
   bufferlist bl;
   bl.append(buf, sizeof(buf));
-  ASSERT_EQ((int)sizeof(buf), ioctx.write("foo", bl, sizeof(buf), 0));
+  ASSERT_EQ(0, ioctx.write("foo", bl, sizeof(buf), 0));
   uint64_t size;
   time_t mtime;
   ASSERT_EQ(0, ioctx.stat("foo", &size, &mtime));
@@ -42,13 +42,13 @@ TEST_F(LibRadosStat, StatNS) {
   char buf[128];
   memset(buf, 0xcc, sizeof(buf));
   rados_ioctx_set_namespace(ioctx, "");
-  ASSERT_EQ((int)sizeof(buf), rados_write(ioctx, "foo", buf, sizeof(buf), 0));
-  ASSERT_EQ((int)sizeof(buf), rados_write(ioctx, "foo2", buf, sizeof(buf), 0));
+  ASSERT_EQ(0, rados_write(ioctx, "foo", buf, sizeof(buf), 0));
+  ASSERT_EQ(0, rados_write(ioctx, "foo2", buf, sizeof(buf), 0));
 
   char buf2[64];
   memset(buf2, 0xcc, sizeof(buf2));
   rados_ioctx_set_namespace(ioctx, "nspace");
-  ASSERT_EQ((int)sizeof(buf2), rados_write(ioctx, "foo", buf2, sizeof(buf2), 0));
+  ASSERT_EQ(0, rados_write(ioctx, "foo", buf2, sizeof(buf2), 0));
 
   uint64_t size;
   time_t mtime;
@@ -70,15 +70,15 @@ TEST_F(LibRadosStatPP, StatPPNS) {
   bufferlist bl;
   bl.append(buf, sizeof(buf));
   ioctx.set_namespace("");
-  ASSERT_EQ((int)sizeof(buf), ioctx.write("foo", bl, sizeof(buf), 0));
-  ASSERT_EQ((int)sizeof(buf), ioctx.write("foo2", bl, sizeof(buf), 0));
+  ASSERT_EQ(0, ioctx.write("foo", bl, sizeof(buf), 0));
+  ASSERT_EQ(0, ioctx.write("foo2", bl, sizeof(buf), 0));
 
   char buf2[64];
   memset(buf2, 0xbb, sizeof(buf2));
   bufferlist bl2;
   bl2.append(buf2, sizeof(buf2));
   ioctx.set_namespace("nspace");
-  ASSERT_EQ((int)sizeof(buf2), ioctx.write("foo", bl2, sizeof(buf2), 0));
+  ASSERT_EQ(0, ioctx.write("foo", bl2, sizeof(buf2), 0));
 
   uint64_t size;
   time_t mtime;
@@ -111,7 +111,7 @@ TEST_F(LibRadosStat, PoolStat) {
   ASSERT_EQ(strlen(actual_pool_name), l);
   ASSERT_EQ(0, strcmp(actual_pool_name, pool_name.c_str()));
   memset(buf, 0xff, sizeof(buf));
-  ASSERT_EQ((int)sizeof(buf), rados_write(ioctx, "foo", buf, sizeof(buf), 0));
+  ASSERT_EQ(0, rados_write(ioctx, "foo", buf, sizeof(buf), 0));
   struct rados_pool_stat_t stats;
   memset(&stats, 0, sizeof(stats));
   ASSERT_EQ(0, rados_ioctx_pool_stat(ioctx, &stats));
@@ -124,7 +124,7 @@ TEST_F(LibRadosStatPP, PoolStatPP) {
   memset(buf, 0xff, sizeof(buf));
   bufferlist bl1;
   bl1.append(buf, sizeof(buf));
-  ASSERT_EQ((int)sizeof(buf), ioctx.write("foo", bl1, sizeof(buf), 0));
+  ASSERT_EQ(0, ioctx.write("foo", bl1, sizeof(buf), 0));
   std::list<std::string> v;
   std::map<std::string,stats_map> stats;
   ASSERT_EQ(0, cluster.get_pool_stats(v, stats));
@@ -133,7 +133,7 @@ TEST_F(LibRadosStatPP, PoolStatPP) {
 TEST_F(LibRadosStatEC, Stat) {
   char buf[128];
   memset(buf, 0xcc, sizeof(buf));
-  ASSERT_EQ((int)sizeof(buf), rados_write(ioctx, "foo", buf, sizeof(buf), 0));
+  ASSERT_EQ(0, rados_write(ioctx, "foo", buf, sizeof(buf), 0));
   uint64_t size;
   time_t mtime;
   ASSERT_EQ(0, rados_stat(ioctx, "foo", &size, &mtime));
@@ -146,7 +146,7 @@ TEST_F(LibRadosStatECPP, StatPP) {
   memset(buf, 0xcc, sizeof(buf));
   bufferlist bl;
   bl.append(buf, sizeof(buf));
-  ASSERT_EQ((int)sizeof(buf), ioctx.write("foo", bl, sizeof(buf), 0));
+  ASSERT_EQ(0, ioctx.write("foo", bl, sizeof(buf), 0));
   uint64_t size;
   time_t mtime;
   ASSERT_EQ(0, ioctx.stat("foo", &size, &mtime));
@@ -158,13 +158,13 @@ TEST_F(LibRadosStatEC, StatNS) {
   char buf[128];
   memset(buf, 0xcc, sizeof(buf));
   rados_ioctx_set_namespace(ioctx, "");
-  ASSERT_EQ((int)sizeof(buf), rados_write(ioctx, "foo", buf, sizeof(buf), 0));
-  ASSERT_EQ((int)sizeof(buf), rados_write(ioctx, "foo2", buf, sizeof(buf), 0));
+  ASSERT_EQ(0, rados_write(ioctx, "foo", buf, sizeof(buf), 0));
+  ASSERT_EQ(0, rados_write(ioctx, "foo2", buf, sizeof(buf), 0));
 
   char buf2[64];
   memset(buf2, 0xcc, sizeof(buf2));
   rados_ioctx_set_namespace(ioctx, "nspace");
-  ASSERT_EQ((int)sizeof(buf2), rados_write(ioctx, "foo", buf2, sizeof(buf2), 0));
+  ASSERT_EQ(0, rados_write(ioctx, "foo", buf2, sizeof(buf2), 0));
 
   uint64_t size;
   time_t mtime;
@@ -186,15 +186,15 @@ TEST_F(LibRadosStatECPP, StatPPNS) {
   bufferlist bl;
   bl.append(buf, sizeof(buf));
   ioctx.set_namespace("");
-  ASSERT_EQ((int)sizeof(buf), ioctx.write("foo", bl, sizeof(buf), 0));
-  ASSERT_EQ((int)sizeof(buf), ioctx.write("foo2", bl, sizeof(buf), 0));
+  ASSERT_EQ(0, ioctx.write("foo", bl, sizeof(buf), 0));
+  ASSERT_EQ(0, ioctx.write("foo2", bl, sizeof(buf), 0));
 
   char buf2[64];
   memset(buf2, 0xbb, sizeof(buf2));
   bufferlist bl2;
   bl2.append(buf2, sizeof(buf2));
   ioctx.set_namespace("nspace");
-  ASSERT_EQ((int)sizeof(buf2), ioctx.write("foo", bl2, sizeof(buf2), 0));
+  ASSERT_EQ(0, ioctx.write("foo", bl2, sizeof(buf2), 0));
 
   uint64_t size;
   time_t mtime;
@@ -227,7 +227,7 @@ TEST_F(LibRadosStatEC, PoolStat) {
   ASSERT_EQ(strlen(actual_pool_name), l);
   ASSERT_EQ(0, strcmp(actual_pool_name, pool_name.c_str()));
   memset(buf, 0xff, sizeof(buf));
-  ASSERT_EQ((int)sizeof(buf), rados_write(ioctx, "foo", buf, sizeof(buf), 0));
+  ASSERT_EQ(0, rados_write(ioctx, "foo", buf, sizeof(buf), 0));
   struct rados_pool_stat_t stats;
   memset(&stats, 0, sizeof(stats));
   ASSERT_EQ(0, rados_ioctx_pool_stat(ioctx, &stats));
@@ -240,7 +240,7 @@ TEST_F(LibRadosStatECPP, PoolStatPP) {
   memset(buf, 0xff, sizeof(buf));
   bufferlist bl1;
   bl1.append(buf, sizeof(buf));
-  ASSERT_EQ((int)sizeof(buf), ioctx.write("foo", bl1, sizeof(buf), 0));
+  ASSERT_EQ(0, ioctx.write("foo", bl1, sizeof(buf), 0));
   std::list<std::string> v;
   std::map<std::string,stats_map> stats;
   ASSERT_EQ(0, cluster.get_pool_stats(v, stats));
diff --git a/src/test/librados/tier.cc b/src/test/librados/tier.cc
index 1543cbd..28e8e7a 100644
--- a/src/test/librados/tier.cc
+++ b/src/test/librados/tier.cc
@@ -2051,7 +2051,7 @@ TEST_F(LibRadosTierPP, HitSetWrite) {
   for (int i=0; i<1000; ++i) {
     bufferlist bl;
     bl.append("a");
-    ASSERT_EQ(1, ioctx.write(stringify(i), bl, 1, 0));
+    ASSERT_EQ(0, ioctx.write(stringify(i), bl, 1, 0));
   }
 
   // get HitSets
@@ -2133,7 +2133,7 @@ TEST_F(LibRadosTierPP, HitSetTrim) {
 
     bufferlist bl;
     bl.append("f");
-    ASSERT_EQ(1, ioctx.write("foo", bl, 1, 0));
+    ASSERT_EQ(0, ioctx.write("foo", bl, 1, 0));
 
     list<pair<time_t, time_t> > ls;
     AioCompletion *c = librados::Rados::aio_create_completion();
@@ -2490,15 +2490,20 @@ TEST_F(LibRadosTwoPoolsECPP, PromoteSnap) {
   // clones in the cache tier)
   // This test requires cache tier and base tier to have the same pg_num/pgp_num
   {
-    IoCtx cache_ioctx;
-    ASSERT_EQ(0, cluster.ioctx_create(cache_pool_name.c_str(), cache_ioctx));
-    ostringstream ss;
-    ss << "{\"prefix\": \"pg scrub\", \"pgid\": \""
-       << cache_ioctx.get_id() << "."
-       << ioctx.get_object_pg_hash_position("foo")
-       << "\"}";
-    ASSERT_EQ(0, cluster.mon_command(ss.str(), inbl, NULL, NULL));
-
+    for (int tries = 0; tries < 5; ++tries) {
+      IoCtx cache_ioctx;
+      ASSERT_EQ(0, cluster.ioctx_create(cache_pool_name.c_str(), cache_ioctx));
+      ostringstream ss;
+      ss << "{\"prefix\": \"pg scrub\", \"pgid\": \""
+	 << cache_ioctx.get_id() << "."
+	 << ioctx.get_object_pg_hash_position("foo")
+	 << "\"}";
+      int r = cluster.mon_command(ss.str(), inbl, NULL, NULL);
+      if (r == -EAGAIN)
+	continue;
+      ASSERT_EQ(0, r);
+      break;
+    }
     // give it a few seconds to go.  this is sloppy but is usually enough time
     cout << "waiting for scrub..." << std::endl;
     sleep(15);
@@ -4011,7 +4016,7 @@ TEST_F(LibRadosTierECPP, HitSetWrite) {
   for (int i=0; i<1000; ++i) {
     bufferlist bl;
     bl.append("a");
-    ASSERT_EQ(1, ioctx.write(stringify(i), bl, 1, 0));
+    ASSERT_EQ(0, ioctx.write(stringify(i), bl, 1, 0));
   }
 
   // get HitSets
@@ -4097,7 +4102,7 @@ TEST_F(LibRadosTierECPP, HitSetTrim) {
 
     bufferlist bl;
     bl.append(buf, bsize);
-    ASSERT_EQ(bsize, ioctx.append("foo", bl, bsize));
+    ASSERT_EQ(0, ioctx.append("foo", bl, bsize));
 
     list<pair<time_t, time_t> > ls;
     AioCompletion *c = librados::Rados::aio_create_completion();
diff --git a/src/test/librados/watch_notify.cc b/src/test/librados/watch_notify.cc
index 44c2a95..d92c1b8 100644
--- a/src/test/librados/watch_notify.cc
+++ b/src/test/librados/watch_notify.cc
@@ -11,7 +11,7 @@
 using namespace librados;
 
 typedef RadosTest LibRadosWatchNotify;
-typedef RadosTestPP LibRadosWatchNotifyPP;
+typedef RadosTestParamPP LibRadosWatchNotifyPP;
 typedef RadosTestEC LibRadosWatchNotifyEC;
 typedef RadosTestECPP LibRadosWatchNotifyECPP;
 
@@ -35,7 +35,7 @@ TEST_F(LibRadosWatchNotify, WatchNotifyTest) {
   ASSERT_EQ(0, sem_init(&sem, 0, 0));
   char buf[128];
   memset(buf, 0xcc, sizeof(buf));
-  ASSERT_EQ((int)sizeof(buf), rados_write(ioctx, "foo", buf, sizeof(buf), 0));
+  ASSERT_EQ(0, rados_write(ioctx, "foo", buf, sizeof(buf), 0));
   uint64_t handle;
   ASSERT_EQ(0,
       rados_watch(ioctx, "foo", 0, &handle, watch_notify_test_cb, NULL));
@@ -46,13 +46,13 @@ TEST_F(LibRadosWatchNotify, WatchNotifyTest) {
   sem_destroy(&sem);
 }
 
-TEST_F(LibRadosWatchNotifyPP, WatchNotifyTestPP) {
+TEST_P(LibRadosWatchNotifyPP, WatchNotifyTestPP) {
   ASSERT_EQ(0, sem_init(&sem, 0, 0));
   char buf[128];
   memset(buf, 0xcc, sizeof(buf));
   bufferlist bl1;
   bl1.append(buf, sizeof(buf));
-  ASSERT_EQ((int)sizeof(buf), ioctx.write("foo", bl1, sizeof(buf), 0));
+  ASSERT_EQ(0, ioctx.write("foo", bl1, sizeof(buf), 0));
   uint64_t handle;
   WatchNotifyTestCtx ctx;
   ASSERT_EQ(0, ioctx.watch("foo", 0, &handle, &ctx));
@@ -66,8 +66,7 @@ TEST_F(LibRadosWatchNotifyPP, WatchNotifyTestPP) {
   ioctx.unwatch("foo", handle);
   sem_destroy(&sem);
 }
-
-TEST_F(LibRadosWatchNotifyPP, WatchNotifyTimeoutTestPP) {
+TEST_P(LibRadosWatchNotifyPP, WatchNotifyTimeoutTestPP) {
   ASSERT_EQ(0, sem_init(&sem, 0, 0));
   ioctx.set_notify_timeout(1);
   uint64_t handle;
@@ -77,7 +76,7 @@ TEST_F(LibRadosWatchNotifyPP, WatchNotifyTimeoutTestPP) {
   memset(buf, 0xcc, sizeof(buf));
   bufferlist bl1;
   bl1.append(buf, sizeof(buf));
-  ASSERT_EQ((int)sizeof(buf), ioctx.write("foo", bl1, sizeof(buf), 0));
+  ASSERT_EQ(0, ioctx.write("foo", bl1, sizeof(buf), 0));
 
   ASSERT_EQ(0, ioctx.watch("foo", 0, &handle, &ctx));
   sem_destroy(&sem);
@@ -87,7 +86,7 @@ TEST_F(LibRadosWatchNotifyEC, WatchNotifyTest) {
   ASSERT_EQ(0, sem_init(&sem, 0, 0));
   char buf[128];
   memset(buf, 0xcc, sizeof(buf));
-  ASSERT_EQ((int)sizeof(buf), rados_write(ioctx, "foo", buf, sizeof(buf), 0));
+  ASSERT_EQ(0, rados_write(ioctx, "foo", buf, sizeof(buf), 0));
   uint64_t handle;
   ASSERT_EQ(0,
       rados_watch(ioctx, "foo", 0, &handle, watch_notify_test_cb, NULL));
@@ -104,7 +103,7 @@ TEST_F(LibRadosWatchNotifyECPP, WatchNotifyTestPP) {
   memset(buf, 0xcc, sizeof(buf));
   bufferlist bl1;
   bl1.append(buf, sizeof(buf));
-  ASSERT_EQ((int)sizeof(buf), ioctx.write("foo", bl1, sizeof(buf), 0));
+  ASSERT_EQ(0, ioctx.write("foo", bl1, sizeof(buf), 0));
   uint64_t handle;
   WatchNotifyTestCtx ctx;
   ASSERT_EQ(0, ioctx.watch("foo", 0, &handle, &ctx));
@@ -129,8 +128,12 @@ TEST_F(LibRadosWatchNotifyECPP, WatchNotifyTimeoutTestPP) {
   memset(buf, 0xcc, sizeof(buf));
   bufferlist bl1;
   bl1.append(buf, sizeof(buf));
-  ASSERT_EQ((int)sizeof(buf), ioctx.write("foo", bl1, sizeof(buf), 0));
+  ASSERT_EQ(0, ioctx.write("foo", bl1, sizeof(buf), 0));
 
   ASSERT_EQ(0, ioctx.watch("foo", 0, &handle, &ctx));
   sem_destroy(&sem);
 }
+
+
+INSTANTIATE_TEST_CASE_P(LibRadosWatchNotifyPPTests, LibRadosWatchNotifyPP,
+			::testing::Values("", "cache"));
diff --git a/src/test/librbd/test_librbd.cc b/src/test/librbd/test_librbd.cc
index d0b9c99..7f35418 100644
--- a/src/test/librbd/test_librbd.cc
+++ b/src/test/librbd/test_librbd.cc
@@ -1777,6 +1777,88 @@ TEST(LibRBD, DiffIterateStress)
   ASSERT_EQ(0, destroy_one_pool_pp(pool_name, rados));
 }
 
+TEST(LibRBD, ZeroLengthWrite)
+{
+  rados_t cluster;
+  rados_ioctx_t ioctx;
+  string pool_name = get_temp_pool_name();
+  ASSERT_EQ("", create_one_pool(pool_name, &cluster));
+  rados_ioctx_create(cluster, pool_name.c_str(), &ioctx);
+
+  rbd_image_t image;
+  int order = 0;
+  const char *name = "testimg";
+  uint64_t size = 2 << 20;
+
+  ASSERT_EQ(0, create_image(ioctx, name, size, &order));
+  ASSERT_EQ(0, rbd_open(ioctx, name, &image, NULL));
+
+  char read_data[1];
+  ASSERT_EQ(0, rbd_write(image, 0, 0, NULL));
+  ASSERT_EQ(1, rbd_read(image, 0, 1, read_data));
+  ASSERT_EQ('\0', read_data[0]);
+
+  ASSERT_EQ(0, rbd_close(image));
+
+  rados_ioctx_destroy(ioctx);
+  ASSERT_EQ(0, destroy_one_pool(pool_name, &cluster));
+}
+
+
+TEST(LibRBD, ZeroLengthDiscard)
+{
+  rados_t cluster;
+  rados_ioctx_t ioctx;
+  string pool_name = get_temp_pool_name();
+  ASSERT_EQ("", create_one_pool(pool_name, &cluster));
+  rados_ioctx_create(cluster, pool_name.c_str(), &ioctx);
+
+  rbd_image_t image;
+  int order = 0;
+  const char *name = "testimg";
+  uint64_t size = 2 << 20;
+
+  ASSERT_EQ(0, create_image(ioctx, name, size, &order));
+  ASSERT_EQ(0, rbd_open(ioctx, name, &image, NULL));
+
+  const char *data = "blah";
+  char read_data[strlen(data)];
+  ASSERT_EQ((int)strlen(data), rbd_write(image, 0, strlen(data), data));
+  ASSERT_EQ(0, rbd_discard(image, 0, 0));
+  ASSERT_EQ((int)strlen(data), rbd_read(image, 0, strlen(data), read_data));
+  ASSERT_EQ(0, memcmp(data, read_data, strlen(data)));
+
+  ASSERT_EQ(0, rbd_close(image));
+
+  rados_ioctx_destroy(ioctx);
+  ASSERT_EQ(0, destroy_one_pool(pool_name, &cluster));
+}
+
+TEST(LibRBD, ZeroLengthRead)
+{
+  rados_t cluster;
+  rados_ioctx_t ioctx;
+  string pool_name = get_temp_pool_name();
+  ASSERT_EQ("", create_one_pool(pool_name, &cluster));
+  rados_ioctx_create(cluster, pool_name.c_str(), &ioctx);
+
+  rbd_image_t image;
+  int order = 0;
+  const char *name = "testimg";
+  uint64_t size = 2 << 20;
+
+  ASSERT_EQ(0, create_image(ioctx, name, size, &order));
+  ASSERT_EQ(0, rbd_open(ioctx, name, &image, NULL));
+
+  char read_data[1];
+  ASSERT_EQ(0, rbd_read(image, 0, 0, read_data));
+
+  ASSERT_EQ(0, rbd_close(image));
+
+  rados_ioctx_destroy(ioctx);
+  ASSERT_EQ(0, destroy_one_pool(pool_name, &cluster));
+}
+
 int main(int argc, char **argv)
 {
   ::testing::InitGoogleTest(&argc, argv);
diff --git a/src/test/osd/RadosModel.h b/src/test/osd/RadosModel.h
index 5bb8d2e..e81699d 100644
--- a/src/test/osd/RadosModel.h
+++ b/src/test/osd/RadosModel.h
@@ -138,6 +138,7 @@ public:
 
   void begin();
   void finish(CallbackInfo *info);
+  virtual bool must_quiesce_other_ops() { return false; }
 };
 
 class TestOpGenerator {
@@ -175,13 +176,16 @@ public:
   const uint64_t max_stride_size;
   AttrGenerator attr_gen;
   const bool no_omap;
-	
+  bool pool_snaps;
+  int snapname_num;
+
   RadosTestContext(const string &pool_name, 
 		   int max_in_flight,
 		   uint64_t max_size,
 		   uint64_t min_stride_size,
 		   uint64_t max_stride_size,
 		   bool no_omap,
+		   bool pool_snaps,
 		   const char *id = 0) :
     state_lock("Context Lock"),
     pool_obj_cont(),
@@ -195,7 +199,9 @@ public:
     max_size(max_size), 
     min_stride_size(min_stride_size), max_stride_size(max_stride_size),
     attr_gen(2000),
-    no_omap(no_omap)
+    no_omap(no_omap),
+    pool_snaps(pool_snaps),
+    snapname_num(0)
   {
   }
 
@@ -242,7 +248,13 @@ public:
     state_lock.Lock();
 
     TestOp *next = gen->next(*this);
+    TestOp *waiting = NULL;
+
     while (next || !inflight.empty()) {
+      if (next && next->must_quiesce_other_ops() && !inflight.empty()) {
+	waiting = next;
+	next = NULL;   // Force to wait for inflight to drain
+      }
       if (next) {
 	inflight.push_back(next);
       }
@@ -270,7 +282,12 @@ public:
 	  break;
 	}
       }
-      next = gen->next(*this);
+      if (waiting) {
+	next = waiting;
+	waiting = NULL;
+      } else {
+	next = gen->next(*this);
+      }
     }
     state_lock.Unlock();
   }
@@ -728,7 +745,6 @@ public:
     cont_gen->get_ranges_map(cont, ranges);
     std::cout << num << ":  seq_num " << context->seq_num << " ranges " << ranges << std::endl;
     context->seq_num++;
-    context->state_lock.Unlock();
 
     waiting_on = ranges.size();
     //cout << " waiting_on = " << waiting_on << std::endl;
@@ -795,6 +811,7 @@ public:
       &read_op,
       librados::OPERATION_ORDER_READS_WRITES,  // order wrt previous write/update
       0);
+    context->state_lock.Unlock();
   }
 
   void _finish(CallbackInfo *info)
@@ -1160,24 +1177,47 @@ public:
   void _begin()
   {
     uint64_t snap;
-    assert(!context->io_ctx.selfmanaged_snap_create(&snap));
+    string snapname;
+
+    if (context->pool_snaps) {
+      stringstream ss;
+
+      ss << context->prefix << "snap" << ++context->snapname_num;
+      snapname = ss.str();
+
+      int ret = context->io_ctx.snap_create(snapname.c_str());
+      if (ret) {
+	cerr << "snap_create returned " << ret << std::endl;
+	assert(0);
+      }
+      assert(!context->io_ctx.snap_lookup(snapname.c_str(), &snap));
+
+    } else {
+      assert(!context->io_ctx.selfmanaged_snap_create(&snap));
+    }
 
     context->state_lock.Lock();
     context->add_snap(snap);
 
-    vector<uint64_t> snapset(context->snaps.size());
-    int j = 0;
-    for (map<int,uint64_t>::reverse_iterator i = context->snaps.rbegin();
-	 i != context->snaps.rend();
-	 ++i, ++j) {
-      snapset[j] = i->second;
-    }
+    if (context->pool_snaps) {
+      context->state_lock.Unlock();
+    } else {
+      vector<uint64_t> snapset(context->snaps.size());
 
-    context->state_lock.Unlock();
-    int r = context->io_ctx.selfmanaged_snap_set_write_ctx(context->seq, snapset);
-    if (r) {
-      cerr << "r is " << r << " snapset is " << snapset << " seq is " << context->seq << std::endl;
-      assert(0);
+      int j = 0;
+      for (map<int,uint64_t>::reverse_iterator i = context->snaps.rbegin();
+	   i != context->snaps.rend();
+	   ++i, ++j) {
+	snapset[j] = i->second;
+      }
+
+      context->state_lock.Unlock();
+
+      int r = context->io_ctx.selfmanaged_snap_set_write_ctx(context->seq, snapset);
+      if (r) {
+	cerr << "r is " << r << " snapset is " << snapset << " seq is " << context->seq << std::endl;
+	assert(0);
+      }
     }
   }
 
@@ -1185,6 +1225,7 @@ public:
   {
     return "SnapCreateOp";
   }
+  bool must_quiesce_other_ops() { return context->pool_snaps; }
 };
 
 class SnapRemoveOp : public TestOp {
@@ -1204,20 +1245,27 @@ public:
     context->remove_snap(to_remove);
     context->state_lock.Unlock();
 
-    assert(!context->io_ctx.selfmanaged_snap_remove(snap));
+    if (context->pool_snaps) {
+      string snapname;
 
-    vector<uint64_t> snapset(context->snaps.size());
-    int j = 0;
-    for (map<int,uint64_t>::reverse_iterator i = context->snaps.rbegin();
-	 i != context->snaps.rend();
-	 ++i, ++j) {
-      snapset[j] = i->second;
-    }
+      assert(!context->io_ctx.snap_get_name(snap, &snapname));
+      assert(!context->io_ctx.snap_remove(snapname.c_str()));
+     } else {
+      assert(!context->io_ctx.selfmanaged_snap_remove(snap));
 
-    int r = context->io_ctx.selfmanaged_snap_set_write_ctx(context->seq, snapset);
-    if (r) {
-      cerr << "r is " << r << " snapset is " << snapset << " seq is " << context->seq << std::endl;
-      assert(0);
+      vector<uint64_t> snapset(context->snaps.size());
+      int j = 0;
+      for (map<int,uint64_t>::reverse_iterator i = context->snaps.rbegin();
+	   i != context->snaps.rend();
+	   ++i, ++j) {
+	snapset[j] = i->second;
+      }
+
+      int r = context->io_ctx.selfmanaged_snap_set_write_ctx(context->seq, snapset);
+      if (r) {
+	cerr << "r is " << r << " snapset is " << snapset << " seq is " << context->seq << std::endl;
+	assert(0);
+      }
     }
   }
 
@@ -1341,7 +1389,11 @@ public:
 
     context->state_lock.Unlock();
 
-    op.selfmanaged_snap_rollback(snap);
+    if (context->pool_snaps) {
+      op.snap_rollback(snap);
+    } else {
+      op.selfmanaged_snap_rollback(snap);
+    }
 
     pair<TestOp*, TestOp::CallbackInfo*> *cb_arg =
       new pair<TestOp*, TestOp::CallbackInfo*>(this,
diff --git a/src/test/osd/TestRados.cc b/src/test/osd/TestRados.cc
index 15c8ff7..b61ceb3 100644
--- a/src/test/osd/TestRados.cc
+++ b/src/test/osd/TestRados.cc
@@ -233,6 +233,7 @@ int main(int argc, char **argv)
   int64_t size = 4000000; // 4 MB
   int64_t min_stride_size = -1, max_stride_size = -1;
   int max_seconds = 0;
+  bool pool_snaps = false;
 
   struct {
     TestOpType op;
@@ -283,6 +284,8 @@ int main(int argc, char **argv)
       max_stride_size = atoi(argv[++i]);
     else if (strcmp(argv[i], "--no-omap") == 0)
       no_omap = true;
+    else if (strcmp(argv[i], "--pool-snaps") == 0)
+      pool_snaps = true;
     else if (strcmp(argv[i], "--ec-pool") == 0) {
       if (!op_weights.empty()) {
 	cerr << "--ec-pool must be specified prior to any ops" << std::endl;
@@ -292,6 +295,10 @@ int main(int argc, char **argv)
       no_omap = true;
     } else if (strcmp(argv[i], "--op") == 0) {
       i++;
+      if (i == argc) {
+        cerr << "Missing op after --op" << std::endl;
+        return 1;
+      }
       int j;
       for (j = 0; op_types[j].name; ++j) {
 	if (strcmp(op_types[j].name, argv[i]) == 0) {
@@ -302,7 +309,12 @@ int main(int argc, char **argv)
 	cerr << "unknown op " << argv[i] << std::endl;
 	exit(1);
       }
-      int weight = atoi(argv[++i]);
+      i++;
+      if (i == argc) {
+	cerr << "Weight unspecified." << std::endl;
+	return 1;
+      }
+      int weight = atoi(argv[i]);
       if (weight < 0) {
 	cerr << "Weights must be nonnegative." << std::endl;
 	return 1;
@@ -368,6 +380,7 @@ int main(int argc, char **argv)
     min_stride_size,
     max_stride_size,
     no_omap,
+    pool_snaps,
     id);
 
   TestOpStat stats;
diff --git a/src/test/osd/types.cc b/src/test/osd/types.cc
index 6380211..a04f2cb 100644
--- a/src/test/osd/types.cc
+++ b/src/test/osd/types.cc
@@ -155,6 +155,8 @@ TEST(pg_interval_t, check_new_interval)
   int new_primary = osd_id;
   vector<int> new_up;
   new_up.push_back(osd_id);
+  int old_up_primary = osd_id;
+  int new_up_primary = osd_id;
   vector<int> old_up = new_up;
   pg_t pgid;
   pgid.set_pool(pool_id);
@@ -172,6 +174,8 @@ TEST(pg_interval_t, check_new_interval)
 						   new_primary,
 						   old_acting,
 						   new_acting,
+						   old_up_primary,
+						   new_up_primary,
 						   old_up,
 						   new_up,
 						   same_interval_since,
@@ -200,6 +204,8 @@ TEST(pg_interval_t, check_new_interval)
 						  new_primary,
 						  old_acting,
 						  new_acting,
+						  old_up_primary,
+						  new_up_primary,
 						  old_up,
 						  new_up,
 						  same_interval_since,
@@ -231,6 +237,8 @@ TEST(pg_interval_t, check_new_interval)
 						  new_primary,
 						  old_acting,
 						  new_acting,
+						  old_up_primary,
+						  new_up_primary,
 						  old_up,
 						  new_up,
 						  same_interval_since,
@@ -263,6 +271,40 @@ TEST(pg_interval_t, check_new_interval)
 						  new_primary,
 						  old_acting,
 						  new_acting,
+						  old_up_primary,
+						  new_up_primary,
+						  old_up,
+						  new_up,
+						  same_interval_since,
+						  last_epoch_clean,
+						  osdmap,
+						  lastmap,
+						  pool_id,
+						  pgid,
+						  &past_intervals));
+    ASSERT_EQ((unsigned int)1, past_intervals.size());
+    ASSERT_EQ(same_interval_since, past_intervals[same_interval_since].first);
+    ASSERT_EQ(osdmap->get_epoch() - 1, past_intervals[same_interval_since].last);
+    ASSERT_EQ(osd_id, past_intervals[same_interval_since].acting[0]);
+    ASSERT_EQ(osd_id, past_intervals[same_interval_since].up[0]);
+  }
+
+  //
+  // The up primary has changed
+  //
+  {
+    vector<int> new_up;
+    int _new_up_primary = osd_id + 1;
+
+    map<epoch_t, pg_interval_t> past_intervals;
+
+    ASSERT_TRUE(past_intervals.empty());
+    ASSERT_TRUE(pg_interval_t::check_new_interval(old_primary,
+						  new_primary,
+						  old_acting,
+						  new_acting,
+						  old_up_primary,
+						  _new_up_primary,
 						  old_up,
 						  new_up,
 						  same_interval_since,
@@ -300,6 +342,8 @@ TEST(pg_interval_t, check_new_interval)
 						  new_primary,
 						  old_acting,
 						  new_acting,
+						  old_up_primary,
+						  new_up_primary,
 						  old_up,
 						  new_up,
 						  same_interval_since,
@@ -337,6 +381,8 @@ TEST(pg_interval_t, check_new_interval)
 						  new_primary,
 						  old_acting,
 						  new_acting,
+						  old_up_primary,
+						  new_up_primary,
 						  old_up,
 						  new_up,
 						  same_interval_since,
@@ -369,6 +415,8 @@ TEST(pg_interval_t, check_new_interval)
 						  new_primary,
 						  old_acting,
 						  new_acting,
+						  old_up_primary,
+						  new_up_primary,
 						  old_up,
 						  new_up,
 						  same_interval_since,
@@ -419,6 +467,8 @@ TEST(pg_interval_t, check_new_interval)
 						  new_primary,
 						  old_acting,
 						  new_acting,
+						  old_up_primary,
+						  new_up_primary,
 						  old_up,
 						  new_up,
 						  same_interval_since,
@@ -452,6 +502,8 @@ TEST(pg_interval_t, check_new_interval)
 						  new_primary,
 						  old_acting,
 						  new_acting,
+						  old_up_primary,
+						  new_up_primary,
 						  old_up,
 						  new_up,
 						  same_interval_since,
@@ -495,6 +547,8 @@ TEST(pg_interval_t, check_new_interval)
 						  new_primary,
 						  old_acting,
 						  new_acting,
+						  old_up_primary,
+						  new_up_primary,
 						  old_up,
 						  new_up,
 						  same_interval_since,
@@ -542,6 +596,8 @@ TEST(pg_interval_t, check_new_interval)
 						  new_primary,
 						  old_acting,
 						  new_acting,
+						  old_up_primary,
+						  new_up_primary,
 						  old_up,
 						  new_up,
 						  same_interval_since,
@@ -558,6 +614,18 @@ TEST(pg_interval_t, check_new_interval)
   }
 }
 
+TEST(pg_t, get_ancestor)
+{
+  ASSERT_EQ(pg_t(0, 0, -1), pg_t(16, 0, -1).get_ancestor(16));
+  ASSERT_EQ(pg_t(1, 0, -1), pg_t(17, 0, -1).get_ancestor(16));
+  ASSERT_EQ(pg_t(0, 0, -1), pg_t(16, 0, -1).get_ancestor(8));
+  ASSERT_EQ(pg_t(16, 0, -1), pg_t(16, 0, -1).get_ancestor(80));
+  ASSERT_EQ(pg_t(16, 0, -1), pg_t(16, 0, -1).get_ancestor(83));
+  ASSERT_EQ(pg_t(1, 0, -1), pg_t(1321, 0, -1).get_ancestor(123).get_ancestor(8));
+  ASSERT_EQ(pg_t(3, 0, -1), pg_t(1323, 0, -1).get_ancestor(123).get_ancestor(8));
+  ASSERT_EQ(pg_t(3, 0, -1), pg_t(1323, 0, -1).get_ancestor(8));
+}
+
 TEST(pg_t, split)
 {
   pg_t pgid(0, 0, -1);
diff --git a/src/test/system/rados_list_parallel.cc b/src/test/system/rados_list_parallel.cc
index 5fc4bfb..7af86e4 100644
--- a/src/test/system/rados_list_parallel.cc
+++ b/src/test/system/rados_list_parallel.cc
@@ -178,7 +178,7 @@ public:
 
       std::string buf(StRadosCreatePool::get_random_buf(256));
       int ret = rados_write(io_ctx, oid.c_str(), buf.c_str(), buf.size(), 0);
-      if (ret != (int)buf.size()) {
+      if (ret != 0) {
 	printf("%s: rados_write(%s) failed with error %d\n",
 	       get_id_str(), oid.c_str(), ret);
 	return ret;
diff --git a/src/test/system/st_rados_create_pool.cc b/src/test/system/st_rados_create_pool.cc
index 29b8d14..6bbe628 100644
--- a/src/test/system/st_rados_create_pool.cc
+++ b/src/test/system/st_rados_create_pool.cc
@@ -91,7 +91,7 @@ run()
     snprintf(oid, sizeof(oid), "%d%s", i, m_suffix.c_str());
     std::string buf(get_random_buf(256));
     int ret = rados_write(io_ctx, oid, buf.c_str(), buf.size(), 0);
-    if (ret < static_cast<int>(buf.size())) {
+    if (ret != 0) {
       printf("%s: rados_write error %d\n", get_id_str(), ret);
       return ret;
     }
diff --git a/src/test/test_stress_watch.cc b/src/test/test_stress_watch.cc
index 1b8f190..52b480b 100644
--- a/src/test/test_stress_watch.cc
+++ b/src/test/test_stress_watch.cc
@@ -14,6 +14,8 @@
 #include <iostream>
 #include <string>
 
+#include "test/librados/TestCase.h"
+
 
 using namespace librados;
 using ceph::buffer;
@@ -54,7 +56,13 @@ struct WatcherUnwatcher : public Thread {
     return NULL;
   }
 };
-TEST(WatchStress, Stress1) {
+
+typedef RadosTestParamPP WatchStress;
+
+INSTANTIATE_TEST_CASE_P(WatchStressTests, WatchStress,
+			::testing::Values("", "cache"));
+
+TEST_P(WatchStress, Stress1) {
   ASSERT_EQ(0, sem_init(&sem, 0, 0));
   Rados ncluster;
   std::string pool_name = get_temp_pool_name();
diff --git a/src/tools/ceph_monstore_tool.cc b/src/tools/ceph_monstore_tool.cc
index 8f294c4..231d257 100644
--- a/src/tools/ceph_monstore_tool.cc
+++ b/src/tools/ceph_monstore_tool.cc
@@ -34,6 +34,7 @@
 #include "mon/Paxos.h"
 #include "common/Formatter.h"
 #include "include/stringify.h"
+#include "common/errno.h"
 
 namespace po = boost::program_options;
 using namespace std;
diff --git a/src/tools/ceph_osdomap_tool.cc b/src/tools/ceph_osdomap_tool.cc
index bde4b28..f368a4b 100644
--- a/src/tools/ceph_osdomap_tool.cc
+++ b/src/tools/ceph_osdomap_tool.cc
@@ -32,6 +32,7 @@
 #include "os/LevelDBStore.h"
 #include "mon/MonitorDBStore.h"
 #include "os/DBObjectMap.h"
+#include "common/errno.h"
 
 namespace po = boost::program_options;
 using namespace std;
diff --git a/src/tools/crushtool.cc b/src/tools/crushtool.cc
index 0c6d7ba..8dcd79c 100644
--- a/src/tools/crushtool.cc
+++ b/src/tools/crushtool.cc
@@ -586,7 +586,7 @@ int main(int argc, const char **argv)
 	int id;
 	int r = crush_add_bucket(crush.crush, 0, b, &id);
 	if (r < 0) {
-	  dout(2) << "Couldn't add bucket: " << strerror(-r) << dendl;
+	  dout(2) << "Couldn't add bucket: " << cpp_strerror(r) << dendl;
 	}
 
 	char format[20];
@@ -722,8 +722,7 @@ int main(int argc, const char **argv)
       crush.encode(bl);
       int r = bl.write_file(outfn.c_str());
       if (r < 0) {
-	char buf[80];
-	cerr << me << ": error writing '" << outfn << "': " << strerror_r(-r, buf, sizeof(buf)) << std::endl;
+	cerr << me << ": error writing '" << outfn << "': " << cpp_strerror(r) << std::endl;
 	exit(1);
       }
       if (verbose)
diff --git a/src/tools/monmaptool.cc b/src/tools/monmaptool.cc
index 57843aa..f11858c 100644
--- a/src/tools/monmaptool.cc
+++ b/src/tools/monmaptool.cc
@@ -23,6 +23,7 @@ using namespace std;
 
 #include "common/config.h"
 #include "common/ceph_argparse.h"
+#include "common/errno.h"
 #include "global/global_init.h"
 #include "mon/MonMap.h"
 #include "include/str_list.h"
@@ -115,9 +116,8 @@ int main(int argc, const char **argv)
     }
   }
 
-  char buf[80];
   if (!create && r < 0) {
-    cerr << me << ": couldn't open " << fn << ": " << strerror_r(-r, buf, sizeof(buf)) << std::endl;
+    cerr << me << ": couldn't open " << fn << ": " << cpp_strerror(r) << std::endl;
     return -1;
   }    
   else if (create && !clobber && r == 0) {
@@ -198,7 +198,7 @@ int main(int argc, const char **argv)
 	 << std::endl;
     int r = monmap.write(fn.c_str());
     if (r < 0) {
-      cerr << "monmaptool: error writing to '" << fn << "': " << strerror_r(-r, buf, sizeof(buf)) << std::endl;
+      cerr << "monmaptool: error writing to '" << fn << "': " << cpp_strerror(r) << std::endl;
       return 1;
     }
   }
diff --git a/src/tools/rados/rados.cc b/src/tools/rados/rados.cc
index caed62b..5b9297a 100644
--- a/src/tools/rados/rados.cc
+++ b/src/tools/rados/rados.cc
@@ -86,10 +86,13 @@ void usage(ostream& out)
 "   rollback <obj-name> <snap-name>  roll back object to snap <snap-name>\n"
 "\n"
 "   listsnaps <obj-name>             list the snapshots of this object\n"
-"   bench <seconds> write|seq|rand [-t concurrent_operations] [--no-cleanup]\n"
+"   bench <seconds> write|seq|rand [-t concurrent_operations] [--no-cleanup] [--run-name run_name]\n"
 "                                    default is 16 concurrent IOs and 4 MB ops\n"
 "                                    default is to clean up after write benchmark\n"
-"   cleanup <prefix>                 clean up a previous benchmark operation\n"
+"                                    default run-name is 'benchmark_last_metadata'\n"
+"   cleanup [--run-name run_name] [--prefix prefix]\n"
+"                                    clean up a previous benchmark operation\n"
+"                                    default run-name is 'benchmark_last_metadata'\n"
 "   load-gen [options]               generate load on the cluster\n"
 "   listomapkeys <obj-name>          list the keys in the object map\n"
 "   listomapvals <obj-name>          list the keys and vals in the object map \n"
@@ -391,8 +394,7 @@ static int do_copy_pool(Rados& rados, const char *src_pool, const char *target_p
     target_ctx.locator_set_key(locator);
     ret = do_copy(src_ctx, oid.c_str(), target_ctx, oid.c_str());
     if (ret < 0) {
-      char buf[64];
-      cerr << "error copying object: " << strerror_r(errno, buf, sizeof(buf)) << std::endl;
+      cerr << "error copying object: " << cpp_strerror(errno) << std::endl;
       return ret;
     }
   }
@@ -413,8 +415,7 @@ static int do_put(IoCtx& io_ctx, const char *objname, const char *infile, int op
   if (!stdio)
     fd = open(infile, O_RDONLY);
   if (fd < 0) {
-    char buf[80];
-    cerr << "error reading input file " << infile << ": " << strerror_r(errno, buf, sizeof(buf)) << std::endl;
+    cerr << "error reading input file " << infile << ": " << cpp_strerror(errno) << std::endl;
     return 1;
   }
   char *buf = new char[op_size];
@@ -479,7 +480,7 @@ int gen_rand_alphanumeric(char *dest, int size) /* size should be the required s
 {
   int ret = get_random_bytes(dest, size);
   if (ret < 0) {
-    cerr << "cannot get random bytes: " << cpp_strerror(-ret) << std::endl;
+    cerr << "cannot get random bytes: " << cpp_strerror(ret) << std::endl;
     return -1;
   }
 
@@ -633,7 +634,7 @@ int LoadGen::bootstrap(const char *pool)
 
   int ret = rados->ioctx_create(pool, io_ctx);
   if (ret < 0) {
-    cerr << "error opening pool " << pool << ": " << strerror_r(-ret, buf, sizeof(buf)) << std::endl;
+    cerr << "error opening pool " << pool << ": " << cpp_strerror(ret) << std::endl;
     return ret;
   }
 
@@ -923,8 +924,6 @@ static int do_lock_cmd(std::vector<const char*> &nargs,
                        IoCtx *ioctx,
 		       Formatter *formatter)
 {
-  char buf[128];
-
   if (nargs.size() < 3)
     usage_exit();
 
@@ -971,7 +970,7 @@ static int do_lock_cmd(std::vector<const char*> &nargs,
     list<string> locks;
     int ret = rados::cls::lock::list_locks(ioctx, oid, &locks);
     if (ret < 0) {
-      cerr << "ERROR: rados_list_locks(): " << strerror_r(-ret, buf, sizeof(buf)) << std::endl;
+      cerr << "ERROR: rados_list_locks(): " << cpp_strerror(ret) << std::endl;
       return ret;
     }
 
@@ -1001,7 +1000,7 @@ static int do_lock_cmd(std::vector<const char*> &nargs,
     string tag;
     int ret = rados::cls::lock::get_lock_info(ioctx, oid, lock_name, &lockers, &type, &tag);
     if (ret < 0) {
-      cerr << "ERROR: rados_lock_get_lock_info(): " << strerror_r(-ret, buf, sizeof(buf)) << std::endl;
+      cerr << "ERROR: rados_lock_get_lock_info(): " << cpp_strerror(ret) << std::endl;
       return ret;
     }
 
@@ -1042,7 +1041,7 @@ static int do_lock_cmd(std::vector<const char*> &nargs,
       ret = l.lock_exclusive(ioctx, oid);
     }
     if (ret < 0) {
-      cerr << "ERROR: failed locking: " << strerror_r(-ret, buf, sizeof(buf)) << std::endl;
+      cerr << "ERROR: failed locking: " << cpp_strerror(ret) << std::endl;
       return ret;
     }
 
@@ -1064,7 +1063,7 @@ static int do_lock_cmd(std::vector<const char*> &nargs,
     }
     int ret = l.break_lock(ioctx, oid, name);
     if (ret < 0) {
-      cerr << "ERROR: failed breaking lock: " << strerror_r(-ret, buf, sizeof(buf)) << std::endl;
+      cerr << "ERROR: failed breaking lock: " << cpp_strerror(ret) << std::endl;
       return ret;
     }
   } else {
@@ -1196,6 +1195,9 @@ static int rados_tool_common(const std::map < std::string, std::string > &opts,
 
   bool show_time = false;
 
+  const char* run_name = NULL;
+  const char* prefix = NULL;
+
   Formatter *formatter = NULL;
   bool pretty_format = false;
 
@@ -1230,6 +1232,14 @@ static int rados_tool_common(const std::map < std::string, std::string > &opts,
   if (i != opts.end()) {
     concurrent_ios = strtol(i->second.c_str(), NULL, 10);
   }
+  i = opts.find("run-name");
+  if (i != opts.end()) {
+    run_name = i->second.c_str();
+  }
+  i = opts.find("prefix");
+  if (i != opts.end()) {
+    prefix = i->second.c_str();
+  }
   i = opts.find("block-size");
   if (i != opts.end()) {
     op_size = strtol(i->second.c_str(), NULL, 10);
@@ -1326,7 +1336,6 @@ static int rados_tool_common(const std::map < std::string, std::string > &opts,
      ret = -1;
      goto out;
   }
-  char buf[80];
 
   if (create_pool && !pool_name) {
     cerr << "--create-pool requested but pool_name was not specified!" << std::endl;
@@ -1337,7 +1346,7 @@ static int rados_tool_common(const std::map < std::string, std::string > &opts,
     ret = rados.pool_create(pool_name, 0, 0);
     if (ret < 0) {
       cerr << "error creating pool " << pool_name << ": "
-	   << strerror_r(-ret, buf, sizeof(buf)) << std::endl;
+	   << cpp_strerror(ret) << std::endl;
       goto out;
     }
   }
@@ -1347,7 +1356,7 @@ static int rados_tool_common(const std::map < std::string, std::string > &opts,
     ret = rados.ioctx_create(pool_name, io_ctx);
     if (ret < 0) {
       cerr << "error opening pool " << pool_name << ": "
-	   << strerror_r(-ret, buf, sizeof(buf)) << std::endl;
+	   << cpp_strerror(ret) << std::endl;
       goto out;
     }
   }
@@ -1356,7 +1365,7 @@ static int rados_tool_common(const std::map < std::string, std::string > &opts,
   if (snapname) {
     ret = io_ctx.snap_lookup(snapname, &snapid);
     if (ret < 0) {
-      cerr << "error looking up snap '" << snapname << "': " << strerror_r(-ret, buf, sizeof(buf)) << std::endl;
+      cerr << "error looking up snap '" << snapname << "': " << cpp_strerror(ret) << std::endl;
       goto out;
     }
   }
@@ -1550,7 +1559,7 @@ static int rados_tool_common(const std::map < std::string, std::string > &opts,
     ret = io_ctx.set_auid(new_auid);
     if (ret < 0) {
       cerr << "error changing auid on pool " << io_ctx.get_pool_name() << ':'
-	   << strerror_r(-ret, buf, sizeof(buf)) << std::endl;
+	   << cpp_strerror(ret) << std::endl;
     } else cerr << "changed auid on pool " << io_ctx.get_pool_name()
 		<< " to " << new_auid << std::endl;
   }
@@ -1578,7 +1587,7 @@ static int rados_tool_common(const std::map < std::string, std::string > &opts,
     ret = io_ctx.stat(oid, &size, &mtime);
     if (ret < 0) {
       cerr << " error stat-ing " << pool_name << "/" << oid << ": "
-           << strerror_r(-ret, buf, sizeof(buf)) << std::endl;
+           << cpp_strerror(ret) << std::endl;
       goto out;
     } else {
       cout << pool_name << "/" << oid
@@ -1590,7 +1599,7 @@ static int rados_tool_common(const std::map < std::string, std::string > &opts,
       usage_exit();
     ret = do_get(io_ctx, nargs[1], nargs[2], op_size);
     if (ret < 0) {
-      cerr << "error getting " << pool_name << "/" << nargs[1] << ": " << strerror_r(-ret, buf, sizeof(buf)) << std::endl;
+      cerr << "error getting " << pool_name << "/" << nargs[1] << ": " << cpp_strerror(ret) << std::endl;
       goto out;
     }
   }
@@ -1599,7 +1608,7 @@ static int rados_tool_common(const std::map < std::string, std::string > &opts,
       usage_exit();
     ret = do_put(io_ctx, nargs[1], nargs[2], op_size);
     if (ret < 0) {
-      cerr << "error putting " << pool_name << "/" << nargs[1] << ": " << strerror_r(-ret, buf, sizeof(buf)) << std::endl;
+      cerr << "error putting " << pool_name << "/" << nargs[1] << ": " << cpp_strerror(ret) << std::endl;
       goto out;
     }
   }
@@ -1617,7 +1626,7 @@ static int rados_tool_common(const std::map < std::string, std::string > &opts,
     if (ret < 0) {
       cerr << "error truncating oid "
 	   << oid << " to " << size << ": "
-	   << strerror_r(-ret, buf, sizeof(buf)) << std::endl;
+	   << cpp_strerror(ret) << std::endl;
     } else {
       ret = 0;
     }
@@ -1635,7 +1644,7 @@ static int rados_tool_common(const std::map < std::string, std::string > &opts,
 
     ret = io_ctx.setxattr(oid, attr_name.c_str(), bl);
     if (ret < 0) {
-      cerr << "error setting xattr " << pool_name << "/" << oid << "/" << attr_name << ": " << strerror_r(-ret, buf, sizeof(buf)) << std::endl;
+      cerr << "error setting xattr " << pool_name << "/" << oid << "/" << attr_name << ": " << cpp_strerror(ret) << std::endl;
       goto out;
     }
     else
@@ -1651,7 +1660,7 @@ static int rados_tool_common(const std::map < std::string, std::string > &opts,
     bufferlist bl;
     ret = io_ctx.getxattr(oid, attr_name.c_str(), bl);
     if (ret < 0) {
-      cerr << "error getting xattr " << pool_name << "/" << oid << "/" << attr_name << ": " << strerror_r(-ret, buf, sizeof(buf)) << std::endl;
+      cerr << "error getting xattr " << pool_name << "/" << oid << "/" << attr_name << ": " << cpp_strerror(ret) << std::endl;
       goto out;
     }
     else
@@ -1667,7 +1676,7 @@ static int rados_tool_common(const std::map < std::string, std::string > &opts,
 
     ret = io_ctx.rmxattr(oid, attr_name.c_str());
     if (ret < 0) {
-      cerr << "error removing xattr " << pool_name << "/" << oid << "/" << attr_name << ": " << strerror_r(-ret, buf, sizeof(buf)) << std::endl;
+      cerr << "error removing xattr " << pool_name << "/" << oid << "/" << attr_name << ": " << cpp_strerror(ret) << std::endl;
       goto out;
     }
   } else if (strcmp(nargs[0], "listxattr") == 0) {
@@ -1679,7 +1688,7 @@ static int rados_tool_common(const std::map < std::string, std::string > &opts,
     bufferlist bl;
     ret = io_ctx.getxattrs(oid, attrset);
     if (ret < 0) {
-      cerr << "error getting xattr set " << pool_name << "/" << oid << ": " << strerror_r(-ret, buf, sizeof(buf)) << std::endl;
+      cerr << "error getting xattr set " << pool_name << "/" << oid << ": " << cpp_strerror(ret) << std::endl;
       goto out;
     }
 
@@ -1875,7 +1884,7 @@ static int rados_tool_common(const std::map < std::string, std::string > &opts,
     ret = rados.ioctx_create(target, target_ctx);
     if (ret < 0) {
       cerr << "error opening target pool " << target << ": "
-           << strerror_r(-ret, buf, sizeof(buf)) << std::endl;
+           << cpp_strerror(ret) << std::endl;
       goto out;
     }
     if (target_oloc.size()) {
@@ -1884,7 +1893,7 @@ static int rados_tool_common(const std::map < std::string, std::string > &opts,
 
     ret = do_copy(io_ctx, nargs[1], target_ctx, target_obj);
     if (ret < 0) {
-      cerr << "error copying " << pool_name << "/" << nargs[1] << " => " << target << "/" << target_obj << ": " << strerror_r(-ret, buf, sizeof(buf)) << std::endl;
+      cerr << "error copying " << pool_name << "/" << nargs[1] << " => " << target << "/" << target_obj << ": " << cpp_strerror(ret) << std::endl;
       goto out;
     }
   }
@@ -1916,7 +1925,7 @@ static int rados_tool_common(const std::map < std::string, std::string > &opts,
     ret = rados.ioctx_create(target, target_ctx);
     if (ret < 0) {
       cerr << "error opening target pool " << target << ": "
-           << strerror_r(-ret, buf, sizeof(buf)) << std::endl;
+           << cpp_strerror(ret) << std::endl;
       goto out;
     }
     if (oloc.size()) {
@@ -1929,7 +1938,7 @@ static int rados_tool_common(const std::map < std::string, std::string > &opts,
 
     ret = do_clone_data(io_ctx, nargs[1], target_ctx, target_obj);
     if (ret < 0) {
-      cerr << "error cloning " << pool_name << "/" << nargs[1] << " => " << target << "/" << target_obj << ": " << strerror_r(-ret, buf, sizeof(buf)) << std::endl;
+      cerr << "error cloning " << pool_name << "/" << nargs[1] << " => " << target << "/" << target_obj << ": " << cpp_strerror(ret) << std::endl;
       goto out;
     }
   } else if (strcmp(nargs[0], "rm") == 0) {
@@ -1941,7 +1950,7 @@ static int rados_tool_common(const std::map < std::string, std::string > &opts,
       const string & oid = *iter;
       ret = io_ctx.remove(oid);
       if (ret < 0) {
-        cerr << "error removing " << pool_name << "/" << oid << ": " << strerror_r(-ret, buf, sizeof(buf)) << std::endl;
+        cerr << "error removing " << pool_name << "/" << oid << ": " << cpp_strerror(ret) << std::endl;
         goto out;
       }
     }
@@ -1957,7 +1966,7 @@ static int rados_tool_common(const std::map < std::string, std::string > &opts,
       ret = io_ctx.create(oid, true);
     }
     if (ret < 0) {
-      cerr << "error creating " << pool_name << "/" << oid << ": " << strerror_r(-ret, buf, sizeof(buf)) << std::endl;
+      cerr << "error creating " << pool_name << "/" << oid << ": " << cpp_strerror(ret) << std::endl;
       goto out;
     }
   }
@@ -1970,7 +1979,7 @@ static int rados_tool_common(const std::map < std::string, std::string > &opts,
       string oid(nargs[2]);
       ret = io_ctx.read(oid, outdata, 0, 0);
       if (ret < 0) {
-	cerr << "error reading " << pool_name << "/" << oid << ": " << strerror_r(-ret, buf, sizeof(buf)) << std::endl;
+	cerr << "error reading " << pool_name << "/" << oid << ": " << cpp_strerror(ret) << std::endl;
 	goto out;
       }
       bufferlist::iterator p = outdata.begin();
@@ -2071,7 +2080,7 @@ static int rados_tool_common(const std::map < std::string, std::string > &opts,
     ret = rados.pool_create(nargs[1], auid, crush_rule);
     if (ret < 0) {
       cerr << "error creating pool " << nargs[1] << ": "
-	   << strerror_r(-ret, buf, sizeof(buf)) << std::endl;
+	   << cpp_strerror(ret) << std::endl;
       goto out;
     }
     cout << "successfully created pool " << nargs[1] << std::endl;
@@ -2091,7 +2100,7 @@ static int rados_tool_common(const std::map < std::string, std::string > &opts,
     ret = do_copy_pool(rados, src_pool, target_pool);
     if (ret < 0) {
       cerr << "error copying pool " << src_pool << " => " << target_pool << ": "
-	   << strerror_r(-ret, buf, sizeof(buf)) << std::endl;
+	   << cpp_strerror(ret) << std::endl;
       goto out;
     }
     cout << "successfully copied pool " << nargs[1] << std::endl;
@@ -2157,7 +2166,7 @@ static int rados_tool_common(const std::map < std::string, std::string > &opts,
     ret = io_ctx.snap_create(nargs[1]);
     if (ret < 0) {
       cerr << "error creating pool " << pool_name << " snapshot " << nargs[1]
-	   << ": " << strerror_r(-ret, buf, sizeof(buf)) << std::endl;
+	   << ": " << cpp_strerror(ret) << std::endl;
       goto out;
     }
     cout << "created pool " << pool_name << " snap " << nargs[1] << std::endl;
@@ -2170,7 +2179,7 @@ static int rados_tool_common(const std::map < std::string, std::string > &opts,
     ret = io_ctx.snap_remove(nargs[1]);
     if (ret < 0) {
       cerr << "error removing pool " << pool_name << " snapshot " << nargs[1]
-	   << ": " << strerror_r(-ret, buf, sizeof(buf)) << std::endl;
+	   << ": " << cpp_strerror(ret) << std::endl;
       goto out;
     }
     cout << "removed pool " << pool_name << " snap " << nargs[1] << std::endl;
@@ -2183,7 +2192,7 @@ static int rados_tool_common(const std::map < std::string, std::string > &opts,
     ret = io_ctx.rollback(nargs[1], nargs[2]);
     if (ret < 0) {
       cerr << "error rolling back pool " << pool_name << " to snapshot " << nargs[1]
-	   << strerror_r(-ret, buf, sizeof(buf)) << std::endl;
+	   << cpp_strerror(ret) << std::endl;
       goto out;
     }
     cout << "rolled back pool " << pool_name
@@ -2205,16 +2214,15 @@ static int rados_tool_common(const std::map < std::string, std::string > &opts,
     RadosBencher bencher(g_ceph_context, rados, io_ctx);
     bencher.set_show_time(show_time);
     ret = bencher.aio_bench(operation, seconds, num_objs,
-			    concurrent_ios, op_size, cleanup);
+			    concurrent_ios, op_size, cleanup, run_name);
     if (ret != 0)
       cerr << "error during benchmark: " << ret << std::endl;
   }
   else if (strcmp(nargs[0], "cleanup") == 0) {
-    if (!pool_name || nargs.size() < 2)
+    if (!pool_name)
       usage_exit();
-    const char *prefix = nargs[1];
     RadosBencher bencher(g_ceph_context, rados, io_ctx);
-    ret = bencher.clean_up(prefix, concurrent_ios);
+    ret = bencher.clean_up(prefix, concurrent_ios, run_name);
     if (ret != 0)
       cerr << "error during cleanup: " << ret << std::endl;
   }
@@ -2260,7 +2268,7 @@ static int rados_tool_common(const std::map < std::string, std::string > &opts,
     ret = io_ctx.set_alloc_hint(oid, expected_object_size, expected_write_size);
     if (ret < 0) {
       cerr << "error setting alloc-hint " << pool_name << "/" << oid << ": "
-           << strerror_r(-ret, buf, sizeof(buf)) << std::endl;
+           << cpp_strerror(ret) << std::endl;
       goto out;
     }
   } else if (strcmp(nargs[0], "load-gen") == 0) {
@@ -2335,7 +2343,7 @@ static int rados_tool_common(const std::map < std::string, std::string > &opts,
 
     ret = io_ctx.list_watchers(oid, &lw);
     if (ret < 0) {
-      cerr << "error listing watchers " << pool_name << "/" << oid << ": " << strerror_r(-ret, buf, sizeof(buf)) << std::endl;
+      cerr << "error listing watchers " << pool_name << "/" << oid << ": " << cpp_strerror(ret) << std::endl;
       goto out;
     }
     else
@@ -2354,7 +2362,7 @@ static int rados_tool_common(const std::map < std::string, std::string > &opts,
     io_ctx.snap_set_read(LIBRADOS_SNAP_DIR);
     ret = io_ctx.list_snaps(oid, &ls);
     if (ret < 0) {
-      cerr << "error listing snap shots " << pool_name << "/" << oid << ": " << strerror_r(-ret, buf, sizeof(buf)) << std::endl;
+      cerr << "error listing snap shots " << pool_name << "/" << oid << ": " << cpp_strerror(ret) << std::endl;
       goto out;
     }
     else
@@ -2562,6 +2570,10 @@ int main(int argc, const char **argv)
       opts["show-time"] = "true";
     } else if (ceph_argparse_flag(args, i, "--no-cleanup", (char*)NULL)) {
       opts["no-cleanup"] = "true";
+    } else if (ceph_argparse_witharg(args, i, &val, "--run-name", (char*)NULL)) {
+      opts["run-name"] = val;
+    } else if (ceph_argparse_witharg(args, i, &val, "--prefix", (char*)NULL)) {
+      opts["prefix"] = val;
     } else if (ceph_argparse_witharg(args, i, &val, "-p", "--pool", (char*)NULL)) {
       opts["pool"] = val;
     } else if (ceph_argparse_witharg(args, i, &val, "--target-pool", (char*)NULL)) {
diff --git a/src/tools/rest_bench.cc b/src/tools/rest_bench.cc
index feea4de..d617fe1 100644
--- a/src/tools/rest_bench.cc
+++ b/src/tools/rest_bench.cc
@@ -36,7 +36,7 @@ void usage(ostream& out)
 {
   out <<					\
 "usage: rest-bench [options] <write|seq>\n"
-"       rest-bench [options] cleanup <prefix>\n"
+"       rest-bench [options] cleanup [--run-name run_name] [--prefix prefix]\n"
 "BENCHMARK OPTIONS\n"
 "   --seconds\n"
 "        benchmak length (default: 60)\n"
@@ -678,6 +678,8 @@ int main(int argc, const char **argv)
 
   bool show_time = false;
   bool cleanup = true;
+  std::string run_name;
+  std::string prefix;
 
 
   for (i = args.begin(); i != args.end(); ) {
@@ -722,6 +724,10 @@ int main(int argc, const char **argv)
       }
     } else if (ceph_argparse_witharg(args, i, &val, "-t", "--concurrent-ios", (char*)NULL)) {
       concurrent_ios = strtol(val.c_str(), NULL, 10);
+    } else if (ceph_argparse_witharg(args, i, &val, "--run-name", (char*)NULL)) {
+      run_name = val;
+    } else if (ceph_argparse_witharg(args, i, &val, "--prefix", (char*)NULL)) {
+      prefix = val;
     } else if (ceph_argparse_witharg(args, i, &val, "--seconds", (char*)NULL)) {
       seconds = strtol(val.c_str(), NULL, 10);
     } else if (ceph_argparse_witharg(args, i, &val, "-b", "--block-size", (char*)NULL)) {
@@ -740,7 +746,6 @@ int main(int argc, const char **argv)
   if (args.empty())
     usage_exit();
   int operation = 0;
-  const char *prefix = NULL;
   if (strcmp(args[0], "write") == 0)
     operation = OP_WRITE;
   else if (strcmp(args[0], "seq") == 0)
@@ -748,10 +753,7 @@ int main(int argc, const char **argv)
   else if (strcmp(args[0], "rand") == 0)
     operation = OP_RAND_READ;
   else if (strcmp(args[0], "cleanup") == 0) {
-    if (args.size() < 2)
-      usage_exit();
     operation = OP_CLEANUP;
-    prefix = args[1];
   } else
     usage_exit();
 
@@ -784,12 +786,12 @@ int main(int argc, const char **argv)
   }
 
   if (operation == OP_CLEANUP) {
-    ret = bencher.clean_up(prefix, concurrent_ios);
+    ret = bencher.clean_up(prefix.c_str(), concurrent_ios, run_name.c_str());
     if (ret != 0)
       cerr << "error during cleanup: " << ret << std::endl;
   } else {
     ret = bencher.aio_bench(operation, seconds, 0,
-			    concurrent_ios, op_size, cleanup);
+			    concurrent_ios, op_size, cleanup, run_name.c_str());
     if (ret != 0) {
         cerr << "error during benchmark: " << ret << std::endl;
     }
diff --git a/src/tools/scratchtoolpp.cc b/src/tools/scratchtoolpp.cc
index 87074a6..0444cca 100644
--- a/src/tools/scratchtoolpp.cc
+++ b/src/tools/scratchtoolpp.cc
@@ -116,7 +116,7 @@ int main(int argc, const char **argv)
   cout << "io_ctx.stat returned " << r << " size = " << stat_size << " mtime = " << stat_mtime << std::endl;
 
   r = io_ctx.stat(oid, NULL, NULL);
-  cout << "io_ctx.stat(does_not_exist) = " << r;
+  cout << "io_ctx.stat(does_not_exist) = " << r << std::endl;
 
   uint64_t handle;
   C_Watch wc;
@@ -136,12 +136,10 @@ int main(int argc, const char **argv)
 
   r = io_ctx.unwatch(oid, handle);
   cout << "io_ctx.unwatch returned " << r << std::endl;
-  cout << "*** press enter to continue ***" << std::endl;
   testradospp_milestone();
 
   r = io_ctx.notify(oid, objver, notify_bl);
   cout << "io_ctx.notify returned " << r << std::endl;
-  cout << "*** press enter to continue ***" << std::endl;
   testradospp_milestone();
   io_ctx.set_assert_version(objver);
 
@@ -186,20 +184,11 @@ int main(int argc, const char **argv)
   assert(r == -EOVERFLOW);
 
   // test assert_src_version
-  const char *dest = "baz";
   r = io_ctx.read(oid, bl, 0, 1);
   assert(r >= 0);
   v = io_ctx.get_last_version();
   cout << oid << " version is " << v << std::endl;
   io_ctx.set_assert_src_version(oid, v);
-  r = io_ctx.clone_range(dest, 0, oid, 0, 1);
-  assert(r >= 0);
-  io_ctx.set_assert_src_version(oid, v-1);
-  r = io_ctx.clone_range(dest, 0, oid, 0, 1);
-  assert(r == -ERANGE);
-  io_ctx.set_assert_src_version(oid, v+1);
-  r = io_ctx.clone_range(dest, 0, oid, 0, 1);
-  assert(r == -EOVERFLOW);
   
   r = io_ctx.exec(oid, "crypto", "sha1", bl, bl2);
   cout << "exec returned " << r << std::endl;
@@ -267,7 +256,7 @@ int main(int argc, const char **argv)
     ObjectReadOperation o;
     o.cmpxattr("foo", CEPH_OSD_CMPXATTR_OP_EQ, val);
     r = io_ctx.operate(oid, &o, &bl2);
-    cout << " got " << r << " wanted ECANCELED" << std::endl;
+    cout << " got " << r << " wanted " << -ECANCELED << " (-ECANCELED)" << std::endl;
     assert(r == -ECANCELED);
   }
 
@@ -279,7 +268,7 @@ int main(int argc, const char **argv)
     io_ctx.locator_set_key(oid);
     o.write_full(val);
     r = io_ctx.operate(oidb, &o);
-    cout << " got " << r << " wanted ECANCELED" << std::endl;
+    cout << " got " << r << " wanted " << -ECANCELED << " (-ECANCELED)" << std::endl;
     assert(r == -ECANCELED);
   }
   {
diff --git a/src/vstart.sh b/src/vstart.sh
index 7132ad3..5349c49 100755
--- a/src/vstart.sh
+++ b/src/vstart.sh
@@ -460,7 +460,7 @@ EOF
 	    uuid=`uuidgen`
 	    echo "add osd$osd $uuid"
 	    $SUDO $CEPH_ADM osd create $uuid
-	    $SUDO $CEPH_ADM osd crush add osd.$osd 1.0 host=localhost rack=localrack root=default
+	    $SUDO $CEPH_ADM osd crush add osd.$osd 1.0 host=$HOSTNAME root=default
 	    $SUDO $CEPH_BIN/ceph-osd -i $osd $ARGS --mkfs --mkkey --osd-uuid $uuid
 
 	    key_fn=$CEPH_DEV_DIR/osd$osd/keyring

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-ceph/ceph.git



More information about the Pkg-ceph-commits mailing list