[Pkg-ceph-commits] [ceph] 01/02: Imported Upstream version 0.80.9

Gaudenz Steinlin gaudenz at moszumanska.debian.org
Fri May 1 22:18:44 UTC 2015


This is an automated email from the git hooks/post-receive script.

gaudenz pushed a commit to branch master
in repository ceph.

commit 4b4ee2c92a3d179818d3af4d41b3d266da95acd8
Author: Gaudenz Steinlin <gaudenz at debian.org>
Date:   Sat May 2 00:18:08 2015 +0200

    Imported Upstream version 0.80.9
---
 ceph.spec                                          |   65 +-
 ceph.spec.in                                       |   63 +-
 configure                                          |   22 +-
 configure.ac                                       |    2 +-
 man/ceph.8                                         | 2307 +++++++++++++++++++-
 src/.git_version                                   |    4 +-
 src/Makefile-env.am                                |    1 +
 src/Makefile.am                                    |    1 +
 src/Makefile.in                                    |  358 ++-
 src/ceph-disk                                      |  114 +-
 src/ceph.in                                        |    6 +-
 src/ceph_mon.cc                                    |   20 +
 src/civetweb/civetweb.h                            |   24 +-
 src/civetweb/include/civetweb.h                    |   24 +-
 src/civetweb/include/civetweb_conf.h               |    6 +
 src/civetweb/src/civetweb.c                        |  771 +++++--
 src/client/Client.cc                               |  374 +++-
 src/client/Client.h                                |   23 +-
 src/client/Fh.h                                    |    8 +-
 src/client/Inode.h                                 |   11 +-
 src/client/MetaRequest.h                           |   14 +-
 src/client/fuse_ll.cc                              |   77 +-
 src/common/Makefile.am                             |    3 +-
 src/common/Thread.cc                               |    3 +-
 src/common/Thread.h                                |    1 +
 src/common/TrackedOp.cc                            |    2 +-
 src/common/WorkQueue.cc                            |    4 +
 src/common/ceph_crypto.h                           |    8 +-
 src/common/config.cc                               |    4 +-
 src/common/config_opts.h                           |    7 +-
 src/common/crc32c_intel_fast_asm.S                 |    2 +
 src/common/crc32c_intel_fast_zero_asm.S            |    2 +
 src/common/hobject.cc                              |    4 +-
 src/common/io_priority.cc                          |    4 +-
 src/common/util.cc                                 |   22 +
 src/crush/CrushCompiler.cc                         |    4 +
 src/crush/CrushTester.cc                           |   16 +-
 src/crush/CrushTester.h                            |    9 +
 src/crush/CrushWrapper.cc                          |  108 +-
 src/crush/CrushWrapper.h                           |   75 +-
 src/crush/builder.c                                |  243 ++-
 src/crush/builder.h                                |   11 +-
 src/crush/crush.h                                  |    8 +
 src/crush/mapper.c                                 |   16 +-
 src/include/ceph_features.h                        |    2 +
 src/include/ceph_fs.h                              |    6 +-
 src/include/util.h                                 |   24 +
 src/init-radosgw.sysv                              |    6 +-
 src/librbd/AioCompletion.h                         |   12 +
 src/librbd/ImageCtx.cc                             |   14 +-
 src/librbd/ImageCtx.h                              |   10 +-
 src/librbd/internal.cc                             |   81 +-
 src/mds/CInode.h                                   |    1 +
 src/mds/Dumper.cc                                  |   23 +-
 src/mds/Locker.cc                                  |   20 +-
 src/mds/MDCache.cc                                 |    8 +-
 src/mds/Makefile.am                                |    1 -
 src/mds/Server.cc                                  |   21 +-
 src/mds/flock.cc                                   |   69 +-
 src/mds/flock.h                                    |    2 +
 src/mds/mdstypes.h                                 |    4 +-
 src/messages/MClientReconnect.h                    |    6 +-
 src/mon/DataHealthService.cc                       |   39 +-
 src/mon/MonCommands.h                              |   12 +
 src/mon/Monitor.cc                                 |   27 +-
 src/mon/MonitorDBStore.h                           |   27 +-
 src/mon/OSDMonitor.cc                              |  169 +-
 src/mon/PGMap.cc                                   |   37 +-
 src/mon/PGMonitor.cc                               |    1 +
 src/mon/Paxos.cc                                   |  169 +-
 src/mon/Paxos.h                                    |   44 +
 src/mon/mon_types.h                                |   45 +-
 src/os/FileJournal.cc                              |   89 +-
 src/osd/ECBackend.cc                               |   50 +-
 src/osd/OSD.cc                                     |  108 +-
 src/osd/OSD.h                                      |    6 +-
 src/osd/OSDMap.h                                   |   12 +
 src/osd/PG.cc                                      |  106 +-
 src/osd/PG.h                                       |   19 +-
 src/osd/ReplicatedPG.cc                            |  112 +-
 src/osd/ReplicatedPG.h                             |    6 +-
 src/osd/Watch.h                                    |    7 +-
 src/osd/osd_types.cc                               |   14 +-
 src/osdc/ObjectCacher.cc                           |   47 +-
 src/osdc/ObjectCacher.h                            |    3 +
 src/osdc/Objecter.cc                               |    9 +-
 src/pybind/rados.py                                |    8 +-
 src/rgw/Makefile.am                                |   20 +-
 src/rgw/rgw_admin.cc                               |   14 +-
 src/rgw/rgw_civetweb.cc                            |    6 +-
 src/rgw/rgw_civetweb_log.cc                        |   14 +
 src/rgw/rgw_civetweb_log.h                         |    6 +
 src/rgw/rgw_common.cc                              |    5 +-
 src/rgw/rgw_common.h                               |    2 +-
 src/rgw/rgw_fcgi.cc                                |    4 +-
 src/rgw/rgw_http_client.cc                         |   52 +-
 src/rgw/rgw_json_enc.cc                            |    2 +-
 src/rgw/rgw_main.cc                                |   35 +-
 src/rgw/rgw_op.cc                                  |   88 +-
 src/rgw/rgw_op.h                                   |    4 +-
 src/rgw/rgw_rados.cc                               |  161 +-
 src/rgw/rgw_rados.h                                |   19 +-
 src/rgw/rgw_rest.cc                                |   38 +-
 src/rgw/rgw_rest.h                                 |    1 -
 src/rgw/rgw_rest_s3.cc                             |   47 +-
 src/rgw/rgw_rest_swift.cc                          |   30 +-
 src/rgw/rgw_swift.cc                               |   32 +-
 src/rgw/rgw_swift.h                                |    1 +
 src/rgw/rgw_swift_auth.cc                          |    9 +-
 src/rgw/rgw_swift_auth.h                           |    2 +-
 src/rgw/rgw_user.cc                                |   11 +-
 src/test/Makefile.am                               |   10 +-
 src/test/cli/crushtool/add-item-in-tree.t          |   10 +
 src/test/cli/crushtool/adjust-item-weight.t        |   17 +
 src/test/cli/crushtool/build.t                     |    2 +-
 src/test/cli/crushtool/help.t                      |    1 +
 src/test/cli/crushtool/set-choose.t                |    9 +-
 src/test/cli/crushtool/simple.template.adj.one     |   56 +
 src/test/cli/crushtool/simple.template.adj.three   |   64 +
 src/test/cli/crushtool/simple.template.adj.two     |   64 +
 src/test/cli/crushtool/test-map-bobtail-tunables.t |    2 +-
 src/test/cli/crushtool/test-map-firefly-tunables.t |    2 +-
 src/test/cli/crushtool/test-map-firstn-indep.t     |   14 +
 src/test/cli/crushtool/test-map-firstn-indep.txt   |  443 ++++
 src/test/cli/crushtool/test-map-indep.t            |    2 +-
 src/test/cli/crushtool/test-map-legacy-tunables.t  |    2 +-
 src/test/cli/crushtool/test-map-tries-vs-retries.t |    2 +-
 src/test/cli/crushtool/test-map-vary-r-0.t         |    2 +-
 src/test/cli/crushtool/test-map-vary-r-1.t         |    2 +-
 src/test/cli/crushtool/test-map-vary-r-2.t         |    2 +-
 src/test/cli/crushtool/test-map-vary-r-3.t         |    2 +-
 src/test/cli/crushtool/test-map-vary-r-4.t         |    2 +-
 src/test/cli/crushtool/tree.template               |  Bin 0 -> 376 bytes
 src/test/cli/crushtool/tree.template.final         |   70 +
 src/test/cli/osdmaptool/create-print.t             |    1 +
 src/test/cli/osdmaptool/create-racks.t             |    1 +
 src/test/cli/osdmaptool/crush.t                    |    2 +-
 src/test/cli/osdmaptool/help.t                     |    1 +
 src/test/cli/osdmaptool/missing-argument.t         |    1 +
 src/test/cli/osdmaptool/test-map-pgs.t             |    4 +-
 src/test/common/histogram.cc                       |  113 +-
 src/test/common/test_io_priority.cc                |   51 +
 src/test/crush/TestCrushWrapper.cc                 |  270 +++
 src/test/crush/indep.cc                            |   33 +-
 src/test/erasure-code/Makefile.am                  |    8 +
 src/test/erasure-code/TestErasureCodeJerasure.cc   |   30 -
 .../ceph_erasure_code_non_regression.cc            |  325 +++
 src/test/libcephfs/test.cc                         |    1 +
 src/test/librados/misc.cc                          |    6 +-
 src/test/librados/snapshots.cc                     |   18 +
 src/test/librados/tier.cc                          |   76 +
 src/test/librbd/test_librbd.cc                     |  106 +
 src/test/mon/mon-test-helpers.sh                   |    3 +-
 src/tools/crushtool.cc                             |   16 +-
 src/tools/osdmaptool.cc                            |   11 +-
 155 files changed, 7297 insertions(+), 1490 deletions(-)

diff --git a/ceph.spec b/ceph.spec
index 1e9a2a6..4984c05 100644
--- a/ceph.spec
+++ b/ceph.spec
@@ -9,13 +9,16 @@
 # common
 #################################################################################
 Name:		ceph
-Version:	0.80.7
+Version:	0.80.9
 Release:	0%{?dist}
 Summary:	User space components of the Ceph file system
 License:	GPL-2.0
 Group:		System Environment/Base
 URL:		http://ceph.com/
 Source0:	http://ceph.com/download/%{name}-%{version}.tar.bz2
+%if 0%{?fedora} || 0%{?centos} || 0%{?rhel}
+Patch0:		init-ceph.in-fedora.patch
+%endif
 Requires:	librbd1 = %{version}-%{release}
 Requires:	librados2 = %{version}-%{release}
 Requires:	libcephfs1 = %{version}-%{release}
@@ -24,13 +27,13 @@ Requires:	python
 Requires:	python-argparse
 Requires:	python-ceph
 Requires:	python-requests
+Requires:	python-flask
 Requires:	xfsprogs
 Requires:	cryptsetup
 Requires:	parted
 Requires:	util-linux
 Requires:	hdparm
 Requires(post):	binutils
-BuildRoot:      %{_tmppath}/%{name}-%{version}-build
 BuildRequires:	make
 BuildRequires:	gcc-c++
 BuildRequires:	libtool
@@ -50,14 +53,14 @@ BuildRequires:	libblkid-devel >= 2.17
 BuildRequires:	leveldb-devel > 1.2
 BuildRequires:	xfsprogs-devel
 BuildRequires:	yasm
-%if 0%{?rhel_version} || 0%{?centos_version} || 0%{?fedora}
+%if 0%{?rhel} || 0%{?centos} || 0%{?fedora}
 BuildRequires:	snappy-devel
 %endif
 
 #################################################################################
 # specific
 #################################################################################
-%if ! 0%{?rhel}
+%if ! 0%{?rhel} || 0%{?fedora}
 BuildRequires:	sharutils
 %endif
 
@@ -173,8 +176,8 @@ managers such as Pacemaker.
 Summary:	RADOS distributed object store client library
 Group:		System Environment/Libraries
 License:	LGPL-2.0
-%if 0%{?rhel_version} || 0%{?centos_version} || 0%{?fedora}
-Obsoletes:	ceph-libs
+%if 0%{?rhel} || 0%{?centos} || 0%{?fedora}
+Obsoletes:	ceph-libs < %{version}-%{release}
 %endif
 %description -n librados2
 RADOS is a reliable, autonomic distributed object storage cluster
@@ -187,8 +190,8 @@ Summary:	RADOS block device client library
 Group:		System Environment/Libraries
 License:	LGPL-2.0
 Requires:	librados2 = %{version}-%{release}
-%if 0%{?rhel_version} || 0%{?centos_version} || 0%{?fedora}
-Obsoletes:	ceph-libs
+%if 0%{?rhel} || 0%{?centos} || 0%{?fedora}
+Obsoletes:	ceph-libs < %{version}-%{release}
 %endif
 %description -n librbd1
 RBD is a block device striped across multiple distributed objects in
@@ -200,8 +203,9 @@ shared library allowing applications to manage these block devices.
 Summary:	Ceph distributed file system client library
 Group:		System Environment/Libraries
 License:	LGPL-2.0
-%if 0%{?rhel_version} || 0%{?centos_version} || 0%{?fedora}
-Obsoletes:	ceph-libs
+%if 0%{?rhel} || 0%{?centos} || 0%{?fedora}
+Obsoletes:	ceph-libs < %{version}-%{release}
+Obsoletes:	ceph-libcephfs
 %endif
 %description -n libcephfs1
 Ceph is a distributed network file system designed to provide excellent
@@ -215,7 +219,6 @@ Group:		System Environment/Libraries
 License:	LGPL-2.0
 Requires:	librados2 = %{version}-%{release}
 Requires:	librbd1 = %{version}-%{release}
-Requires:	python-flask
 %if 0%{defined suse_version}
 %py_requires
 %endif
@@ -264,6 +267,23 @@ BuildRequires:	junit4
 %description -n cephfs-java
 This package contains the Java libraries for the Ceph File System.
 
+%package libs-compat
+Summary:	Meta package to include ceph libraries.
+Group:		System Environment/Libraries
+License:	LGPL-2.0
+Obsoletes:	ceph-libs
+Requires:	librados2 = %{version}-%{release}
+Requires:	librbd1 = %{version}-%{release}
+Requires:	libcephfs1 = %{version}-%{release}
+Provides:	ceph-libs
+
+%description libs-compat
+This is a meta package that pulls in librados2, librbd1 and libcephfs1. It
+is included for backwards compatibility with distributions that depend on the
+former ceph-libs package, which is now split up into these three subpackages.
+Packages still depending on ceph-libs should be fixed to depend on librados2,
+librbd1 or libcephfs1 instead.
+
 %if 0%{?opensuse} || 0%{?suse_version}
 %debug_package
 %endif
@@ -273,6 +293,9 @@ This package contains the Java libraries for the Ceph File System.
 #################################################################################
 %prep
 %setup -q
+%if 0%{?fedora} || 0%{?rhel} || 0%{?centos}
+%patch0 -p1 -b .init
+%endif
 
 %build
 # Find jni.h
@@ -329,7 +352,7 @@ chmod 0644 $RPM_BUILD_ROOT%{_docdir}/ceph/sample.ceph.conf
 chmod 0644 $RPM_BUILD_ROOT%{_docdir}/ceph/sample.fetch_config
 
 # udev rules
-%if 0%{?rhel} >= 7
+%if 0%{?rhel} >= 7 || 0%{?fedora}
 install -m 0644 -D udev/50-rbd.rules $RPM_BUILD_ROOT/usr/lib/udev/rules.d/50-rbd.rules
 install -m 0644 -D udev/60-ceph-partuuid-workaround.rules $RPM_BUILD_ROOT/usr/lib/udev/rules.d/60-ceph-partuuid-workaround.rules
 %else
@@ -337,13 +360,13 @@ install -m 0644 -D udev/50-rbd.rules $RPM_BUILD_ROOT/lib/udev/rules.d/50-rbd.rul
 install -m 0644 -D udev/60-ceph-partuuid-workaround.rules $RPM_BUILD_ROOT/lib/udev/rules.d/60-ceph-partuuid-workaround.rules
 %endif
 
-%if (0%{?rhel} || 0%{?rhel} < 7)
+%if (0%{?rhel} && 0%{?rhel} < 7)
 install -m 0644 -D udev/95-ceph-osd-alt.rules $RPM_BUILD_ROOT/lib/udev/rules.d/95-ceph-osd.rules
 %else
 install -m 0644 -D udev/95-ceph-osd.rules $RPM_BUILD_ROOT/lib/udev/rules.d/95-ceph-osd.rules
 %endif
 
-%if 0%{?rhel} >= 7
+%if 0%{?rhel} >= 7 || 0%{?fedora}
 mv $RPM_BUILD_ROOT/lib/udev/rules.d/95-ceph-osd.rules $RPM_BUILD_ROOT/usr/lib/udev/rules.d/95-ceph-osd.rules
 mv $RPM_BUILD_ROOT/sbin/mkcephfs $RPM_BUILD_ROOT/usr/sbin/mkcephfs
 mv $RPM_BUILD_ROOT/sbin/mount.ceph $RPM_BUILD_ROOT/usr/sbin/mount.ceph
@@ -404,6 +427,7 @@ fi
 %{_bindir}/cephfs
 %{_bindir}/ceph-clsinfo
 %{_bindir}/ceph-rest-api
+%{python_sitelib}/ceph_rest_api.py*
 %{_bindir}/crushtool
 %{_bindir}/monmaptool
 %{_bindir}/osdmaptool
@@ -424,7 +448,7 @@ fi
 %{_sbindir}/ceph-disk-udev
 %{_sbindir}/ceph-create-keys
 %{_sbindir}/rcceph
-%if 0%{?rhel} >= 7
+%if 0%{?rhel} >= 7 || 0%{?fedora}
 %{_sbindir}/mount.ceph
 %else
 /sbin/mount.ceph
@@ -451,7 +475,7 @@ fi
 %{_libdir}/ceph/erasure-code/libec_jerasure*.so*
 %{_libdir}/ceph/erasure-code/libec_test_jerasure*.so*
 %{_libdir}/ceph/erasure-code/libec_missing_entry_point.so*
-%if 0%{?rhel} >= 7
+%if 0%{?rhel} >= 7 || 0%{?fedora}
 /usr/lib/udev/rules.d/60-ceph-partuuid-workaround.rules
 /usr/lib/udev/rules.d/95-ceph-osd.rules
 %else
@@ -529,7 +553,7 @@ fi
 %defattr(-,root,root,-)
 %{_bindir}/ceph-fuse
 %{_mandir}/man8/ceph-fuse.8*
-%if 0%{?rhel} >= 7
+%if 0%{?rhel} >= 7 || 0%{?fedora}
 %{_sbindir}/mount.fuse.ceph
 %else
 /sbin/mount.fuse.ceph
@@ -624,7 +648,7 @@ fi
 %files -n librbd1
 %defattr(-,root,root,-)
 %{_libdir}/librbd.so.*
-%if 0%{?rhel} >= 7
+%if 0%{?rhel} >= 7 || 0%{?fedora}
 /usr/lib/udev/rules.d/50-rbd.rules
 %else
 /lib/udev/rules.d/50-rbd.rules
@@ -656,7 +680,6 @@ ln -sf %{_libdir}/librbd.so.1 /usr/lib64/qemu/librbd.so.1
 %{python_sitelib}/rbd.py*
 %{python_sitelib}/cephfs.py*
 %{python_sitelib}/ceph_argparse.py*
-%{python_sitelib}/ceph_rest_api.py*
 
 #################################################################################
 %files -n rest-bench
@@ -702,4 +725,8 @@ ln -sf %{_libdir}/librbd.so.1 /usr/lib64/qemu/librbd.so.1
 %{_javadir}/libcephfs.jar
 %{_javadir}/libcephfs-test.jar
 
+%files libs-compat
+# We need an empty %files list for ceph-libs-compat, to tell rpmbuild to actually
+# build this meta package.
+
 %changelog
diff --git a/ceph.spec.in b/ceph.spec.in
index 5454454..02b300b 100644
--- a/ceph.spec.in
+++ b/ceph.spec.in
@@ -16,6 +16,9 @@ License:	GPL-2.0
 Group:		System Environment/Base
 URL:		http://ceph.com/
 Source0:	http://ceph.com/download/%{name}-%{version}.tar.bz2
+%if 0%{?fedora} || 0%{?centos} || 0%{?rhel}
+Patch0:		init-ceph.in-fedora.patch
+%endif
 Requires:	librbd1 = %{version}-%{release}
 Requires:	librados2 = %{version}-%{release}
 Requires:	libcephfs1 = %{version}-%{release}
@@ -24,13 +27,13 @@ Requires:	python
 Requires:	python-argparse
 Requires:	python-ceph
 Requires:	python-requests
+Requires:	python-flask
 Requires:	xfsprogs
 Requires:	cryptsetup
 Requires:	parted
 Requires:	util-linux
 Requires:	hdparm
 Requires(post):	binutils
-BuildRoot:      %{_tmppath}/%{name}-%{version}-build
 BuildRequires:	make
 BuildRequires:	gcc-c++
 BuildRequires:	libtool
@@ -50,14 +53,14 @@ BuildRequires:	libblkid-devel >= 2.17
 BuildRequires:	leveldb-devel > 1.2
 BuildRequires:	xfsprogs-devel
 BuildRequires:	yasm
-%if 0%{?rhel_version} || 0%{?centos_version} || 0%{?fedora}
+%if 0%{?rhel} || 0%{?centos} || 0%{?fedora}
 BuildRequires:	snappy-devel
 %endif
 
 #################################################################################
 # specific
 #################################################################################
-%if ! 0%{?rhel}
+%if ! 0%{?rhel} || 0%{?fedora}
 BuildRequires:	sharutils
 %endif
 
@@ -173,8 +176,8 @@ managers such as Pacemaker.
 Summary:	RADOS distributed object store client library
 Group:		System Environment/Libraries
 License:	LGPL-2.0
-%if 0%{?rhel_version} || 0%{?centos_version} || 0%{?fedora}
-Obsoletes:	ceph-libs
+%if 0%{?rhel} || 0%{?centos} || 0%{?fedora}
+Obsoletes:	ceph-libs < %{version}-%{release}
 %endif
 %description -n librados2
 RADOS is a reliable, autonomic distributed object storage cluster
@@ -187,8 +190,8 @@ Summary:	RADOS block device client library
 Group:		System Environment/Libraries
 License:	LGPL-2.0
 Requires:	librados2 = %{version}-%{release}
-%if 0%{?rhel_version} || 0%{?centos_version} || 0%{?fedora}
-Obsoletes:	ceph-libs
+%if 0%{?rhel} || 0%{?centos} || 0%{?fedora}
+Obsoletes:	ceph-libs < %{version}-%{release}
 %endif
 %description -n librbd1
 RBD is a block device striped across multiple distributed objects in
@@ -200,8 +203,9 @@ shared library allowing applications to manage these block devices.
 Summary:	Ceph distributed file system client library
 Group:		System Environment/Libraries
 License:	LGPL-2.0
-%if 0%{?rhel_version} || 0%{?centos_version} || 0%{?fedora}
-Obsoletes:	ceph-libs
+%if 0%{?rhel} || 0%{?centos} || 0%{?fedora}
+Obsoletes:	ceph-libs < %{version}-%{release}
+Obsoletes:	ceph-libcephfs
 %endif
 %description -n libcephfs1
 Ceph is a distributed network file system designed to provide excellent
@@ -215,7 +219,6 @@ Group:		System Environment/Libraries
 License:	LGPL-2.0
 Requires:	librados2 = %{version}-%{release}
 Requires:	librbd1 = %{version}-%{release}
-Requires:	python-flask
 %if 0%{defined suse_version}
 %py_requires
 %endif
@@ -264,6 +267,23 @@ BuildRequires:	junit4
 %description -n cephfs-java
 This package contains the Java libraries for the Ceph File System.
 
+%package libs-compat
+Summary:	Meta package to include ceph libraries.
+Group:		System Environment/Libraries
+License:	LGPL-2.0
+Obsoletes:	ceph-libs
+Requires:	librados2 = %{version}-%{release}
+Requires:	librbd1 = %{version}-%{release}
+Requires:	libcephfs1 = %{version}-%{release}
+Provides:	ceph-libs
+
+%description libs-compat
+This is a meta package that pulls in librados2, librbd1 and libcephfs1. It
+is included for backwards compatibility with distributions that depend on the
+former ceph-libs package, which is now split up into these three subpackages.
+Packages still depending on ceph-libs should be fixed to depend on librados2,
+librbd1 or libcephfs1 instead.
+
 %if 0%{?opensuse} || 0%{?suse_version}
 %debug_package
 %endif
@@ -273,6 +293,9 @@ This package contains the Java libraries for the Ceph File System.
 #################################################################################
 %prep
 %setup -q
+%if 0%{?fedora} || 0%{?rhel} || 0%{?centos}
+%patch0 -p1 -b .init
+%endif
 
 %build
 # Find jni.h
@@ -329,7 +352,7 @@ chmod 0644 $RPM_BUILD_ROOT%{_docdir}/ceph/sample.ceph.conf
 chmod 0644 $RPM_BUILD_ROOT%{_docdir}/ceph/sample.fetch_config
 
 # udev rules
-%if 0%{?rhel} >= 7
+%if 0%{?rhel} >= 7 || 0%{?fedora}
 install -m 0644 -D udev/50-rbd.rules $RPM_BUILD_ROOT/usr/lib/udev/rules.d/50-rbd.rules
 install -m 0644 -D udev/60-ceph-partuuid-workaround.rules $RPM_BUILD_ROOT/usr/lib/udev/rules.d/60-ceph-partuuid-workaround.rules
 %else
@@ -337,13 +360,13 @@ install -m 0644 -D udev/50-rbd.rules $RPM_BUILD_ROOT/lib/udev/rules.d/50-rbd.rul
 install -m 0644 -D udev/60-ceph-partuuid-workaround.rules $RPM_BUILD_ROOT/lib/udev/rules.d/60-ceph-partuuid-workaround.rules
 %endif
 
-%if (0%{?rhel} || 0%{?rhel} < 7)
+%if (0%{?rhel} && 0%{?rhel} < 7)
 install -m 0644 -D udev/95-ceph-osd-alt.rules $RPM_BUILD_ROOT/lib/udev/rules.d/95-ceph-osd.rules
 %else
 install -m 0644 -D udev/95-ceph-osd.rules $RPM_BUILD_ROOT/lib/udev/rules.d/95-ceph-osd.rules
 %endif
 
-%if 0%{?rhel} >= 7
+%if 0%{?rhel} >= 7 || 0%{?fedora}
 mv $RPM_BUILD_ROOT/lib/udev/rules.d/95-ceph-osd.rules $RPM_BUILD_ROOT/usr/lib/udev/rules.d/95-ceph-osd.rules
 mv $RPM_BUILD_ROOT/sbin/mkcephfs $RPM_BUILD_ROOT/usr/sbin/mkcephfs
 mv $RPM_BUILD_ROOT/sbin/mount.ceph $RPM_BUILD_ROOT/usr/sbin/mount.ceph
@@ -404,6 +427,7 @@ fi
 %{_bindir}/cephfs
 %{_bindir}/ceph-clsinfo
 %{_bindir}/ceph-rest-api
+%{python_sitelib}/ceph_rest_api.py*
 %{_bindir}/crushtool
 %{_bindir}/monmaptool
 %{_bindir}/osdmaptool
@@ -424,7 +448,7 @@ fi
 %{_sbindir}/ceph-disk-udev
 %{_sbindir}/ceph-create-keys
 %{_sbindir}/rcceph
-%if 0%{?rhel} >= 7
+%if 0%{?rhel} >= 7 || 0%{?fedora}
 %{_sbindir}/mount.ceph
 %else
 /sbin/mount.ceph
@@ -451,7 +475,7 @@ fi
 %{_libdir}/ceph/erasure-code/libec_jerasure*.so*
 %{_libdir}/ceph/erasure-code/libec_test_jerasure*.so*
 %{_libdir}/ceph/erasure-code/libec_missing_entry_point.so*
-%if 0%{?rhel} >= 7
+%if 0%{?rhel} >= 7 || 0%{?fedora}
 /usr/lib/udev/rules.d/60-ceph-partuuid-workaround.rules
 /usr/lib/udev/rules.d/95-ceph-osd.rules
 %else
@@ -529,7 +553,7 @@ fi
 %defattr(-,root,root,-)
 %{_bindir}/ceph-fuse
 %{_mandir}/man8/ceph-fuse.8*
-%if 0%{?rhel} >= 7
+%if 0%{?rhel} >= 7 || 0%{?fedora}
 %{_sbindir}/mount.fuse.ceph
 %else
 /sbin/mount.fuse.ceph
@@ -624,7 +648,7 @@ fi
 %files -n librbd1
 %defattr(-,root,root,-)
 %{_libdir}/librbd.so.*
-%if 0%{?rhel} >= 7
+%if 0%{?rhel} >= 7 || 0%{?fedora}
 /usr/lib/udev/rules.d/50-rbd.rules
 %else
 /lib/udev/rules.d/50-rbd.rules
@@ -656,7 +680,6 @@ ln -sf %{_libdir}/librbd.so.1 /usr/lib64/qemu/librbd.so.1
 %{python_sitelib}/rbd.py*
 %{python_sitelib}/cephfs.py*
 %{python_sitelib}/ceph_argparse.py*
-%{python_sitelib}/ceph_rest_api.py*
 
 #################################################################################
 %files -n rest-bench
@@ -702,4 +725,8 @@ ln -sf %{_libdir}/librbd.so.1 /usr/lib64/qemu/librbd.so.1
 %{_javadir}/libcephfs.jar
 %{_javadir}/libcephfs-test.jar
 
+%files libs-compat
+# We need an empty %files list for ceph-libs-compat, to tell rpmbuild to actually
+# build this meta package.
+
 %changelog
diff --git a/configure b/configure
index d8d4d5c..1953007 100755
--- a/configure
+++ b/configure
@@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.68 for ceph 0.80.7.
+# Generated by GNU Autoconf 2.68 for ceph 0.80.9.
 #
 # Report bugs to <ceph-devel at vger.kernel.org>.
 #
@@ -570,8 +570,8 @@ MAKEFLAGS=
 # Identity of this package.
 PACKAGE_NAME='ceph'
 PACKAGE_TARNAME='ceph'
-PACKAGE_VERSION='0.80.7'
-PACKAGE_STRING='ceph 0.80.7'
+PACKAGE_VERSION='0.80.9'
+PACKAGE_STRING='ceph 0.80.9'
 PACKAGE_BUGREPORT='ceph-devel at vger.kernel.org'
 PACKAGE_URL=''
 
@@ -1441,7 +1441,7 @@ if test "$ac_init_help" = "long"; then
   # Omit some internal or obsolete options to make the list less imposing.
   # This message is too long to be a string in the A/UX 3.1 sh.
   cat <<_ACEOF
-\`configure' configures ceph 0.80.7 to adapt to many kinds of systems.
+\`configure' configures ceph 0.80.9 to adapt to many kinds of systems.
 
 Usage: $0 [OPTION]... [VAR=VALUE]...
 
@@ -1512,7 +1512,7 @@ fi
 
 if test -n "$ac_init_help"; then
   case $ac_init_help in
-     short | recursive ) echo "Configuration of ceph 0.80.7:";;
+     short | recursive ) echo "Configuration of ceph 0.80.9:";;
    esac
   cat <<\_ACEOF
 
@@ -1657,7 +1657,7 @@ fi
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
   cat <<\_ACEOF
-ceph configure 0.80.7
+ceph configure 0.80.9
 generated by GNU Autoconf 2.68
 
 Copyright (C) 2010 Free Software Foundation, Inc.
@@ -2682,7 +2682,7 @@ cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.
 
-It was created by ceph $as_me 0.80.7, which was
+It was created by ceph $as_me 0.80.9, which was
 generated by GNU Autoconf 2.68.  Invocation command line was
 
   $ $0 $@
@@ -4682,7 +4682,7 @@ fi
 
 # Define the identity of the package.
  PACKAGE='ceph'
- VERSION='0.80.7'
+ VERSION='0.80.9'
 
 
 cat >>confdefs.h <<_ACEOF
@@ -12660,7 +12660,7 @@ fi
 
 # Define the identity of the package.
  PACKAGE='ceph'
- VERSION='0.80.7'
+ VERSION='0.80.9'
 
 
 cat >>confdefs.h <<_ACEOF
@@ -22464,7 +22464,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by ceph $as_me 0.80.7, which was
+This file was extended by ceph $as_me 0.80.9, which was
 generated by GNU Autoconf 2.68.  Invocation command line was
 
   CONFIG_FILES    = $CONFIG_FILES
@@ -22530,7 +22530,7 @@ _ACEOF
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
 ac_cs_version="\\
-ceph config.status 0.80.7
+ceph config.status 0.80.9
 configured by $0, generated by GNU Autoconf 2.68,
   with options \\"\$ac_cs_config\\"
 
diff --git a/configure.ac b/configure.ac
index 7255c7c..4621738 100644
--- a/configure.ac
+++ b/configure.ac
@@ -8,7 +8,7 @@ AC_PREREQ(2.59)
 # VERSION define is not used by the code.  It gets a version string
 # from 'git describe'; see src/ceph_ver.[ch]
 
-AC_INIT([ceph], [0.80.7], [ceph-devel at vger.kernel.org])
+AC_INIT([ceph], [0.80.9], [ceph-devel at vger.kernel.org])
 
 # Create release string.  Used with VERSION for RPMs.
 RPM_RELEASE=0
diff --git a/man/ceph.8 b/man/ceph.8
index 9bb903c..5f7b8dc 100644
--- a/man/ceph.8
+++ b/man/ceph.8
@@ -1,8 +1,8 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CEPH" "8" "January 12, 2014" "dev" "Ceph"
+.TH "CEPH" "8" "December 18, 2014" "dev" "Ceph"
 .SH NAME
-ceph \- ceph file system control utility
+ceph \- ceph administration tool
 .
 .nr rst2man-indent-level 0
 .
@@ -59,103 +59,2320 @@ level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
 ..
 .SH SYNOPSIS
 .nf
-\fBceph\fP [ \-m \fImonaddr\fP ] [ \-w | \fIcommand\fP ... ]
+\fBceph\fP \fBauth\fP \fIadd\fP \fI<entity>\fP {\fI<caps>\fP [\fI<caps>\fP\&...]}
+.fi
+.sp
+.nf
+\fBceph\fP \fBauth\fP \fIexport\fP \fI<entity>\fP
+.fi
+.sp
+.nf
+\fBceph\fP \fBconfig\-key\fP \fIget\fP \fI<key>\fP
+.fi
+.sp
+.nf
+\fBceph\fP \fBmds\fP \fIadd_data_pool\fP \fI<pool>\fP
+.fi
+.sp
+.nf
+\fBceph\fP \fBmds\fP \fIgetmap\fP {\fI<int[0\-]>\fP}
+.fi
+.sp
+.nf
+\fBceph\fP \fBmon\fP \fIadd\fP \fI<name>\fP <\fIIPaddr[:port]\fP>
+.fi
+.sp
+.nf
+\fBceph\fP \fBmon_status\fP
+.fi
+.sp
+.nf
+\fBceph\fP \fBosd\fP \fIcreate\fP {\fI<uuid>\fP}
+.fi
+.sp
+.nf
+\fBceph\fP \fBosd\fP \fBcrush\fP \fIadd\fP \fI<osdname (id|osd.id)>\fP
+\fI<float[0.0\-]>\fP \fI<args>\fP [\fI<args>\fP\&...]
+.fi
+.sp
+.
+.nf
+\fBceph\fP \fBpg\fP \fIforce_create_pg\fP \fI<pgid>\fP
+.fi
+.sp
+.nf
+\fBceph\fP \fBpg\fP \fIstat\fP
+.fi
+.sp
+.nf
+\fBceph\fP \fBquorum_status\fP
 .fi
 .sp
 .SH DESCRIPTION
 .sp
-\fBceph\fP is a control utility for communicating with the monitor
-cluster of a running Ceph distributed storage system.
+\fBceph\fP is a control utility which is used for manual deployment and maintenance
+of a Ceph cluster. It provides a diverse set of commands that allow deployment of
+monitors, OSDs, placement groups and MDS, as well as overall maintenance and
+administration of the cluster.
+.SH COMMANDS
+.SS auth
 .sp
-There are three basic modes of operation.
-.SS Interactive mode
+Manage authentication keys. It is used for adding, removing, exporting
+or updating of authentication keys for a particular  entity such as a monitor or
+OSD. It uses some additional subcommands.
 .sp
-To start in interactive mode, no arguments are necessary. Control\-d or
-\(aqquit\(aq will exit.
-.SS Watch mode
+Subcommand \fBadd\fP adds authentication info for a particular entity from input
+file, or random key if no input given and/or any caps specified in the command.
 .sp
-Watch mode shows cluster state changes as they occur. For example:
+Usage:
 .INDENT 0.0
 .INDENT 3.5
 .sp
 .nf
 .ft C
-ceph \-w
+ceph auth add <entity> {<caps> [<caps>...]}
 .ft P
 .fi
 .UNINDENT
 .UNINDENT
-.SS Command line mode
 .sp
-Finally, to send a single instruction to the monitor cluster (and wait
-for a response), the command can be specified on the command line.
-.SH OPTIONS
+Subcommand \fBcaps\fP updates caps for \fBname\fP from caps specified in the command.
+.sp
+Usage:
 .INDENT 0.0
-.TP
-.B \-i infile
-will specify an input file to be passed along as a payload with the
-command to the monitor cluster. This is only used for specific
-monitor commands.
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph auth caps <entity> <caps> [<caps>...]
+.ft P
+.fi
+.UNINDENT
 .UNINDENT
+.sp
+Subcommand \fBdel\fP deletes all caps for \fBname\fP\&.
+.sp
+Usage:
 .INDENT 0.0
-.TP
-.B \-o outfile
-will write any payload returned by the monitor cluster with its
-reply to outfile.  Only specific monitor commands (e.g. osd getmap)
-return a payload.
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph auth del <entity>
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBexport\fP writes keyring for requested entity, or master keyring if
+none given.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph auth export {<entity>}
+.ft P
+.fi
 .UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBget\fP writes keyring file with requested key.
+.sp
+Usage:
 .INDENT 0.0
-.TP
-.B \-c ceph.conf, \-\-conf=ceph.conf
-Use ceph.conf configuration file instead of the default
-/etc/ceph/ceph.conf to determine monitor addresses during startup.
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph auth get <entity>
+.ft P
+.fi
 .UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBget\-key\fP displays requested key.
+.sp
+Usage:
 .INDENT 0.0
-.TP
-.B \-m monaddress[:port]
-Connect to specified monitor (instead of looking through ceph.conf).
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph auth get\-key <entity>
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBget\-or\-create\fP adds authentication info for a particular entity
+from input file, or random key if no input given and/or any caps specified in the
+command.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph auth get\-or\-create <entity> {<caps> [<caps>...]}
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBget\-or\-create\-key\fP gets or adds key for \fBname\fP from system/caps
+pairs specified in the command.  If key already exists, any given caps must match
+the existing caps for that key.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph auth get\-or\-create\-key <entity> {<caps> [<caps>...]}
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBimport\fP reads keyring from input file.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph auth import
+.ft P
+.fi
+.UNINDENT
 .UNINDENT
-.SH EXAMPLES
 .sp
-To grab a copy of the current OSD map:
+Subcommand \fBlist\fP lists authentication state.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph auth list
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBprint\-key\fP displays requested key.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph auth print\-key <entity>
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBprint_key\fP displays requested key.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph auth print_key <entity>
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.SS compact
+.sp
+Causes compaction of monitor\(aqs leveldb storage.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph compact
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.SS config\-key
+.sp
+Manage configuration key. It uses some additional subcommands.
+.sp
+Subcommand \fBget\fP gets the configuration key.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph config\-key get <key>
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBput\fP puts configuration key and values.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph config\-key put <key> {<val>}
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBexists\fP checks for a configuration key\(aqs existence.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph config\-key exists <key>
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBlist\fP lists configuration keys.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph config\-key list
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBdel\fP deletes configuration key.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph config\-key del <key>
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.SS df
+.sp
+Show cluster\(aqs free space status.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph df
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.SS fsid
+.sp
+Show cluster\(aqs FSID/UUID.
+.sp
+Usage:
 .INDENT 0.0
 .INDENT 3.5
 .sp
 .nf
 .ft C
-ceph \-m 1.2.3.4:6789 osd getmap \-o osdmap
+ceph fsid
 .ft P
 .fi
 .UNINDENT
 .UNINDENT
+.SS health
 .sp
-To get a dump of placement group (PG) state:
+Show cluster\(aqs health.
+.sp
+Usage:
 .INDENT 0.0
 .INDENT 3.5
 .sp
 .nf
 .ft C
-ceph pg dump \-o pg.txt
+ceph health
 .ft P
 .fi
 .UNINDENT
 .UNINDENT
-.SH MONITOR COMMANDS
+.SS heap
+.sp
+Show heap usage info (available only if compiled with tcmalloc)
 .sp
-A more complete summary of commands understood by the monitor cluster can be found in the
-online documentation, at
+Usage:
 .INDENT 0.0
 .INDENT 3.5
-\fI\%http://ceph.com/docs/master/rados/operations/control\fP
+.sp
+.nf
+.ft C
+ceph heap dump|start_profiler|stop_profiler|release|stats
+.ft P
+.fi
+.UNINDENT
 .UNINDENT
+.SS injectargs
+.sp
+Inject configuration arguments into monitor.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph injectargs <injected_args> [<injected_args>...]
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.SS log
+.sp
+Log supplied text to the monitor log.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph log <logtext> [<logtext>...]
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.SS mds
+.sp
+Manage metadata server configuration and administration. It uses some
+additional subcommands.
+.sp
+Subcommand \fBadd_data_pool\fP adds data pool.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph mds add_data_pool <pool>
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBcluster_down\fP takes mds cluster down.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph mds cluster_down
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBcluster_up\fP brings mds cluster up.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph mds cluster_up
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBcompat\fP manages compatible features. It uses some additional
+subcommands.
+.sp
+Subcommand \fBrm_compat\fP removes compatible feature.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph mds compat rm_compat <int[0\-]>
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBrm_incompat\fP removes incompatible feature.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph mds compat rm_incompat <int[0\-]>
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBshow\fP shows mds compatibility settings.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph mds compat show
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBdeactivate\fP stops mds.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph mds deactivate <who>
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBdump\fP dumps information, optionally from epoch.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph mds dump {<int[0\-]>}
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBfail\fP forces mds to status fail.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph mds fail <who>
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBgetmap\fP gets MDS map, optionally from epoch.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph mds getmap {<int[0\-]>}
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBnewfs\fP makes new filesystem using pools <metadata> and <data>.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph mds newfs <int[0\-]> <int[0\-]> {\-\-yes\-i\-really\-mean\-it}
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBremove_data_pool\fP removes data pool.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph mds remove_data_pool <pool>
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBrm\fP removes inactive mds.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph mds rm <int[0\-]> <name (type.id)>
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBrmfailed\fP removes failed mds.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph mds rmfailed <int[0\-]>
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBset_max_mds\fP sets max MDS index.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph mds set_max_mds <int[0\-]>
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBset_state\fP sets mds state of <gid> to <numeric\-state>.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph mds set_state <int[0\-]> <int[0\-20]>
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBsetmap\fP sets mds map; must supply correct epoch number.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph mds setmap <int[0\-]>
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBstat\fP shows MDS status.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph mds stat
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBstop\fP stops mds.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph mds stop <who>
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBtell\fP sends command to particular mds.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph mds tell <who> <args> [<args>...]
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.SS mon
+.sp
+Manage monitor configuration and administration. It uses some additional
+subcommands.
+.sp
+Subcommand \fBadd\fP adds new monitor named <name> at <addr>.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph mon add <name> <IPaddr[:port]>
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBdump\fP dumps formatted monmap (optionally from epoch)
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph mon dump {<int[0\-]>}
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBgetmap\fP gets monmap.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph mon getmap {<int[0\-]>}
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBremove\fP removes monitor named <name>.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph mon remove <name>
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBstat\fP summarizes monitor status.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph mon stat
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBmon_status\fP reports status of monitors.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph mon_status
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.SS osd
+.sp
+Manage OSD configuration and administration. It uses some additional
+subcommands.
+.sp
+Subcommand \fBcreate\fP creates new osd (with optional UUID).
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph osd create {<uuid>}
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBcrush\fP is used for CRUSH management. It uses some additional
+subcommands.
+.sp
+Subcommand \fBadd\fP adds or updates crushmap position and weight for <name> with
+<weight> and location <args>.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph osd crush add <osdname (id|osd.id)> <float[0.0\-]> <args> [<args>...]
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBadd\-bucket\fP adds no\-parent (probably root) crush bucket <name> of
+type <type>.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph osd crush add\-bucket <name> <type>
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBcreate\-or\-move\fP creates entry or moves existing entry for <name>
+<weight> at/to location <args>.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph osd crush create\-or\-move <osdname (id|osd.id)> <float[0.0\-]> <args>
+[<args>...]
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.
+.sp
+Subcommand \fBdump\fP dumps crush map.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph osd crush dump
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBlink\fP links existing entry for <name> under location <args>.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph osd crush link <name> <args> [<args>...]
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBmove\fP moves existing entry for <name> to location <args>.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph osd crush move <name> <args> [<args>...]
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBremove\fP removes <name> from crush map (everywhere, or just at
+<ancestor>).
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph osd crush remove <name> {<ancestor>}
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBreweight\fP changes <name>\(aqs weight to <weight> in crush map.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph osd crush reweight <name> <float[0.0\-]>
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBrm\fP removes <name> from crush map (everywhere, or just at
+<ancestor>).
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph osd crush rm <name> {<ancestor>}
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBrule\fP is used for creating crush rules. It uses some additional
+subcommands.
+.sp
+Subcommand \fBcreate\-erasure\fP creates crush rule <name> for erasure coded pool
+created with <profile> (default default).
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph osd crush rule create\-erasure <name> {<profile>}
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBcreate\-simple\fP creates crush rule <name> to start from <root>,
+replicate across buckets of type <type>, using a choose mode of <firstn|indep>
+(default firstn; indep best for erasure pools).
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph osd crush rule create\-simple <name> <root> <type> {firstn|indep}
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBdump\fP dumps crush rule <name> (default all).
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph osd crush rule dump {<name>}
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBlist\fP lists crush rules.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph osd crush rule list
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBls\fP lists crush rules.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph osd crush rule ls
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBrm\fP removes crush rule <name>.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph osd crush rule rm <name>
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBset\fP sets crush map from input file.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph osd crush set
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBset\fP with osdname/osd.id updates crushmap position and weight
+for <name> to <weight> with location <args>.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph osd crush set <osdname (id|osd.id)> <float[0.0\-]> <args> [<args>...]
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBshow\-tunables\fP shows current crush tunables.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph osd crush show\-tunables
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBtunables\fP sets crush tunables values to <profile>.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph osd crush tunables legacy|argonaut|bobtail|firefly|optimal|default
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBunlink\fP unlinks <name> from crush map (everywhere, or just at
+<ancestor>).
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph osd crush unlink <name> {<ancestor>}
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBdeep\-scrub\fP initiates deep scrub on specified osd.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph osd deep\-scrub <who>
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBdown\fP sets osd(s) <id> [<id>...] down.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph osd down <ids> [<ids>...]
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBdump\fP prints summary of OSD map.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph osd dump {<int[0\-]>}
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBerasure\-code\-profile\fP is used for managing the erasure code
+profiles. It uses some additional subcommands.
+.sp
+Subcommand \fBget\fP gets erasure code profile <name>.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph osd erasure\-code\-profile get <name>
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBls\fP lists all erasure code profiles.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph osd erasure\-code\-profile ls
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBrm\fP removes erasure code profile <name>.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph osd erasure\-code\-profile rm <name>
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBset\fP creates erasure code profile <name> with [<key[=value]> ...]
+pairs. Add a \-\-force at the end to override an existing profile (IT IS RISKY).
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph osd erasure\-code\-profile set <name> {<profile> [<profile>...]}
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBfind\fP finds osd <id> in the CRUSH map and shows its location.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph osd find <int[0\-]>
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBgetcrushmap\fP gets CRUSH map.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph osd getcrushmap {<int[0\-]>}
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBgetmap\fP gets OSD map.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph osd getmap {<int[0\-]>}
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBgetmaxosd\fP shows largest OSD id.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph osd getmaxosd
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBin\fP sets osd(s) <id> [<id>...] in.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph osd in <ids> [<ids>...]
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBlost\fP marks osd as permanently lost. THIS DESTROYS DATA IF NO
+MORE REPLICAS EXIST, BE CAREFUL.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph osd lost <int[0\-]> {\-\-yes\-i\-really\-mean\-it}
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBls\fP shows all OSD ids.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph osd ls {<int[0\-]>}
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBlspools\fP lists pools.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph osd lspools {<int>}
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBmap\fP finds pg for <object> in <pool>.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph osd map <poolname> <objectname>
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBmetadata\fP fetches metadata for osd <id>.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph osd metadata <int[0\-]>
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBout\fP sets osd(s) <id> [<id>...] out.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph osd out <ids> [<ids>...]
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBpause\fP pauses osd.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph osd pause
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBperf\fP prints dump of OSD perf summary stats.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph osd perf
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBpg\-temp\fP sets pg_temp mapping pgid:[<id> [<id>...]] (developers
+only).
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph osd pg\-temp <pgid> {<id> [<id>...]}
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBpool\fP is used for managing data pools. It uses some additional
+subcommands.
+.sp
+Subcommand \fBcreate\fP creates pool.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph osd pool create <poolname> <int[0\-]> {<int[0\-]>} {replicated|erasure}
+{<erasure_code_profile>} {<ruleset>}
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBdelete\fP deletes pool.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph osd pool delete <poolname> {<poolname>} {\-\-yes\-i\-really\-really\-mean\-it}
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBget\fP gets pool parameter <var>.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph osd pool get <poolname> size|min_size|crash_replay_interval|pg_num|
+pgp_num|crush_ruleset|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp
+
+ceph osd pool get <poolname> auid|target_max_objects|target_max_bytes
+
+ceph osd pool get <poolname> cache_target_dirty_ratio|cache_target_full_ratio
+
+ceph osd pool get <poolname> cache_min_flush_age|cache_min_evict_age|
+erasure_code_profile
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBget\-quota\fP obtains object or byte limits for pool.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph osd pool get\-quota <poolname>
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBmksnap\fP makes snapshot <snap> in <pool>.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph osd pool mksnap <poolname> <snap>
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBrename\fP renames <srcpool> to <destpool>.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph osd pool rename <poolname> <poolname>
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBrmsnap\fP removes snapshot <snap> from <pool>.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph osd pool rmsnap <poolname> <snap>
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBset\fP sets pool parameter <var> to <val>.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph osd pool set <poolname> size|min_size|crash_replay_interval|pg_num|
+pgp_num|crush_ruleset|hashpspool|hit_set_type|hit_set_period
+
+ceph osd pool set <poolname> hit_set_count|hit_set_fpp|debug_fake_ec_pool
+
+ceph osd pool set <poolname> target_max_bytes|target_max_objects
+
+ceph osd pool set <poolname> cache_target_dirty_ratio|cache_target_full_ratio
+
+ceph osd pool set <poolname> cache_min_flush_age
+
+ceph osd pool set <poolname> cache_min_evict_age|auid <val>
+{\-\-yes\-i\-really\-mean\-it}
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBset\-quota\fP sets object or byte limit on pool.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph osd pool set\-quota <poolname> max_objects|max_bytes <val>
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBstats\fP obtains stats from all pools, or from specified pool.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph osd pool stats {<name>}
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBprimary\-affinity\fP adjusts osd primary\-affinity from 0.0 <= <weight>
+<= 1.0.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph osd primary\-affinity <osdname (id|osd.id)> <float[0.0\-1.0]>
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBprimary\-temp\fP sets primary_temp mapping pgid:<id>|\-1 (developers
+only).
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph osd primary\-temp <pgid> <id>
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBrepair\fP initiates repair on a specified osd.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph osd repair <who>
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBreweight\fP reweights osd to 0.0 < <weight> < 1.0.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph osd reweight <int[0\-]> <float[0.0\-1.0]>
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBreweight\-by\-utilization\fP reweights OSDs by utilization
+[overload\-percentage\-for\-consideration, default 120].
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph osd reweight\-by\-utilization {<int[100\-]>}
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBrm\fP removes osd(s) <id> [<id>...] in the cluster.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph osd rm <ids> [<ids>...]
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBscrub\fP initiates scrub on specified osd.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph osd scrub <who>
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBset\fP sets <key>.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph osd set pause|noup|nodown|noout|noin|nobackfill|norecover|noscrub|
+nodeep\-scrub|notieragent
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBsetcrushmap\fP sets crush map from input file.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph osd setcrushmap
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBsetmaxosd\fP sets new maximum osd value.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph osd setmaxosd <int[0\-]>
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBstat\fP prints summary of OSD map.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph osd stat
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBthrash\fP thrashes OSDs for <num_epochs>.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph osd thrash <int[0\-]>
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBtier\fP is used for managing tiers. It uses some additional
+subcommands.
+.sp
+Subcommand \fBadd\fP adds the tier <tierpool> (the second one) to base pool <pool>
+(the first one).
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph osd tier add <poolname> <poolname> {\-\-force\-nonempty}
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBadd\-cache\fP adds a cache <tierpool> (the second one) of size <size>
+to existing pool <pool> (the first one).
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph osd tier add\-cache <poolname> <poolname> <int[0\-]>
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBcache\-mode\fP specifies the caching mode for cache tier <pool>.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph osd tier cache\-mode <poolname> none|writeback|forward|readonly
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBremove\fP removes the tier <tierpool> (the second one) from base pool
+<pool> (the first one).
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph osd tier remove <poolname> <poolname>
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBremove\-overlay\fP removes the overlay pool for base pool <pool>.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph osd tier remove\-overlay <poolname>
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBset\-overlay\fP sets the overlay pool for base pool <pool> to be
+<overlaypool>.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph osd tier set\-overlay <poolname> <poolname>
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBtree\fP prints OSD tree.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph osd tree {<int[0\-]>}
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBunpause\fP unpauses osd.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph osd unpause
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBunset\fP unsets <key>.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph osd unset pause|noup|nodown|noout|noin|nobackfill|norecover|noscrub|
+nodeep\-scrub|notieragent
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.SS pg
+.sp
+It is used for managing the placement groups in OSDs. It uses some
+additional subcommands.
+.sp
+Subcommand \fBdebug\fP shows debug info about pgs.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph pg debug unfound_objects_exist|degraded_pgs_exist
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBdeep\-scrub\fP starts deep\-scrub on <pgid>.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph pg deep\-scrub <pgid>
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBdump\fP shows human\-readable versions of pg map (only \(aqall\(aq valid
+with plain).
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph pg dump {all|summary|sum|delta|pools|osds|pgs|pgs_brief}
+
+ceph pg dump {all|summary|sum|delta|pools|osds|pgs|pgs_brief...}
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBdump_json\fP shows human\-readable version of pg map in json only.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph pg dump_json {all|summary|sum|pools|osds|pgs[all|summary|sum|pools|
+osds|pgs...]}
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBdump_pools_json\fP shows pg pools info in json only.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph pg dump_pools_json
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBdump_stuck\fP shows information about stuck pgs.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph pg dump_stuck {inactive|unclean|stale[inactive|unclean|stale...]}
+{<int>}
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBforce_create_pg\fP forces creation of pg <pgid>.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph pg force_create_pg <pgid>
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBgetmap\fP gets binary pg map to \-o/stdout.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph pg getmap
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBmap\fP shows mapping of pg to osds.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph pg map <pgid>
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBrepair\fP starts repair on <pgid>.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph pg repair <pgid>
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBscrub\fP starts scrub on <pgid>.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph pg scrub <pgid>
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBsend_pg_creates\fP triggers pg creates to be issued.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph pg send_pg_creates
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBset_full_ratio\fP sets ratio at which pgs are considered full.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph pg set_full_ratio <float[0.0\-1.0]>
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBset_nearfull_ratio\fP sets ratio at which pgs are considered nearly
+full.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph pg set_nearfull_ratio <float[0.0\-1.0]>
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Subcommand \fBstat\fP shows placement group status.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph pg stat
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.SS quorum
+.sp
+Enter or exit quorum.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph quorum enter|exit
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.SS quorum_status
+.sp
+Reports status of monitor quorum.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph quorum_status
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.SS report
+.sp
+Reports full status of cluster, with optional title tag strings.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph report {<tags> [<tags>...]}
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.SS scrub
+.sp
+Scrubs the monitor stores.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph scrub
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.SS status
+.sp
+Shows cluster status.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph status
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.SS sync force
+.sp
+Forces a sync of, and clears, the monitor store.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph sync force {\-\-yes\-i\-really\-mean\-it} {\-\-i\-know\-what\-i\-am\-doing}
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.SS tell
+.sp
+Sends a command to a specific daemon.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph tell <name (type.id)> <args> [<args>...]
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.SH OPTIONS
+.INDENT 0.0
+.TP
+.B \-i infile
+will specify an input file to be passed along as a payload with the
+command to the monitor cluster. This is only used for specific
+monitor commands.
+.UNINDENT
+.INDENT 0.0
+.TP
+.B \-o outfile
+will write any payload returned by the monitor cluster with its
+reply to outfile.  Only specific monitor commands (e.g. osd getmap)
+return a payload.
+.UNINDENT
+.INDENT 0.0
+.TP
+.B \-c ceph.conf, \-\-conf=ceph.conf
+Use ceph.conf configuration file instead of the default
+\fB/etc/ceph/ceph.conf\fP to determine monitor addresses during startup.
+.UNINDENT
+.INDENT 0.0
+.TP
+.B \-\-id CLIENT_ID, \-\-user CLIENT_ID
+Client id for authentication.
+.UNINDENT
+.INDENT 0.0
+.TP
+.B \-\-name CLIENT_NAME, \-n CLIENT_NAME
+Client name for authentication.
+.UNINDENT
+.INDENT 0.0
+.TP
+.B \-\-cluster CLUSTER
+Name of the Ceph cluster.
+.UNINDENT
+.INDENT 0.0
+.TP
+.B \-\-admin\-daemon ADMIN_SOCKET
+Submit admin\-socket commands.
+.UNINDENT
+.INDENT 0.0
+.TP
+.B \-\-admin\-socket ADMIN_SOCKET_NOPE
+You probably mean \-\-admin\-daemon.
+.UNINDENT
+.INDENT 0.0
+.TP
+.B \-s, \-\-status
+Show cluster status.
+.UNINDENT
+.INDENT 0.0
+.TP
+.B \-w, \-\-watch
+Watch live cluster changes.
+.UNINDENT
+.INDENT 0.0
+.TP
+.B \-\-watch\-debug
+Watch debug events.
+.UNINDENT
+.INDENT 0.0
+.TP
+.B \-\-watch\-info
+Watch info events.
+.UNINDENT
+.INDENT 0.0
+.TP
+.B \-\-watch\-sec
+Watch security events.
+.UNINDENT
+.INDENT 0.0
+.TP
+.B \-\-watch\-warn
+Watch warning events.
+.UNINDENT
+.INDENT 0.0
+.TP
+.B \-\-watch\-error
+Watch error events.
+.UNINDENT
+.INDENT 0.0
+.TP
+.B \-\-version, \-v
+Display version.
+.UNINDENT
+.INDENT 0.0
+.TP
+.B \-\-verbose
+Make verbose.
+.UNINDENT
+.INDENT 0.0
+.TP
+.B \-\-concise
+Make less verbose.
+.UNINDENT
+.INDENT 0.0
+.TP
+.B \-f {json,json\-pretty,xml,xml\-pretty,plain}, \-\-format
+Format of output.
+.UNINDENT
+.INDENT 0.0
+.TP
+.B \-\-connect\-timeout CLUSTER_TIMEOUT
+Set a timeout for connecting to the cluster.
 .UNINDENT
 .SH AVAILABILITY
 .sp
-\fBceph\fP is part of the Ceph distributed storage system. Please refer to the Ceph documentation at
-\fI\%http://ceph.com/docs\fP for more information.
+\fBceph\fP is a part of the Ceph distributed storage system. Please refer to
+the Ceph documentation at \fI\%http://ceph.com/docs\fP for more information.
 .SH SEE ALSO
 .sp
-\fBceph\fP(8),
+\fBceph\-mon\fP(8),
+\fBceph\-osd\fP(8),
+\fBceph\-mds\fP(8)
 .SH COPYRIGHT
 2010-2014, Inktank Storage, Inc. and contributors. Licensed under Creative Commons BY-SA
 .\" Generated by docutils manpage writer.
diff --git a/src/.git_version b/src/.git_version
index 1727fed..b5dcc6d 100644
--- a/src/.git_version
+++ b/src/.git_version
@@ -1,2 +1,2 @@
-6c0127fcb58008793d3c8b62d925bc91963672a3
-v0.80.7
+b5a67f0e1d15385bc0d60a6da6e7fc810bde6047
+v0.80.9
diff --git a/src/Makefile-env.am b/src/Makefile-env.am
index d62247b..b45b156 100644
--- a/src/Makefile-env.am
+++ b/src/Makefile-env.am
@@ -150,6 +150,7 @@ LIBCLIENT = libclient.la
 LIBCLIENT_FUSE = libclient_fuse.la
 LIBRADOS = librados.la
 LIBRGW = librgw.la
+LIBCIVETWEB = libcivetweb.la
 LIBRBD = librbd.la
 LIBCEPHFS = libcephfs.la
 LIBERASURE_CODE = liberasure_code.la
diff --git a/src/Makefile.am b/src/Makefile.am
index edec05e..9c394e8 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -258,6 +258,7 @@ TESTS = \
 	$(check_SCRIPTS)
 
 check-local:
+	$(top_srcdir)/qa/workunits/erasure-code/encode-decode-non-regression.sh 
 	$(srcdir)/test/encoding/readable.sh ../ceph-object-corpus
 
 
diff --git a/src/Makefile.in b/src/Makefile.in
index afa524b..5d9ea8b 100644
--- a/src/Makefile.in
+++ b/src/Makefile.in
@@ -69,7 +69,8 @@ bin_PROGRAMS = $(am__EXEEXT_9) $(am__EXEEXT_10) ceph-dencoder$(EXEEXT) \
 	ceph-mon$(EXEEXT) ceph-osd$(EXEEXT) ceph-mds$(EXEEXT) \
 	cephfs$(EXEEXT) librados-config$(EXEEXT) ceph-syn$(EXEEXT) \
 	$(am__EXEEXT_12) $(am__EXEEXT_13)
-noinst_PROGRAMS = get_command_descriptions$(EXEEXT)
+noinst_PROGRAMS = ceph_erasure_code_non_regression$(EXEEXT) \
+	get_command_descriptions$(EXEEXT)
 sbin_PROGRAMS =
 su_sbin_PROGRAMS = $(am__EXEEXT_14)
 check_PROGRAMS = unittest_erasure_code_plugin$(EXEEXT) \
@@ -88,20 +89,21 @@ check_PROGRAMS = unittest_erasure_code_plugin$(EXEEXT) \
 	unittest_base64$(EXEEXT) unittest_ceph_argparse$(EXEEXT) \
 	unittest_ceph_compatset$(EXEEXT) unittest_osd_types$(EXEEXT) \
 	unittest_pglog$(EXEEXT) unittest_ecbackend$(EXEEXT) \
-	unittest_hitset$(EXEEXT) unittest_gather$(EXEEXT) \
-	unittest_run_cmd$(EXEEXT) unittest_signals$(EXEEXT) \
-	unittest_simple_spin$(EXEEXT) unittest_librados$(EXEEXT) \
-	unittest_bufferlist$(EXEEXT) unittest_crc32c$(EXEEXT) \
-	unittest_arch$(EXEEXT) unittest_crypto$(EXEEXT) \
-	unittest_crypto_init$(EXEEXT) unittest_perf_counters$(EXEEXT) \
-	unittest_admin_socket$(EXEEXT) unittest_ceph_crypto$(EXEEXT) \
-	unittest_utf8$(EXEEXT) unittest_mime$(EXEEXT) \
-	unittest_escape$(EXEEXT) unittest_chain_xattr$(EXEEXT) \
-	unittest_flatindex$(EXEEXT) unittest_strtol$(EXEEXT) \
-	unittest_confutils$(EXEEXT) unittest_config$(EXEEXT) \
-	unittest_context$(EXEEXT) unittest_heartbeatmap$(EXEEXT) \
-	unittest_formatter$(EXEEXT) unittest_libcephfs_config$(EXEEXT) \
-	unittest_lfnindex$(EXEEXT) unittest_librados_config$(EXEEXT) \
+	unittest_hitset$(EXEEXT) unittest_io_priority$(EXEEXT) \
+	unittest_gather$(EXEEXT) unittest_run_cmd$(EXEEXT) \
+	unittest_signals$(EXEEXT) unittest_simple_spin$(EXEEXT) \
+	unittest_librados$(EXEEXT) unittest_bufferlist$(EXEEXT) \
+	unittest_crc32c$(EXEEXT) unittest_arch$(EXEEXT) \
+	unittest_crypto$(EXEEXT) unittest_crypto_init$(EXEEXT) \
+	unittest_perf_counters$(EXEEXT) unittest_admin_socket$(EXEEXT) \
+	unittest_ceph_crypto$(EXEEXT) unittest_utf8$(EXEEXT) \
+	unittest_mime$(EXEEXT) unittest_escape$(EXEEXT) \
+	unittest_chain_xattr$(EXEEXT) unittest_flatindex$(EXEEXT) \
+	unittest_strtol$(EXEEXT) unittest_confutils$(EXEEXT) \
+	unittest_config$(EXEEXT) unittest_context$(EXEEXT) \
+	unittest_heartbeatmap$(EXEEXT) unittest_formatter$(EXEEXT) \
+	unittest_libcephfs_config$(EXEEXT) unittest_lfnindex$(EXEEXT) \
+	unittest_librados_config$(EXEEXT) \
 	unittest_daemon_config$(EXEEXT) unittest_osd_osdcap$(EXEEXT) \
 	unittest_mon_moncap$(EXEEXT) unittest_mon_pgmap$(EXEEXT) \
 	unittest_ipaddr$(EXEEXT) unittest_texttable$(EXEEXT) \
@@ -142,7 +144,7 @@ check_PROGRAMS = unittest_erasure_code_plugin$(EXEEXT) \
 @LINUX_TRUE@am__append_32 = -lrt
 @LINUX_TRUE@am__append_33 = -export-symbols-regex '^rados_.*'
 @LINUX_TRUE@am__append_34 = -export-symbols-regex '^rbd_.*'
-@WITH_RADOSGW_TRUE@am__append_35 = librgw.la
+@WITH_RADOSGW_TRUE@am__append_35 = librgw.la libcivetweb.la
 @WITH_RADOSGW_TRUE@am__append_36 = \
 @WITH_RADOSGW_TRUE@	$(LIBRADOS) \
 @WITH_RADOSGW_TRUE@	libcls_rgw_client.la \
@@ -177,36 +179,37 @@ check_PROGRAMS = unittest_erasure_code_plugin$(EXEEXT) \
 @LINUX_TRUE@am__append_43 = -ldl
 @LINUX_TRUE@am__append_44 = -ldl
 @LINUX_TRUE@am__append_45 = -ldl
-@COMPILER_HAS_VTA_TRUE@am__append_46 = -fno-var-tracking-assignments
+@LINUX_TRUE@am__append_46 = -ldl
 @COMPILER_HAS_VTA_TRUE@am__append_47 = -fno-var-tracking-assignments
-@WITH_BUILD_TESTS_TRUE@am__append_48 = test_build_libcommon \
+@COMPILER_HAS_VTA_TRUE@am__append_48 = -fno-var-tracking-assignments
+@WITH_BUILD_TESTS_TRUE@am__append_49 = test_build_libcommon \
 @WITH_BUILD_TESTS_TRUE@	test_build_librados test_build_librgw \
 @WITH_BUILD_TESTS_TRUE@	test_build_libcephfs
-@LINUX_TRUE@am__append_49 = ceph_kvstorebench \
+@LINUX_TRUE@am__append_50 = ceph_kvstorebench \
 @LINUX_TRUE@	ceph_test_rados_list_parallel \
 @LINUX_TRUE@	ceph_test_rados_open_pools_parallel \
 @LINUX_TRUE@	ceph_test_rados_delete_pools_parallel \
 @LINUX_TRUE@	ceph_test_rados_watch_notify
-@LINUX_TRUE@am__append_50 = libsystest.la
-@LINUX_TRUE@am__append_51 = -ldl
-@WITH_RADOSGW_TRUE@am__append_52 = ceph_test_cors \
+@LINUX_TRUE@am__append_51 = libsystest.la
+@LINUX_TRUE@am__append_52 = -ldl
+@WITH_RADOSGW_TRUE@am__append_53 = ceph_test_cors \
 @WITH_RADOSGW_TRUE@	ceph_test_rgw_manifest \
 @WITH_RADOSGW_TRUE@	ceph_test_cls_rgw_meta \
 @WITH_RADOSGW_TRUE@	ceph_test_cls_rgw_log \
 @WITH_RADOSGW_TRUE@	ceph_test_cls_rgw_opstate
-@LINUX_TRUE@am__append_53 = ceph_test_librbd_fsx
-@WITH_RADOSGW_TRUE@am__append_54 = ceph_test_cls_rgw
-@LINUX_TRUE@am__append_55 = ceph_test_objectstore
-@LINUX_TRUE@am__append_56 = -ldl
+@LINUX_TRUE@am__append_54 = ceph_test_librbd_fsx
+@WITH_RADOSGW_TRUE@am__append_55 = ceph_test_cls_rgw
+@LINUX_TRUE@am__append_56 = ceph_test_objectstore
 @LINUX_TRUE@am__append_57 = -ldl
-@WITH_REST_BENCH_TRUE@am__append_58 = rest-bench
-@WITH_REST_BENCH_TRUE@@WITH_SYSTEM_LIBS3_TRUE@am__append_59 = -ls3
-@WITH_REST_BENCH_TRUE@@WITH_SYSTEM_LIBS3_FALSE@am__append_60 = libs3/build/lib/libs3.a -lcurl -lxml2
-@WITH_REST_BENCH_TRUE@@WITH_SYSTEM_LIBS3_FALSE@am__append_61 = libs3
-@LINUX_TRUE@am__append_62 = mount.ceph
-@LINUX_TRUE@am__append_63 = rbd
-@WITH_FUSE_TRUE@am__append_64 = ceph-fuse rbd-fuse
-@ENABLE_CEPHFS_JAVA_TRUE@am__append_65 = libcephfs_jni.la
+@LINUX_TRUE@am__append_58 = -ldl
+@WITH_REST_BENCH_TRUE@am__append_59 = rest-bench
+@WITH_REST_BENCH_TRUE@@WITH_SYSTEM_LIBS3_TRUE@am__append_60 = -ls3
+@WITH_REST_BENCH_TRUE@@WITH_SYSTEM_LIBS3_FALSE@am__append_61 = libs3/build/lib/libs3.a -lcurl -lxml2
+@WITH_REST_BENCH_TRUE@@WITH_SYSTEM_LIBS3_FALSE@am__append_62 = libs3
+@LINUX_TRUE@am__append_63 = mount.ceph
+@LINUX_TRUE@am__append_64 = rbd
+@WITH_FUSE_TRUE@am__append_65 = ceph-fuse rbd-fuse
+@ENABLE_CEPHFS_JAVA_TRUE@am__append_66 = libcephfs_jni.la
 subdir = src
 ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
 am__aclocal_m4_deps = $(top_srcdir)/m4/ac_check_classpath.m4 \
@@ -357,6 +360,19 @@ libcephfs_jni_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_CXXFLAGS) $(CXXFLAGS) $(libcephfs_jni_la_LDFLAGS) \
 	$(LDFLAGS) -o $@
 @ENABLE_CEPHFS_JAVA_TRUE@am_libcephfs_jni_la_rpath = -rpath $(libdir)
+libcivetweb_la_LIBADD =
+am__libcivetweb_la_SOURCES_DIST = rgw/rgw_civetweb.cc \
+	rgw/rgw_civetweb_log.cc civetweb/src/civetweb.c
+@WITH_RADOSGW_TRUE@am_libcivetweb_la_OBJECTS =  \
+@WITH_RADOSGW_TRUE@	rgw/libcivetweb_la-rgw_civetweb.lo \
+@WITH_RADOSGW_TRUE@	rgw/libcivetweb_la-rgw_civetweb_log.lo \
+@WITH_RADOSGW_TRUE@	civetweb/src/libcivetweb_la-civetweb.lo
+libcivetweb_la_OBJECTS = $(am_libcivetweb_la_OBJECTS)
+libcivetweb_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
+	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
+	$(libcivetweb_la_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
+	$(LDFLAGS) -o $@
+@WITH_RADOSGW_TRUE@am_libcivetweb_la_rpath =
 libclient_la_DEPENDENCIES = $(LIBOSDC) $(am__DEPENDENCIES_1)
 am_libclient_la_OBJECTS = client/Client.lo client/Inode.lo \
 	client/Dentry.lo client/MetaRequest.lo \
@@ -509,7 +525,7 @@ am_libcommon_la_OBJECTS = ceph_ver.lo common/DecayCounter.lo \
 	common/bloom_filter.lo common/linux_version.lo mon/MonCap.lo \
 	mon/MonClient.lo mon/MonMap.lo osd/OSDMap.lo osd/osd_types.lo \
 	osd/ECMsgTypes.lo osd/HitSet.lo mds/MDSMap.lo \
-	mds/inode_backtrace.lo mds/mdstypes.lo
+	mds/inode_backtrace.lo mds/mdstypes.lo mds/flock.lo
 libcommon_la_OBJECTS = $(am_libcommon_la_OBJECTS)
 libcommon_crc_la_LIBADD =
 am__libcommon_crc_la_SOURCES_DIST = common/sctp_crc32.c \
@@ -707,11 +723,11 @@ am_liblog_la_OBJECTS = log/Log.lo log/SubsystemMap.lo
 liblog_la_OBJECTS = $(am_liblog_la_OBJECTS)
 libmds_la_DEPENDENCIES = $(LIBOSDC)
 am_libmds_la_OBJECTS = mds/Anchor.lo mds/Capability.lo mds/Dumper.lo \
-	mds/Resetter.lo mds/MDS.lo mds/flock.lo mds/locks.lo \
-	mds/journal.lo mds/Server.lo mds/Mutation.lo mds/MDCache.lo \
-	mds/Locker.lo mds/Migrator.lo mds/MDBalancer.lo mds/CDentry.lo \
-	mds/CDir.lo mds/CInode.lo mds/LogEvent.lo mds/MDSTable.lo \
-	mds/InoTable.lo mds/MDSTableClient.lo mds/MDSTableServer.lo \
+	mds/Resetter.lo mds/MDS.lo mds/locks.lo mds/journal.lo \
+	mds/Server.lo mds/Mutation.lo mds/MDCache.lo mds/Locker.lo \
+	mds/Migrator.lo mds/MDBalancer.lo mds/CDentry.lo mds/CDir.lo \
+	mds/CInode.lo mds/LogEvent.lo mds/MDSTable.lo mds/InoTable.lo \
+	mds/MDSTableClient.lo mds/MDSTableServer.lo \
 	mds/AnchorServer.lo mds/AnchorClient.lo mds/SnapRealm.lo \
 	mds/SnapServer.lo mds/snap.lo mds/SessionMap.lo mds/MDLog.lo \
 	mds/MDSUtility.lo
@@ -1073,6 +1089,13 @@ ceph_erasure_code_benchmark_OBJECTS =  \
 ceph_erasure_code_benchmark_DEPENDENCIES = $(am__DEPENDENCIES_10) \
 	$(LIBCOMMON) $(am__DEPENDENCIES_1) $(am__DEPENDENCIES_6) \
 	$(am__DEPENDENCIES_1)
+am_ceph_erasure_code_non_regression_OBJECTS =  \
+	test/erasure-code/ceph_erasure_code_non_regression.$(OBJEXT)
+ceph_erasure_code_non_regression_OBJECTS =  \
+	$(am_ceph_erasure_code_non_regression_OBJECTS)
+ceph_erasure_code_non_regression_DEPENDENCIES =  \
+	$(am__DEPENDENCIES_10) $(LIBCOMMON) $(am__DEPENDENCIES_1) \
+	$(am__DEPENDENCIES_6) $(am__DEPENDENCIES_1)
 am_ceph_filestore_dump_OBJECTS = tools/ceph_filestore_dump.$(OBJEXT)
 ceph_filestore_dump_OBJECTS = $(am_ceph_filestore_dump_OBJECTS)
 ceph_filestore_dump_DEPENDENCIES = $(am__DEPENDENCIES_10) \
@@ -1774,8 +1797,7 @@ am__radosgw_SOURCES_DIST = rgw/rgw_resolve.cc rgw/rgw_rest.cc \
 	rgw/rgw_rest_log.cc rgw/rgw_rest_opstate.cc \
 	rgw/rgw_rest_replica_log.cc rgw/rgw_rest_config.cc \
 	rgw/rgw_http_client.cc rgw/rgw_swift.cc rgw/rgw_swift_auth.cc \
-	rgw/rgw_loadgen.cc rgw/rgw_civetweb.cc civetweb/src/civetweb.c \
-	rgw/rgw_main.cc
+	rgw/rgw_loadgen.cc rgw/rgw_main.cc
 @WITH_RADOSGW_TRUE@am_radosgw_OBJECTS = rgw/rgw_resolve.$(OBJEXT) \
 @WITH_RADOSGW_TRUE@	rgw/rgw_rest.$(OBJEXT) \
 @WITH_RADOSGW_TRUE@	rgw/rgw_rest_swift.$(OBJEXT) \
@@ -1793,11 +1815,9 @@ am__radosgw_SOURCES_DIST = rgw/rgw_resolve.cc rgw/rgw_rest.cc \
 @WITH_RADOSGW_TRUE@	rgw/rgw_swift.$(OBJEXT) \
 @WITH_RADOSGW_TRUE@	rgw/rgw_swift_auth.$(OBJEXT) \
 @WITH_RADOSGW_TRUE@	rgw/rgw_loadgen.$(OBJEXT) \
-@WITH_RADOSGW_TRUE@	rgw/rgw_civetweb.$(OBJEXT) \
-@WITH_RADOSGW_TRUE@	civetweb/src/radosgw-civetweb.$(OBJEXT) \
 @WITH_RADOSGW_TRUE@	rgw/rgw_main.$(OBJEXT)
 radosgw_OBJECTS = $(am_radosgw_OBJECTS)
-@WITH_RADOSGW_TRUE@radosgw_DEPENDENCIES = $(LIBRGW) \
+@WITH_RADOSGW_TRUE@radosgw_DEPENDENCIES = $(LIBRGW) $(LIBCIVETWEB) \
 @WITH_RADOSGW_TRUE@	$(am__DEPENDENCIES_12) \
 @WITH_RADOSGW_TRUE@	$(am__DEPENDENCIES_1) $(am__DEPENDENCIES_6)
 am__radosgw_admin_SOURCES_DIST = rgw/rgw_admin.cc
@@ -1876,7 +1896,7 @@ am__test_build_libcommon_SOURCES_DIST = test/buildtest_skeleton.cc \
 	common/bloom_filter.cc common/linux_version.c mon/MonCap.cc \
 	mon/MonClient.cc mon/MonMap.cc osd/OSDMap.cc osd/osd_types.cc \
 	osd/ECMsgTypes.cc osd/HitSet.cc mds/MDSMap.cc \
-	mds/inode_backtrace.cc mds/mdstypes.cc
+	mds/inode_backtrace.cc mds/mdstypes.cc mds/flock.cc
 am__objects_15 = test_build_libcommon-ceph_ver.$(OBJEXT) \
 	common/test_build_libcommon-DecayCounter.$(OBJEXT) \
 	common/test_build_libcommon-LogClient.$(OBJEXT) \
@@ -1958,7 +1978,8 @@ am__objects_15 = test_build_libcommon-ceph_ver.$(OBJEXT) \
 	osd/test_build_libcommon-HitSet.$(OBJEXT) \
 	mds/test_build_libcommon-MDSMap.$(OBJEXT) \
 	mds/test_build_libcommon-inode_backtrace.$(OBJEXT) \
-	mds/test_build_libcommon-mdstypes.$(OBJEXT)
+	mds/test_build_libcommon-mdstypes.$(OBJEXT) \
+	mds/test_build_libcommon-flock.$(OBJEXT)
 @WITH_BUILD_TESTS_TRUE@am_test_build_libcommon_OBJECTS = test/test_build_libcommon-buildtest_skeleton.$(OBJEXT) \
 @WITH_BUILD_TESTS_TRUE@	$(am__objects_15)
 test_build_libcommon_OBJECTS = $(am_test_build_libcommon_OBJECTS)
@@ -2356,6 +2377,15 @@ unittest_hitset_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(unittest_hitset_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
 	$(LDFLAGS) -o $@
+am_unittest_io_priority_OBJECTS =  \
+	test/common/unittest_io_priority-test_io_priority.$(OBJEXT)
+unittest_io_priority_OBJECTS = $(am_unittest_io_priority_OBJECTS)
+unittest_io_priority_DEPENDENCIES = $(am__DEPENDENCIES_13) \
+	$(am__DEPENDENCIES_6)
+unittest_io_priority_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
+	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
+	$(unittest_io_priority_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
+	$(LDFLAGS) -o $@
 am_unittest_ipaddr_OBJECTS =  \
 	test/unittest_ipaddr-test_ipaddr.$(OBJEXT)
 unittest_ipaddr_OBJECTS = $(am_unittest_ipaddr_OBJECTS)
@@ -2668,11 +2698,12 @@ SOURCES = $(libcls_log_client_a_SOURCES) \
 	$(libcls_version_client_a_SOURCES) $(libos_zfs_a_SOURCES) \
 	$(libarch_la_SOURCES) $(libauth_la_SOURCES) \
 	$(libcephfs_la_SOURCES) $(libcephfs_jni_la_SOURCES) \
-	$(libclient_la_SOURCES) $(libclient_fuse_la_SOURCES) \
-	$(libcls_hello_la_SOURCES) $(libcls_kvs_la_SOURCES) \
-	$(libcls_lock_la_SOURCES) $(libcls_lock_client_la_SOURCES) \
-	$(libcls_log_la_SOURCES) $(libcls_rbd_la_SOURCES) \
-	$(libcls_rbd_client_la_SOURCES) $(libcls_refcount_la_SOURCES) \
+	$(libcivetweb_la_SOURCES) $(libclient_la_SOURCES) \
+	$(libclient_fuse_la_SOURCES) $(libcls_hello_la_SOURCES) \
+	$(libcls_kvs_la_SOURCES) $(libcls_lock_la_SOURCES) \
+	$(libcls_lock_client_la_SOURCES) $(libcls_log_la_SOURCES) \
+	$(libcls_rbd_la_SOURCES) $(libcls_rbd_client_la_SOURCES) \
+	$(libcls_refcount_la_SOURCES) \
 	$(libcls_refcount_client_la_SOURCES) \
 	$(libcls_replica_log_la_SOURCES) $(libcls_rgw_la_SOURCES) \
 	$(libcls_rgw_client_la_SOURCES) $(libcls_statelog_la_SOURCES) \
@@ -2707,6 +2738,7 @@ SOURCES = $(libcls_log_client_a_SOURCES) \
 	$(ceph_syn_SOURCES) $(ceph_bench_log_SOURCES) \
 	$(ceph_dupstore_SOURCES) $(ceph_erasure_code_SOURCES) \
 	$(ceph_erasure_code_benchmark_SOURCES) \
+	$(ceph_erasure_code_non_regression_SOURCES) \
 	$(ceph_filestore_dump_SOURCES) $(ceph_filestore_tool_SOURCES) \
 	$(ceph_kvstorebench_SOURCES) \
 	$(ceph_mon_store_converter_SOURCES) \
@@ -2794,8 +2826,8 @@ SOURCES = $(libcls_log_client_a_SOURCES) \
 	$(unittest_escape_SOURCES) $(unittest_flatindex_SOURCES) \
 	$(unittest_formatter_SOURCES) $(unittest_gather_SOURCES) \
 	$(unittest_heartbeatmap_SOURCES) $(unittest_histogram_SOURCES) \
-	$(unittest_hitset_SOURCES) $(unittest_ipaddr_SOURCES) \
-	$(unittest_lfnindex_SOURCES) \
+	$(unittest_hitset_SOURCES) $(unittest_io_priority_SOURCES) \
+	$(unittest_ipaddr_SOURCES) $(unittest_lfnindex_SOURCES) \
 	$(unittest_libcephfs_config_SOURCES) \
 	$(unittest_librados_SOURCES) \
 	$(unittest_librados_config_SOURCES) $(unittest_log_SOURCES) \
@@ -2821,7 +2853,8 @@ DIST_SOURCES = $(libcls_log_client_a_SOURCES) \
 	$(libcls_version_client_a_SOURCES) \
 	$(am__libos_zfs_a_SOURCES_DIST) $(libarch_la_SOURCES) \
 	$(libauth_la_SOURCES) $(libcephfs_la_SOURCES) \
-	$(am__libcephfs_jni_la_SOURCES_DIST) $(libclient_la_SOURCES) \
+	$(am__libcephfs_jni_la_SOURCES_DIST) \
+	$(am__libcivetweb_la_SOURCES_DIST) $(libclient_la_SOURCES) \
 	$(am__libclient_fuse_la_SOURCES_DIST) \
 	$(libcls_hello_la_SOURCES) $(am__libcls_kvs_la_SOURCES_DIST) \
 	$(libcls_lock_la_SOURCES) $(libcls_lock_client_la_SOURCES) \
@@ -2862,6 +2895,7 @@ DIST_SOURCES = $(libcls_log_client_a_SOURCES) \
 	$(ceph_bench_log_SOURCES) $(ceph_dupstore_SOURCES) \
 	$(ceph_erasure_code_SOURCES) \
 	$(ceph_erasure_code_benchmark_SOURCES) \
+	$(ceph_erasure_code_non_regression_SOURCES) \
 	$(ceph_filestore_dump_SOURCES) $(ceph_filestore_tool_SOURCES) \
 	$(am__ceph_kvstorebench_SOURCES_DIST) \
 	$(ceph_mon_store_converter_SOURCES) \
@@ -2955,8 +2989,8 @@ DIST_SOURCES = $(libcls_log_client_a_SOURCES) \
 	$(unittest_escape_SOURCES) $(unittest_flatindex_SOURCES) \
 	$(unittest_formatter_SOURCES) $(unittest_gather_SOURCES) \
 	$(unittest_heartbeatmap_SOURCES) $(unittest_histogram_SOURCES) \
-	$(unittest_hitset_SOURCES) $(unittest_ipaddr_SOURCES) \
-	$(unittest_lfnindex_SOURCES) \
+	$(unittest_hitset_SOURCES) $(unittest_io_priority_SOURCES) \
+	$(unittest_ipaddr_SOURCES) $(unittest_lfnindex_SOURCES) \
 	$(unittest_libcephfs_config_SOURCES) \
 	$(unittest_librados_SOURCES) \
 	$(unittest_librados_config_SOURCES) $(unittest_log_SOURCES) \
@@ -3219,8 +3253,9 @@ am__noinst_HEADERS_DIST = arch/intel.h arch/neon.h arch/probe.h \
 	rgw/rgw_rest_log.h rgw/rgw_rest_opstate.h \
 	rgw/rgw_rest_replica_log.h rgw/rgw_rest_config.h \
 	rgw/rgw_usage.h rgw/rgw_user.h rgw/rgw_bucket.h \
-	rgw/rgw_keystone.h rgw/rgw_civetweb.h civetweb/civetweb.h \
-	civetweb/include/civetweb.h civetweb/src/md5.h \
+	rgw/rgw_keystone.h rgw/rgw_civetweb.h rgw/rgw_civetweb_log.h \
+	civetweb/civetweb.h civetweb/include/civetweb.h \
+	civetweb/include/civetweb_conf.h civetweb/src/md5.h \
 	cls/lock/cls_lock_types.h cls/lock/cls_lock_ops.h \
 	cls/lock/cls_lock_client.h cls/rbd/cls_rbd.h \
 	cls/rbd/cls_rbd_client.h cls/refcount/cls_refcount_ops.h \
@@ -3485,7 +3520,7 @@ top_build_prefix = @top_build_prefix@
 top_builddir = @top_builddir@
 top_srcdir = @top_srcdir@
 AUTOMAKE_OPTIONS = gnu subdir-objects
-SUBDIRS = ocf java $(am__append_61)
+SUBDIRS = ocf java $(am__append_62)
 DIST_SUBDIRS = gtest ocf libs3 java
 BUILT_SOURCES = init-ceph
 
@@ -3763,8 +3798,9 @@ noinst_HEADERS = arch/intel.h arch/neon.h arch/probe.h \
 	rgw/rgw_rest_log.h rgw/rgw_rest_opstate.h \
 	rgw/rgw_rest_replica_log.h rgw/rgw_rest_config.h \
 	rgw/rgw_usage.h rgw/rgw_user.h rgw/rgw_bucket.h \
-	rgw/rgw_keystone.h rgw/rgw_civetweb.h civetweb/civetweb.h \
-	civetweb/include/civetweb.h civetweb/src/md5.h \
+	rgw/rgw_keystone.h rgw/rgw_civetweb.h rgw/rgw_civetweb_log.h \
+	civetweb/civetweb.h civetweb/include/civetweb.h \
+	civetweb/include/civetweb_conf.h civetweb/src/md5.h \
 	cls/lock/cls_lock_types.h cls/lock/cls_lock_ops.h \
 	cls/lock/cls_lock_client.h cls/rbd/cls_rbd.h \
 	cls/rbd/cls_rbd_client.h cls/refcount/cls_refcount_ops.h \
@@ -3819,7 +3855,7 @@ bin_SCRIPTS = brag/client/ceph-brag ceph ceph-run ceph-rest-api \
 sbin_SCRIPTS = 
 su_sbin_SCRIPTS = mount.fuse.ceph mkcephfs
 dist_bin_SCRIPTS = 
-lib_LTLIBRARIES = librados.la librbd.la libcephfs.la $(am__append_65)
+lib_LTLIBRARIES = librados.la librbd.la libcephfs.la $(am__append_66)
 noinst_LTLIBRARIES = libarch.la libauth.la libcrush.la libmon_types.la \
 	libmon.la libmds.la libos_types.la libos.la libosd_types.la \
 	libosd.la liberasure_code.la libosdc.la libclient.la \
@@ -3827,7 +3863,7 @@ noinst_LTLIBRARIES = libarch.la libauth.la libcrush.la libmon_types.la \
 	libperfglue.la libcommon_crc.la libcommon.la libmsg.la \
 	$(am__append_35) libcls_lock_client.la \
 	libcls_refcount_client.la libcls_rgw_client.la \
-	libcls_rbd_client.la $(am__append_50) libradostest.la
+	libcls_rbd_client.la $(am__append_51) libradostest.la
 noinst_LIBRARIES = $(am__append_17) libcls_version_client.a \
 	libcls_log_client.a libcls_statelog_client.a \
 	libcls_replica_log_client.a libcls_user_client.a
@@ -3842,14 +3878,14 @@ bin_DEBUGPROGRAMS = ceph_test_ioctls $(am__append_38) \
 	ceph_test_signal_handlers ceph_test_rados ceph_test_mutate \
 	ceph_test_rewrite_latency ceph_test_msgr ceph_streamtest \
 	ceph_test_trans ceph_test_crypto ceph_test_keys \
-	$(am__append_48) ceph_smalliobench ceph_smalliobenchfs \
+	$(am__append_49) ceph_smalliobench ceph_smalliobenchfs \
 	ceph_smalliobenchdumb ceph_smalliobenchrbd ceph_tpbench \
-	ceph_omapbench $(am__append_49) ceph_bench_log \
-	$(am__append_52) ceph_multi_stress_watch ceph_test_librbd \
-	$(am__append_53) ceph_test_cls_rbd ceph_test_cls_refcount \
+	ceph_omapbench $(am__append_50) ceph_bench_log \
+	$(am__append_53) ceph_multi_stress_watch ceph_test_librbd \
+	$(am__append_54) ceph_test_cls_rbd ceph_test_cls_refcount \
 	ceph_test_cls_version ceph_test_cls_log ceph_test_cls_statelog \
 	ceph_test_cls_replica_log ceph_test_cls_lock \
-	ceph_test_cls_hello $(am__append_54) ceph_test_mon_workloadgen \
+	ceph_test_cls_hello $(am__append_55) ceph_test_mon_workloadgen \
 	ceph_test_rados_api_cmd ceph_test_rados_api_io \
 	ceph_test_rados_api_c_write_operations \
 	ceph_test_rados_api_c_read_operations ceph_test_rados_api_aio \
@@ -3857,7 +3893,7 @@ bin_DEBUGPROGRAMS = ceph_test_ioctls $(am__append_38) \
 	ceph_test_rados_api_stat ceph_test_rados_api_watch_notify \
 	ceph_test_rados_api_snapshots ceph_test_rados_api_cls \
 	ceph_test_rados_api_misc ceph_test_rados_api_tier \
-	ceph_test_rados_api_lock ceph_test_libcephfs $(am__append_55) \
+	ceph_test_rados_api_lock ceph_test_libcephfs $(am__append_56) \
 	ceph_test_objectstore_workloadgen \
 	ceph_test_filestore_idempotent \
 	ceph_test_filestore_idempotent_sequence ceph_xattr_bench \
@@ -3951,6 +3987,7 @@ LIBCLIENT = libclient.la
 LIBCLIENT_FUSE = libclient_fuse.la
 LIBRADOS = librados.la
 LIBRGW = librgw.la
+LIBCIVETWEB = libcivetweb.la
 LIBRBD = librbd.la
 LIBCEPHFS = libcephfs.la
 LIBERASURE_CODE = liberasure_code.la
@@ -4031,7 +4068,6 @@ libmds_la_SOURCES = \
 	mds/Dumper.cc \
 	mds/Resetter.cc \
 	mds/MDS.cc \
-	mds/flock.cc \
 	mds/locks.c \
 	mds/journal.cc \
 	mds/Server.cc \
@@ -4265,7 +4301,7 @@ libcommon_la_SOURCES = ceph_ver.c common/DecayCounter.cc \
 	common/bloom_filter.cc common/linux_version.c mon/MonCap.cc \
 	mon/MonClient.cc mon/MonMap.cc osd/OSDMap.cc osd/osd_types.cc \
 	osd/ECMsgTypes.cc osd/HitSet.cc mds/MDSMap.cc \
-	mds/inode_backtrace.cc mds/mdstypes.cc
+	mds/inode_backtrace.cc mds/mdstypes.cc mds/flock.cc
 
 # inject crc in common
 libcommon_crc_la_SOURCES = common/sctp_crc32.c common/crc32c.cc \
@@ -4365,6 +4401,14 @@ librbd_la_LDFLAGS = ${AM_LDFLAGS} -version-info 1:0:0 $(am__append_34)
 @WITH_RADOSGW_TRUE@	rgw/rgw_dencoder.cc
 
 @WITH_RADOSGW_TRUE@librgw_la_CXXFLAGS = -Woverloaded-virtual ${AM_CXXFLAGS}
+@WITH_RADOSGW_TRUE@CIVETWEB_INCLUDE = --include civetweb/include/civetweb_conf.h
+@WITH_RADOSGW_TRUE@libcivetweb_la_SOURCES = \
+@WITH_RADOSGW_TRUE@	rgw/rgw_civetweb.cc \
+@WITH_RADOSGW_TRUE@	rgw/rgw_civetweb_log.cc \
+@WITH_RADOSGW_TRUE@	civetweb/src/civetweb.c
+
+@WITH_RADOSGW_TRUE@libcivetweb_la_CXXFLAGS = ${CIVETWEB_INCLUDE} -Woverloaded-virtual ${AM_CXXFLAGS}
+@WITH_RADOSGW_TRUE@libcivetweb_la_CFLAGS = -Icivetweb/include ${CIVETWEB_INCLUDE}
 @WITH_RADOSGW_TRUE@radosgw_SOURCES = \
 @WITH_RADOSGW_TRUE@	rgw/rgw_resolve.cc \
 @WITH_RADOSGW_TRUE@	rgw/rgw_rest.cc \
@@ -4383,12 +4427,10 @@ librbd_la_LDFLAGS = ${AM_LDFLAGS} -version-info 1:0:0 $(am__append_34)
 @WITH_RADOSGW_TRUE@	rgw/rgw_swift.cc \
 @WITH_RADOSGW_TRUE@	rgw/rgw_swift_auth.cc \
 @WITH_RADOSGW_TRUE@	rgw/rgw_loadgen.cc \
-@WITH_RADOSGW_TRUE@	rgw/rgw_civetweb.cc \
-@WITH_RADOSGW_TRUE@	civetweb/src/civetweb.c \
 @WITH_RADOSGW_TRUE@	rgw/rgw_main.cc
 
-@WITH_RADOSGW_TRUE@radosgw_CFLAGS = -Icivetweb/include
-@WITH_RADOSGW_TRUE@radosgw_LDADD = $(LIBRGW) $(LIBRGW_DEPS) $(RESOLV_LIBS) $(CEPH_GLOBAL)
+@WITH_RADOSGW_TRUE@radosgw_CFLAGS = -I$(srcdir)/civetweb/include
+@WITH_RADOSGW_TRUE@radosgw_LDADD = $(LIBRGW) $(LIBCIVETWEB) $(LIBRGW_DEPS) $(RESOLV_LIBS) $(CEPH_GLOBAL)
 @WITH_RADOSGW_TRUE@radosgw_admin_SOURCES = rgw/rgw_admin.cc
 @WITH_RADOSGW_TRUE@radosgw_admin_LDADD = $(LIBRGW) $(LIBRGW_DEPS) $(CEPH_GLOBAL)
 @WITH_RADOSGW_TRUE@ceph_rgw_multiparser_SOURCES = rgw/rgw_multiparser.cc
@@ -4477,11 +4519,16 @@ ceph_erasure_code_benchmark_SOURCES = \
 
 ceph_erasure_code_benchmark_LDADD = $(LIBOSD) $(LIBCOMMON) \
 	$(BOOST_PROGRAM_OPTIONS_LIBS) $(CEPH_GLOBAL) $(am__append_41)
+ceph_erasure_code_non_regression_SOURCES = \
+	test/erasure-code/ceph_erasure_code_non_regression.cc
+
+ceph_erasure_code_non_regression_LDADD = $(LIBOSD) $(LIBCOMMON) \
+	$(BOOST_PROGRAM_OPTIONS_LIBS) $(CEPH_GLOBAL) $(am__append_42)
 ceph_erasure_code_SOURCES = \
 	test/erasure-code/ceph_erasure_code.cc
 
 ceph_erasure_code_LDADD = $(LIBOSD) $(LIBCOMMON) \
-	$(BOOST_PROGRAM_OPTIONS_LIBS) $(CEPH_GLOBAL) $(am__append_42)
+	$(BOOST_PROGRAM_OPTIONS_LIBS) $(CEPH_GLOBAL) $(am__append_43)
 libec_example_la_SOURCES = test/erasure-code/ErasureCodePluginExample.cc
 libec_example_la_CFLAGS = ${AM_CFLAGS}
 libec_example_la_CXXFLAGS = ${AM_CXXFLAGS}
@@ -4525,7 +4572,7 @@ libec_test_jerasure_generic_la_LDFLAGS = ${AM_LDFLAGS} -export-symbols-regex '.*
 unittest_erasure_code_plugin_SOURCES = test/erasure-code/TestErasureCodePlugin.cc 
 unittest_erasure_code_plugin_CXXFLAGS = $(UNITTEST_CXXFLAGS)
 unittest_erasure_code_plugin_LDADD = $(LIBOSD) $(LIBCOMMON) \
-	$(UNITTEST_LDADD) $(CEPH_GLOBAL) $(am__append_43)
+	$(UNITTEST_LDADD) $(CEPH_GLOBAL) $(am__append_44)
 unittest_erasure_code_jerasure_SOURCES = \
 	test/erasure-code/TestErasureCodeJerasure.cc \
 	${jerasure_sources}
@@ -4539,13 +4586,13 @@ unittest_erasure_code_jerasure_CXXFLAGS = $(UNITTEST_CXXFLAGS) \
 	-Ierasure-code/jerasure/jerasure/include
 
 unittest_erasure_code_jerasure_LDADD = $(LIBOSD) $(LIBCOMMON) \
-	$(UNITTEST_LDADD) $(CEPH_GLOBAL) $(am__append_44)
+	$(UNITTEST_LDADD) $(CEPH_GLOBAL) $(am__append_45)
 unittest_erasure_code_plugin_jerasure_SOURCES = \
 	test/erasure-code/TestErasureCodePluginJerasure.cc
 
 unittest_erasure_code_plugin_jerasure_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS}
 unittest_erasure_code_plugin_jerasure_LDADD = $(LIBOSD) $(LIBCOMMON) \
-	$(UNITTEST_LDADD) $(CEPH_GLOBAL) $(am__append_45)
+	$(UNITTEST_LDADD) $(CEPH_GLOBAL) $(am__append_46)
 unittest_erasure_code_example_SOURCES = test/erasure-code/TestErasureCodeExample.cc 
 unittest_erasure_code_example_CXXFLAGS = $(UNITTEST_CXXFLAGS)
 unittest_erasure_code_example_LDADD = $(LIBOSD) $(LIBCOMMON) $(UNITTEST_LDADD) $(CEPH_GLOBAL)
@@ -4588,8 +4635,8 @@ ceph_dencoder_LDADD = \
 
 
 # These should always use explicit _CFLAGS/_CXXFLAGS so avoid basename conflicts
-ceph_dencoder_CFLAGS = ${AM_CFLAGS} $(am__append_46)
-ceph_dencoder_CXXFLAGS = ${AM_CXXFLAGS} $(am__append_47)
+ceph_dencoder_CFLAGS = ${AM_CFLAGS} $(am__append_47)
+ceph_dencoder_CXXFLAGS = ${AM_CXXFLAGS} $(am__append_48)
 get_command_descriptions_SOURCES = test/common/get_command_descriptions.cc
 get_command_descriptions_LDADD = $(LIBMON) $(LIBCOMMON) $(CEPH_GLOBAL)
 
@@ -4792,13 +4839,16 @@ unittest_osd_types_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
 unittest_pglog_SOURCES = test/osd/TestPGLog.cc
 unittest_pglog_CXXFLAGS = $(UNITTEST_CXXFLAGS)
 unittest_pglog_LDADD = $(LIBOSD) $(UNITTEST_LDADD) $(CEPH_GLOBAL) \
-	$(am__append_51)
+	$(am__append_52)
 unittest_ecbackend_SOURCES = test/osd/TestECBackend.cc
 unittest_ecbackend_CXXFLAGS = $(UNITTEST_CXXFLAGS)
 unittest_ecbackend_LDADD = $(LIBOSD) $(UNITTEST_LDADD) $(CEPH_GLOBAL)
 unittest_hitset_SOURCES = test/osd/hitset.cc
 unittest_hitset_CXXFLAGS = $(UNITTEST_CXXFLAGS)
 unittest_hitset_LDADD = $(LIBOSD) $(UNITTEST_LDADD) $(CEPH_GLOBAL)
+unittest_io_priority_SOURCES = test/common/test_io_priority.cc
+unittest_io_priority_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+unittest_io_priority_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
 unittest_gather_SOURCES = test/gather.cc
 unittest_gather_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
 unittest_gather_CXXFLAGS = $(UNITTEST_CXXFLAGS)
@@ -5136,10 +5186,10 @@ ceph_kvstore_tool_LDADD = $(LIBOS) $(CEPH_GLOBAL)
 ceph_kvstore_tool_CXXFLAGS = $(UNITTEST_CXXFLAGS)
 ceph_filestore_tool_SOURCES = tools/ceph_filestore_tool.cc
 ceph_filestore_tool_LDADD = $(LIBOSD) $(LIBOS) $(CEPH_GLOBAL) \
-	-lboost_program_options $(am__append_56)
+	-lboost_program_options $(am__append_57)
 ceph_filestore_dump_SOURCES = tools/ceph_filestore_dump.cc
 ceph_filestore_dump_LDADD = $(LIBOSD) $(LIBOS) $(CEPH_GLOBAL) \
-	$(BOOST_PROGRAM_OPTIONS_LIBS) $(am__append_57)
+	$(BOOST_PROGRAM_OPTIONS_LIBS) $(am__append_58)
 monmaptool_SOURCES = tools/monmaptool.cc
 monmaptool_LDADD = $(CEPH_GLOBAL) $(LIBCOMMON)
 crushtool_SOURCES = tools/crushtool.cc
@@ -5167,7 +5217,7 @@ rados_LDADD = libcls_lock_client.la $(LIBRADOS) $(CEPH_GLOBAL)
 @WITH_REST_BENCH_TRUE@	common/obj_bencher.cc # needs cleanup so \
 @WITH_REST_BENCH_TRUE@	it can go in libcommon.la
 @WITH_REST_BENCH_TRUE@rest_bench_LDADD = $(CEPH_GLOBAL) \
-@WITH_REST_BENCH_TRUE@	$(am__append_59) $(am__append_60)
+@WITH_REST_BENCH_TRUE@	$(am__append_60) $(am__append_61)
 @WITH_REST_BENCH_TRUE@@WITH_SYSTEM_LIBS3_FALSE@rest_bench_CXXFLAGS = ${AM_CXXFLAGS} -I$(top_srcdir)/src/libs3/inc
 ceph_conf_SOURCES = tools/ceph_conf.cc
 ceph_conf_LDADD = $(CEPH_GLOBAL) $(LIBCOMMON)
@@ -5600,6 +5650,26 @@ java/native/libcephfs_jni_la-JniConstants.lo:  \
 	java/native/$(DEPDIR)/$(am__dirstamp)
 libcephfs_jni.la: $(libcephfs_jni_la_OBJECTS) $(libcephfs_jni_la_DEPENDENCIES) $(EXTRA_libcephfs_jni_la_DEPENDENCIES) 
 	$(AM_V_CXXLD)$(libcephfs_jni_la_LINK) $(am_libcephfs_jni_la_rpath) $(libcephfs_jni_la_OBJECTS) $(libcephfs_jni_la_LIBADD) $(LIBS)
+rgw/$(am__dirstamp):
+	@$(MKDIR_P) rgw
+	@: > rgw/$(am__dirstamp)
+rgw/$(DEPDIR)/$(am__dirstamp):
+	@$(MKDIR_P) rgw/$(DEPDIR)
+	@: > rgw/$(DEPDIR)/$(am__dirstamp)
+rgw/libcivetweb_la-rgw_civetweb.lo: rgw/$(am__dirstamp) \
+	rgw/$(DEPDIR)/$(am__dirstamp)
+rgw/libcivetweb_la-rgw_civetweb_log.lo: rgw/$(am__dirstamp) \
+	rgw/$(DEPDIR)/$(am__dirstamp)
+civetweb/src/$(am__dirstamp):
+	@$(MKDIR_P) civetweb/src
+	@: > civetweb/src/$(am__dirstamp)
+civetweb/src/$(DEPDIR)/$(am__dirstamp):
+	@$(MKDIR_P) civetweb/src/$(DEPDIR)
+	@: > civetweb/src/$(DEPDIR)/$(am__dirstamp)
+civetweb/src/libcivetweb_la-civetweb.lo: civetweb/src/$(am__dirstamp) \
+	civetweb/src/$(DEPDIR)/$(am__dirstamp)
+libcivetweb.la: $(libcivetweb_la_OBJECTS) $(libcivetweb_la_DEPENDENCIES) $(EXTRA_libcivetweb_la_DEPENDENCIES) 
+	$(AM_V_CXXLD)$(libcivetweb_la_LINK) $(am_libcivetweb_la_rpath) $(libcivetweb_la_OBJECTS) $(libcivetweb_la_LIBADD) $(LIBS)
 client/$(am__dirstamp):
 	@$(MKDIR_P) client
 	@: > client/$(am__dirstamp)
@@ -5907,6 +5977,7 @@ mds/MDSMap.lo: mds/$(am__dirstamp) mds/$(DEPDIR)/$(am__dirstamp)
 mds/inode_backtrace.lo: mds/$(am__dirstamp) \
 	mds/$(DEPDIR)/$(am__dirstamp)
 mds/mdstypes.lo: mds/$(am__dirstamp) mds/$(DEPDIR)/$(am__dirstamp)
+mds/flock.lo: mds/$(am__dirstamp) mds/$(DEPDIR)/$(am__dirstamp)
 libcommon.la: $(libcommon_la_OBJECTS) $(libcommon_la_DEPENDENCIES) $(EXTRA_libcommon_la_DEPENDENCIES) 
 	$(AM_V_CXXLD)$(CXXLINK)  $(libcommon_la_OBJECTS) $(libcommon_la_LIBADD) $(LIBS)
 common/libcommon_crc_la-sctp_crc32.lo: common/$(am__dirstamp) \
@@ -6232,7 +6303,6 @@ mds/Capability.lo: mds/$(am__dirstamp) mds/$(DEPDIR)/$(am__dirstamp)
 mds/Dumper.lo: mds/$(am__dirstamp) mds/$(DEPDIR)/$(am__dirstamp)
 mds/Resetter.lo: mds/$(am__dirstamp) mds/$(DEPDIR)/$(am__dirstamp)
 mds/MDS.lo: mds/$(am__dirstamp) mds/$(DEPDIR)/$(am__dirstamp)
-mds/flock.lo: mds/$(am__dirstamp) mds/$(DEPDIR)/$(am__dirstamp)
 mds/locks.lo: mds/$(am__dirstamp) mds/$(DEPDIR)/$(am__dirstamp)
 mds/journal.lo: mds/$(am__dirstamp) mds/$(DEPDIR)/$(am__dirstamp)
 mds/Server.lo: mds/$(am__dirstamp) mds/$(DEPDIR)/$(am__dirstamp)
@@ -6477,12 +6547,6 @@ librbd/WatchCtx.lo: librbd/$(am__dirstamp) \
 	librbd/$(DEPDIR)/$(am__dirstamp)
 librbd.la: $(librbd_la_OBJECTS) $(librbd_la_DEPENDENCIES) $(EXTRA_librbd_la_DEPENDENCIES) 
 	$(AM_V_CXXLD)$(librbd_la_LINK) -rpath $(libdir) $(librbd_la_OBJECTS) $(librbd_la_LIBADD) $(LIBS)
-rgw/$(am__dirstamp):
-	@$(MKDIR_P) rgw
-	@: > rgw/$(am__dirstamp)
-rgw/$(DEPDIR)/$(am__dirstamp):
-	@$(MKDIR_P) rgw/$(DEPDIR)
-	@: > rgw/$(DEPDIR)/$(am__dirstamp)
 rgw/librgw_la-librgw.lo: rgw/$(am__dirstamp) \
 	rgw/$(DEPDIR)/$(am__dirstamp)
 rgw/librgw_la-rgw_acl.lo: rgw/$(am__dirstamp) \
@@ -6817,6 +6881,12 @@ test/erasure-code/ceph_erasure_code_benchmark.$(OBJEXT):  \
 ceph_erasure_code_benchmark$(EXEEXT): $(ceph_erasure_code_benchmark_OBJECTS) $(ceph_erasure_code_benchmark_DEPENDENCIES) $(EXTRA_ceph_erasure_code_benchmark_DEPENDENCIES) 
 	@rm -f ceph_erasure_code_benchmark$(EXEEXT)
 	$(AM_V_CXXLD)$(CXXLINK) $(ceph_erasure_code_benchmark_OBJECTS) $(ceph_erasure_code_benchmark_LDADD) $(LIBS)
+test/erasure-code/ceph_erasure_code_non_regression.$(OBJEXT):  \
+	test/erasure-code/$(am__dirstamp) \
+	test/erasure-code/$(DEPDIR)/$(am__dirstamp)
+ceph_erasure_code_non_regression$(EXEEXT): $(ceph_erasure_code_non_regression_OBJECTS) $(ceph_erasure_code_non_regression_DEPENDENCIES) $(EXTRA_ceph_erasure_code_non_regression_DEPENDENCIES) 
+	@rm -f ceph_erasure_code_non_regression$(EXEEXT)
+	$(AM_V_CXXLD)$(CXXLINK) $(ceph_erasure_code_non_regression_OBJECTS) $(ceph_erasure_code_non_regression_LDADD) $(LIBS)
 tools/ceph_filestore_dump.$(OBJEXT): tools/$(am__dirstamp) \
 	tools/$(DEPDIR)/$(am__dirstamp)
 ceph_filestore_dump$(EXEEXT): $(ceph_filestore_dump_OBJECTS) $(ceph_filestore_dump_DEPENDENCIES) $(EXTRA_ceph_filestore_dump_DEPENDENCIES) 
@@ -7533,16 +7603,6 @@ rgw/rgw_swift_auth.$(OBJEXT): rgw/$(am__dirstamp) \
 	rgw/$(DEPDIR)/$(am__dirstamp)
 rgw/rgw_loadgen.$(OBJEXT): rgw/$(am__dirstamp) \
 	rgw/$(DEPDIR)/$(am__dirstamp)
-rgw/rgw_civetweb.$(OBJEXT): rgw/$(am__dirstamp) \
-	rgw/$(DEPDIR)/$(am__dirstamp)
-civetweb/src/$(am__dirstamp):
-	@$(MKDIR_P) civetweb/src
-	@: > civetweb/src/$(am__dirstamp)
-civetweb/src/$(DEPDIR)/$(am__dirstamp):
-	@$(MKDIR_P) civetweb/src/$(DEPDIR)
-	@: > civetweb/src/$(DEPDIR)/$(am__dirstamp)
-civetweb/src/radosgw-civetweb.$(OBJEXT): civetweb/src/$(am__dirstamp) \
-	civetweb/src/$(DEPDIR)/$(am__dirstamp)
 rgw/rgw_main.$(OBJEXT): rgw/$(am__dirstamp) \
 	rgw/$(DEPDIR)/$(am__dirstamp)
 radosgw$(EXEEXT): $(radosgw_OBJECTS) $(radosgw_DEPENDENCIES) $(EXTRA_radosgw_DEPENDENCIES) 
@@ -7753,6 +7813,8 @@ mds/test_build_libcommon-inode_backtrace.$(OBJEXT):  \
 	mds/$(am__dirstamp) mds/$(DEPDIR)/$(am__dirstamp)
 mds/test_build_libcommon-mdstypes.$(OBJEXT): mds/$(am__dirstamp) \
 	mds/$(DEPDIR)/$(am__dirstamp)
+mds/test_build_libcommon-flock.$(OBJEXT): mds/$(am__dirstamp) \
+	mds/$(DEPDIR)/$(am__dirstamp)
 test_build_libcommon$(EXEEXT): $(test_build_libcommon_OBJECTS) $(test_build_libcommon_DEPENDENCIES) $(EXTRA_test_build_libcommon_DEPENDENCIES) 
 	@rm -f test_build_libcommon$(EXEEXT)
 	$(AM_V_CXXLD)$(test_build_libcommon_LINK) $(test_build_libcommon_OBJECTS) $(test_build_libcommon_LDADD) $(LIBS)
@@ -8082,6 +8144,12 @@ test/osd/unittest_hitset-hitset.$(OBJEXT): test/osd/$(am__dirstamp) \
 unittest_hitset$(EXEEXT): $(unittest_hitset_OBJECTS) $(unittest_hitset_DEPENDENCIES) $(EXTRA_unittest_hitset_DEPENDENCIES) 
 	@rm -f unittest_hitset$(EXEEXT)
 	$(AM_V_CXXLD)$(unittest_hitset_LINK) $(unittest_hitset_OBJECTS) $(unittest_hitset_LDADD) $(LIBS)
+test/common/unittest_io_priority-test_io_priority.$(OBJEXT):  \
+	test/common/$(am__dirstamp) \
+	test/common/$(DEPDIR)/$(am__dirstamp)
+unittest_io_priority$(EXEEXT): $(unittest_io_priority_OBJECTS) $(unittest_io_priority_DEPENDENCIES) $(EXTRA_unittest_io_priority_DEPENDENCIES) 
+	@rm -f unittest_io_priority$(EXEEXT)
+	$(AM_V_CXXLD)$(unittest_io_priority_LINK) $(unittest_io_priority_OBJECTS) $(unittest_io_priority_LDADD) $(LIBS)
 test/unittest_ipaddr-test_ipaddr.$(OBJEXT): test/$(am__dirstamp) \
 	test/$(DEPDIR)/$(am__dirstamp)
 unittest_ipaddr$(EXEEXT): $(unittest_ipaddr_OBJECTS) $(unittest_ipaddr_DEPENDENCIES) $(EXTRA_unittest_ipaddr_DEPENDENCIES) 
@@ -8472,7 +8540,8 @@ mostlyclean-compile:
 	-rm -f auth/none/AuthNoneAuthorizeHandler.lo
 	-rm -f auth/unknown/AuthUnknownAuthorizeHandler.$(OBJEXT)
 	-rm -f auth/unknown/AuthUnknownAuthorizeHandler.lo
-	-rm -f civetweb/src/radosgw-civetweb.$(OBJEXT)
+	-rm -f civetweb/src/libcivetweb_la-civetweb.$(OBJEXT)
+	-rm -f civetweb/src/libcivetweb_la-civetweb.lo
 	-rm -f client/Client.$(OBJEXT)
 	-rm -f client/Client.lo
 	-rm -f client/ClientSnapRealm.$(OBJEXT)
@@ -9032,6 +9101,7 @@ mostlyclean-compile:
 	-rm -f mds/snap.$(OBJEXT)
 	-rm -f mds/snap.lo
 	-rm -f mds/test_build_libcommon-MDSMap.$(OBJEXT)
+	-rm -f mds/test_build_libcommon-flock.$(OBJEXT)
 	-rm -f mds/test_build_libcommon-inode_backtrace.$(OBJEXT)
 	-rm -f mds/test_build_libcommon-mdstypes.$(OBJEXT)
 	-rm -f mon/AuthMonitor.$(OBJEXT)
@@ -9209,6 +9279,10 @@ mostlyclean-compile:
 	-rm -f rgw/ceph_dencoder-rgw_dencoder.$(OBJEXT)
 	-rm -f rgw/ceph_dencoder-rgw_env.$(OBJEXT)
 	-rm -f rgw/ceph_dencoder-rgw_json_enc.$(OBJEXT)
+	-rm -f rgw/libcivetweb_la-rgw_civetweb.$(OBJEXT)
+	-rm -f rgw/libcivetweb_la-rgw_civetweb.lo
+	-rm -f rgw/libcivetweb_la-rgw_civetweb_log.$(OBJEXT)
+	-rm -f rgw/libcivetweb_la-rgw_civetweb_log.lo
 	-rm -f rgw/librgw_la-librgw.$(OBJEXT)
 	-rm -f rgw/librgw_la-librgw.lo
 	-rm -f rgw/librgw_la-rgw_acl.$(OBJEXT)
@@ -9278,7 +9352,6 @@ mostlyclean-compile:
 	-rm -f rgw/librgw_la-rgw_xml.$(OBJEXT)
 	-rm -f rgw/librgw_la-rgw_xml.lo
 	-rm -f rgw/rgw_admin.$(OBJEXT)
-	-rm -f rgw/rgw_civetweb.$(OBJEXT)
 	-rm -f rgw/rgw_common.$(OBJEXT)
 	-rm -f rgw/rgw_env.$(OBJEXT)
 	-rm -f rgw/rgw_http_client.$(OBJEXT)
@@ -9380,6 +9453,7 @@ mostlyclean-compile:
 	-rm -f test/common/unittest_context-test_context.$(OBJEXT)
 	-rm -f test/common/unittest_crc32c-test_crc32c.$(OBJEXT)
 	-rm -f test/common/unittest_histogram-histogram.$(OBJEXT)
+	-rm -f test/common/unittest_io_priority-test_io_priority.$(OBJEXT)
 	-rm -f test/common/unittest_sharedptr_registry-test_sharedptr_registry.$(OBJEXT)
 	-rm -f test/common/unittest_sloppy_crc_map-test_sloppy_crc_map.$(OBJEXT)
 	-rm -f test/common/unittest_str_map-test_str_map.$(OBJEXT)
@@ -9390,6 +9464,7 @@ mostlyclean-compile:
 	-rm -f test/encoding/ceph_dencoder-ceph_dencoder.$(OBJEXT)
 	-rm -f test/erasure-code/ceph_erasure_code.$(OBJEXT)
 	-rm -f test/erasure-code/ceph_erasure_code_benchmark.$(OBJEXT)
+	-rm -f test/erasure-code/ceph_erasure_code_non_regression.$(OBJEXT)
 	-rm -f test/erasure-code/libec_example_la-ErasureCodePluginExample.$(OBJEXT)
 	-rm -f test/erasure-code/libec_example_la-ErasureCodePluginExample.lo
 	-rm -f test/erasure-code/libec_fail_to_initialize_la-ErasureCodePluginFailToInitialize.$(OBJEXT)
@@ -9584,7 +9659,7 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@auth/cephx/$(DEPDIR)/CephxSessionHandler.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@auth/none/$(DEPDIR)/AuthNoneAuthorizeHandler.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@auth/unknown/$(DEPDIR)/AuthUnknownAuthorizeHandler.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@civetweb/src/$(DEPDIR)/radosgw-civetweb.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@civetweb/src/$(DEPDIR)/libcivetweb_la-civetweb.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@client/$(DEPDIR)/Client.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@client/$(DEPDIR)/ClientSnapRealm.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@client/$(DEPDIR)/Dentry.Plo@am__quote@
@@ -9920,6 +9995,7 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@mds/$(DEPDIR)/mdstypes.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@mds/$(DEPDIR)/snap.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@mds/$(DEPDIR)/test_build_libcommon-MDSMap.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@mds/$(DEPDIR)/test_build_libcommon-flock.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@mds/$(DEPDIR)/test_build_libcommon-inode_backtrace.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@mds/$(DEPDIR)/test_build_libcommon-mdstypes.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@mon/$(DEPDIR)/AuthMonitor.Plo@am__quote@
@@ -10020,6 +10096,8 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@rgw/$(DEPDIR)/ceph_dencoder-rgw_dencoder.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@rgw/$(DEPDIR)/ceph_dencoder-rgw_env.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@rgw/$(DEPDIR)/ceph_dencoder-rgw_json_enc.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@rgw/$(DEPDIR)/libcivetweb_la-rgw_civetweb.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@rgw/$(DEPDIR)/libcivetweb_la-rgw_civetweb_log.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@rgw/$(DEPDIR)/librgw_la-librgw.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@rgw/$(DEPDIR)/librgw_la-rgw_acl.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@rgw/$(DEPDIR)/librgw_la-rgw_acl_s3.Plo@am__quote@
@@ -10055,7 +10133,6 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@rgw/$(DEPDIR)/librgw_la-rgw_user.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@rgw/$(DEPDIR)/librgw_la-rgw_xml.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@rgw/$(DEPDIR)/rgw_admin.Po@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@rgw/$(DEPDIR)/rgw_civetweb.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@rgw/$(DEPDIR)/rgw_common.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@rgw/$(DEPDIR)/rgw_env.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@rgw/$(DEPDIR)/rgw_http_client.Po@am__quote@
@@ -10206,6 +10283,7 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@test/common/$(DEPDIR)/unittest_context-test_context.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@test/common/$(DEPDIR)/unittest_crc32c-test_crc32c.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@test/common/$(DEPDIR)/unittest_histogram-histogram.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@test/common/$(DEPDIR)/unittest_io_priority-test_io_priority.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@test/common/$(DEPDIR)/unittest_sharedptr_registry-test_sharedptr_registry.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@test/common/$(DEPDIR)/unittest_sloppy_crc_map-test_sloppy_crc_map.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@test/common/$(DEPDIR)/unittest_str_map-test_str_map.Po@am__quote@
@@ -10216,6 +10294,7 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@test/encoding/$(DEPDIR)/ceph_dencoder-ceph_dencoder.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@test/erasure-code/$(DEPDIR)/ceph_erasure_code.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@test/erasure-code/$(DEPDIR)/ceph_erasure_code_benchmark.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@test/erasure-code/$(DEPDIR)/ceph_erasure_code_non_regression.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@test/erasure-code/$(DEPDIR)/libec_example_la-ErasureCodePluginExample.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@test/erasure-code/$(DEPDIR)/libec_fail_to_initialize_la-ErasureCodePluginFailToInitialize.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@test/erasure-code/$(DEPDIR)/libec_fail_to_register_la-ErasureCodePluginFailToRegister.Plo@am__quote@
@@ -10377,6 +10456,13 @@ common/libcommon_crc_la-crc32c_intel_fast_zero_asm.lo: common/crc32c_intel_fast_
 @AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LTCOMPILE) -c -o $@ $<
 
+civetweb/src/libcivetweb_la-civetweb.lo: civetweb/src/civetweb.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libcivetweb_la_CFLAGS) $(CFLAGS) -MT civetweb/src/libcivetweb_la-civetweb.lo -MD -MP -MF civetweb/src/$(DEPDIR)/libcivetweb_la-civetweb.Tpo -c -o civetweb/src/libcivetweb_la-civetweb.lo `test -f 'civetweb/src/civetweb.c' || echo '$(srcdir)/'`civetweb/src/civetweb.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) civetweb/src/$(DEPDIR)/libcivetweb_la-civetweb.Tpo civetweb/src/$(DEPDIR)/libcivetweb_la-civetweb.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='civetweb/src/civetweb.c' object='civetweb/src/libcivetweb_la-civetweb.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libcivetweb_la_CFLAGS) $(CFLAGS) -c -o civetweb/src/libcivetweb_la-civetweb.lo `test -f 'civetweb/src/civetweb.c' || echo '$(srcdir)/'`civetweb/src/civetweb.c
+
 common/libcommon_crc_la-sctp_crc32.lo: common/sctp_crc32.c
 @am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(libcommon_crc_la_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT common/libcommon_crc_la-sctp_crc32.lo -MD -MP -MF common/$(DEPDIR)/libcommon_crc_la-sctp_crc32.Tpo -c -o common/libcommon_crc_la-sctp_crc32.lo `test -f 'common/sctp_crc32.c' || echo '$(srcdir)/'`common/sctp_crc32.c
 @am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) common/$(DEPDIR)/libcommon_crc_la-sctp_crc32.Tpo common/$(DEPDIR)/libcommon_crc_la-sctp_crc32.Plo
@@ -10748,20 +10834,6 @@ test/librbd/ceph_test_librbd_fsx-fsx.obj: test/librbd/fsx.c
 @AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ceph_test_librbd_fsx_CFLAGS) $(CFLAGS) -c -o test/librbd/ceph_test_librbd_fsx-fsx.obj `if test -f 'test/librbd/fsx.c'; then $(CYGPATH_W) 'test/librbd/fsx.c'; else $(CYGPATH_W) '$(srcdir)/test/librbd/fsx.c'; fi`
 
-civetweb/src/radosgw-civetweb.o: civetweb/src/civetweb.c
-@am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(radosgw_CFLAGS) $(CFLAGS) -MT civetweb/src/radosgw-civetweb.o -MD -MP -MF civetweb/src/$(DEPDIR)/radosgw-civetweb.Tpo -c -o civetweb/src/radosgw-civetweb.o `test -f 'civetweb/src/civetweb.c' || echo '$(srcdir)/'`civetweb/src/civetweb.c
-@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) civetweb/src/$(DEPDIR)/radosgw-civetweb.Tpo civetweb/src/$(DEPDIR)/radosgw-civetweb.Po
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='civetweb/src/civetweb.c' object='civetweb/src/radosgw-civetweb.o' libtool=no @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(radosgw_CFLAGS) $(CFLAGS) -c -o civetweb/src/radosgw-civetweb.o `test -f 'civetweb/src/civetweb.c' || echo '$(srcdir)/'`civetweb/src/civetweb.c
-
-civetweb/src/radosgw-civetweb.obj: civetweb/src/civetweb.c
-@am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(radosgw_CFLAGS) $(CFLAGS) -MT civetweb/src/radosgw-civetweb.obj -MD -MP -MF civetweb/src/$(DEPDIR)/radosgw-civetweb.Tpo -c -o civetweb/src/radosgw-civetweb.obj `if test -f 'civetweb/src/civetweb.c'; then $(CYGPATH_W) 'civetweb/src/civetweb.c'; else $(CYGPATH_W) '$(srcdir)/civetweb/src/civetweb.c'; fi`
-@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) civetweb/src/$(DEPDIR)/radosgw-civetweb.Tpo civetweb/src/$(DEPDIR)/radosgw-civetweb.Po
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='civetweb/src/civetweb.c' object='civetweb/src/radosgw-civetweb.obj' libtool=no @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(radosgw_CFLAGS) $(CFLAGS) -c -o civetweb/src/radosgw-civetweb.obj `if test -f 'civetweb/src/civetweb.c'; then $(CYGPATH_W) 'civetweb/src/civetweb.c'; else $(CYGPATH_W) '$(srcdir)/civetweb/src/civetweb.c'; fi`
-
 test_build_libcommon-ceph_ver.o: ceph_ver.c
 @am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_build_libcommon_CFLAGS) $(CFLAGS) -MT test_build_libcommon-ceph_ver.o -MD -MP -MF $(DEPDIR)/test_build_libcommon-ceph_ver.Tpo -c -o test_build_libcommon-ceph_ver.o `test -f 'ceph_ver.c' || echo '$(srcdir)/'`ceph_ver.c
 @am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/test_build_libcommon-ceph_ver.Tpo $(DEPDIR)/test_build_libcommon-ceph_ver.Po
@@ -11178,6 +11250,20 @@ java/native/libcephfs_jni_la-JniConstants.lo: java/native/JniConstants.cpp
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libcephfs_jni_la_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o java/native/libcephfs_jni_la-JniConstants.lo `test -f 'java/native/JniConstants.cpp' || echo '$(srcdir)/'`java/native/JniConstants.cpp
 
+rgw/libcivetweb_la-rgw_civetweb.lo: rgw/rgw_civetweb.cc
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libcivetweb_la_CXXFLAGS) $(CXXFLAGS) -MT rgw/libcivetweb_la-rgw_civetweb.lo -MD -MP -MF rgw/$(DEPDIR)/libcivetweb_la-rgw_civetweb.Tpo -c -o rgw/libcivetweb_la-rgw_civetweb.lo `test -f 'rgw/rgw_civetweb.cc' || echo '$(srcdir)/'`rgw/rgw_civetweb.cc
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) rgw/$(DEPDIR)/libcivetweb_la-rgw_civetweb.Tpo rgw/$(DEPDIR)/libcivetweb_la-rgw_civetweb.Plo
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='rgw/rgw_civetweb.cc' object='rgw/libcivetweb_la-rgw_civetweb.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libcivetweb_la_CXXFLAGS) $(CXXFLAGS) -c -o rgw/libcivetweb_la-rgw_civetweb.lo `test -f 'rgw/rgw_civetweb.cc' || echo '$(srcdir)/'`rgw/rgw_civetweb.cc
+
+rgw/libcivetweb_la-rgw_civetweb_log.lo: rgw/rgw_civetweb_log.cc
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libcivetweb_la_CXXFLAGS) $(CXXFLAGS) -MT rgw/libcivetweb_la-rgw_civetweb_log.lo -MD -MP -MF rgw/$(DEPDIR)/libcivetweb_la-rgw_civetweb_log.Tpo -c -o rgw/libcivetweb_la-rgw_civetweb_log.lo `test -f 'rgw/rgw_civetweb_log.cc' || echo '$(srcdir)/'`rgw/rgw_civetweb_log.cc
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) rgw/$(DEPDIR)/libcivetweb_la-rgw_civetweb_log.Tpo rgw/$(DEPDIR)/libcivetweb_la-rgw_civetweb_log.Plo
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='rgw/rgw_civetweb_log.cc' object='rgw/libcivetweb_la-rgw_civetweb_log.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libcivetweb_la_CXXFLAGS) $(CXXFLAGS) -c -o rgw/libcivetweb_la-rgw_civetweb_log.lo `test -f 'rgw/rgw_civetweb_log.cc' || echo '$(srcdir)/'`rgw/rgw_civetweb_log.cc
+
 common/libcommon_crc_la-crc32c.lo: common/crc32c.cc
 @am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(libcommon_crc_la_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT common/libcommon_crc_la-crc32c.lo -MD -MP -MF common/$(DEPDIR)/libcommon_crc_la-crc32c.Tpo -c -o common/libcommon_crc_la-crc32c.lo `test -f 'common/crc32c.cc' || echo '$(srcdir)/'`common/crc32c.cc
 @am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) common/$(DEPDIR)/libcommon_crc_la-crc32c.Tpo common/$(DEPDIR)/libcommon_crc_la-crc32c.Plo
@@ -13705,6 +13791,20 @@ mds/test_build_libcommon-mdstypes.obj: mds/mdstypes.cc
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_build_libcommon_CXXFLAGS) $(CXXFLAGS) -c -o mds/test_build_libcommon-mdstypes.obj `if test -f 'mds/mdstypes.cc'; then $(CYGPATH_W) 'mds/mdstypes.cc'; else $(CYGPATH_W) '$(srcdir)/mds/mdstypes.cc'; fi`
 
+mds/test_build_libcommon-flock.o: mds/flock.cc
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_build_libcommon_CXXFLAGS) $(CXXFLAGS) -MT mds/test_build_libcommon-flock.o -MD -MP -MF mds/$(DEPDIR)/test_build_libcommon-flock.Tpo -c -o mds/test_build_libcommon-flock.o `test -f 'mds/flock.cc' || echo '$(srcdir)/'`mds/flock.cc
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) mds/$(DEPDIR)/test_build_libcommon-flock.Tpo mds/$(DEPDIR)/test_build_libcommon-flock.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='mds/flock.cc' object='mds/test_build_libcommon-flock.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_build_libcommon_CXXFLAGS) $(CXXFLAGS) -c -o mds/test_build_libcommon-flock.o `test -f 'mds/flock.cc' || echo '$(srcdir)/'`mds/flock.cc
+
+mds/test_build_libcommon-flock.obj: mds/flock.cc
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_build_libcommon_CXXFLAGS) $(CXXFLAGS) -MT mds/test_build_libcommon-flock.obj -MD -MP -MF mds/$(DEPDIR)/test_build_libcommon-flock.Tpo -c -o mds/test_build_libcommon-flock.obj `if test -f 'mds/flock.cc'; then $(CYGPATH_W) 'mds/flock.cc'; else $(CYGPATH_W) '$(srcdir)/mds/flock.cc'; fi`
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) mds/$(DEPDIR)/test_build_libcommon-flock.Tpo mds/$(DEPDIR)/test_build_libcommon-flock.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='mds/flock.cc' object='mds/test_build_libcommon-flock.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_build_libcommon_CXXFLAGS) $(CXXFLAGS) -c -o mds/test_build_libcommon-flock.obj `if test -f 'mds/flock.cc'; then $(CYGPATH_W) 'mds/flock.cc'; else $(CYGPATH_W) '$(srcdir)/mds/flock.cc'; fi`
+
 test/test_build_librados-buildtest_skeleton.o: test/buildtest_skeleton.cc
 @am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_build_librados_CXXFLAGS) $(CXXFLAGS) -MT test/test_build_librados-buildtest_skeleton.o -MD -MP -MF test/$(DEPDIR)/test_build_librados-buildtest_skeleton.Tpo -c -o test/test_build_librados-buildtest_skeleton.o `test -f 'test/buildtest_skeleton.cc' || echo '$(srcdir)/'`test/buildtest_skeleton.cc
 @am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/$(DEPDIR)/test_build_librados-buildtest_skeleton.Tpo test/$(DEPDIR)/test_build_librados-buildtest_skeleton.Po
@@ -14755,6 +14855,20 @@ test/osd/unittest_hitset-hitset.obj: test/osd/hitset.cc
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_hitset_CXXFLAGS) $(CXXFLAGS) -c -o test/osd/unittest_hitset-hitset.obj `if test -f 'test/osd/hitset.cc'; then $(CYGPATH_W) 'test/osd/hitset.cc'; else $(CYGPATH_W) '$(srcdir)/test/osd/hitset.cc'; fi`
 
+test/common/unittest_io_priority-test_io_priority.o: test/common/test_io_priority.cc
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_io_priority_CXXFLAGS) $(CXXFLAGS) -MT test/common/unittest_io_priority-test_io_priority.o -MD -MP -MF test/common/$(DEPDIR)/unittest_io_priority-test_io_priority.Tpo -c -o test/common/unittest_io_priority-test_io_priority.o `test -f 'test/common/test_io_priority.cc' || echo '$(srcdir)/'`test/common/test_io_priority.cc
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/common/$(DEPDIR)/unittest_io_priority-test_io_priority.Tpo test/common/$(DEPDIR)/unittest_io_priority-test_io_priority.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/common/test_io_priority.cc' object='test/common/unittest_io_priority-test_io_priority.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_io_priority_CXXFLAGS) $(CXXFLAGS) -c -o test/common/unittest_io_priority-test_io_priority.o `test -f 'test/common/test_io_priority.cc' || echo '$(srcdir)/'`test/common/test_io_priority.cc
+
+test/common/unittest_io_priority-test_io_priority.obj: test/common/test_io_priority.cc
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_io_priority_CXXFLAGS) $(CXXFLAGS) -MT test/common/unittest_io_priority-test_io_priority.obj -MD -MP -MF test/common/$(DEPDIR)/unittest_io_priority-test_io_priority.Tpo -c -o test/common/unittest_io_priority-test_io_priority.obj `if test -f 'test/common/test_io_priority.cc'; then $(CYGPATH_W) 'test/common/test_io_priority.cc'; else $(CYGPATH_W) '$(srcdir)/test/common/test_ [...]
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/common/$(DEPDIR)/unittest_io_priority-test_io_priority.Tpo test/common/$(DEPDIR)/unittest_io_priority-test_io_priority.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/common/test_io_priority.cc' object='test/common/unittest_io_priority-test_io_priority.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_io_priority_CXXFLAGS) $(CXXFLAGS) -c -o test/common/unittest_io_priority-test_io_priority.obj `if test -f 'test/common/test_io_priority.cc'; then $(CYGPATH_W) 'test/common/test_io_priority.cc'; else $(CYGPATH_W) '$(srcdir)/test/common/test_io_priority.cc'; fi`
+
 test/unittest_ipaddr-test_ipaddr.o: test/test_ipaddr.cc
 @am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_ipaddr_CXXFLAGS) $(CXXFLAGS) -MT test/unittest_ipaddr-test_ipaddr.o -MD -MP -MF test/$(DEPDIR)/unittest_ipaddr-test_ipaddr.Tpo -c -o test/unittest_ipaddr-test_ipaddr.o `test -f 'test/test_ipaddr.cc' || echo '$(srcdir)/'`test/test_ipaddr.cc
 @am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/$(DEPDIR)/unittest_ipaddr-test_ipaddr.Tpo test/$(DEPDIR)/unittest_ipaddr-test_ipaddr.Po
@@ -15195,6 +15309,7 @@ clean-libtool:
 	-rm -rf auth/cephx/.libs auth/cephx/_libs
 	-rm -rf auth/none/.libs auth/none/_libs
 	-rm -rf auth/unknown/.libs auth/unknown/_libs
+	-rm -rf civetweb/src/.libs civetweb/src/_libs
 	-rm -rf client/.libs client/_libs
 	-rm -rf cls/hello/.libs cls/hello/_libs
 	-rm -rf cls/lock/.libs cls/lock/_libs
@@ -15994,6 +16109,7 @@ $(shell_scripts): %: %.in
 docdir ?= ${datadir}/doc/ceph
 
 check-local:
+	$(top_srcdir)/qa/workunits/erasure-code/encode-decode-non-regression.sh 
 	$(srcdir)/test/encoding/readable.sh ../ceph-object-corpus
 
 # base targets
diff --git a/src/ceph-disk b/src/ceph-disk
index 5d6071d..6bd0220 100755
--- a/src/ceph-disk
+++ b/src/ceph-disk
@@ -89,7 +89,7 @@ MOUNT_OPTIONS = dict(
     # issues with ext4 before the xatts-in-leveldb work, and it seemed
     # that user_xattr helped
     ext4='noatime,user_xattr',
-    xfs='noatime',
+    xfs='noatime,inode64',
     )
 
 MKFS_ARGS = dict(
@@ -791,11 +791,13 @@ def get_or_create_dmcrypt_key(
     # make a new key
     try:
         if not os.path.exists(key_dir):
-            os.makedirs(key_dir)
+            os.makedirs(key_dir, stat.S_IRUSR|stat.S_IWUSR|stat.S_IXUSR)
         with file('/dev/urandom', 'rb') as i:
             key = i.read(256)
-            with file(path, 'wb') as key_file:
-                key_file.write(key)
+            fd = os.open(path, os.O_WRONLY|os.O_CREAT,
+                         stat.S_IRUSR|stat.S_IWUSR)
+            with os.fdopen(fd, 'wb') as key_file:  # closes fd even on error
+                key_file.write(key)  # not inside assert: survives python -O
         return path
     except:
         raise Error('unable to read or create dm-crypt key', path)
@@ -968,6 +970,35 @@ def get_free_partition_index(dev):
     return num
 
 
+def update_partition(action, dev, description):
+     # try to make sure the kernel refreshes the table.  note
+     # that if this gets ebusy, we are probably racing with
+     # udev because it already updated it.. ignore failure here.
+
+     # On RHEL and CentOS distros, calling partprobe forces a reboot of the
+     # server. Since we are not resizing partitions, we rely on calling
+     # partx instead.
+     if platform_distro().startswith(('centos', 'red', 'scientific')):
+         LOG.info('calling partx on %s device %s', description, dev)
+         LOG.info('re-reading known partitions will display errors')
+         command(
+             [
+                 'partx',
+                 action,
+                 dev,
+             ],
+         )
+
+     else:
+         LOG.debug('Calling partprobe on %s device %s', description, dev)
+         command(
+             [
+                 'partprobe',
+                 dev,
+             ],
+         )
+
+
 def zap(dev):
     """
     Destroy the partition table and content of a given disk.
@@ -993,6 +1024,9 @@ def zap(dev):
                 dev,
             ],
         )
+
+        update_partition('-d', dev, 'zapped')
+
     except subprocess.CalledProcessError as e:
         raise Error(e)
 
@@ -1068,32 +1102,7 @@ def prepare_journal_dev(
             ],
         )
 
-        # try to make sure the kernel refreshes the table.  note
-        # that if this gets ebusy, we are probably racing with
-        # udev because it already updated it.. ignore failure here.
-
-        # On RHEL and CentOS distros, calling partprobe forces a reboot of the
-        # server. Since we are not resizing partitons so we rely on calling
-        # partx
-        if platform_distro().startswith(('centos', 'red')):
-            LOG.info('calling partx on prepared device %s', journal)
-            LOG.info('re-reading known partitions will display errors')
-            command(
-                [
-                    'partx',
-                    '-a',
-                    journal,
-                    ],
-                )
-
-        else:
-            LOG.debug('Calling partprobe on prepared device %s', journal)
-            command(
-                [
-                    'partprobe',
-                    journal,
-                    ],
-                )
+        update_partition('-a', journal, 'prepared')
 
         # wait for udev event queue to clear
         command(
@@ -1118,7 +1127,6 @@ def prepare_journal_dev(
     except subprocess.CalledProcessError as e:
         raise Error(e)
 
-
 def prepare_journal_file(
     journal):
 
@@ -1279,12 +1287,7 @@ def prepare_dev(
                     data,
                 ],
             )
-            command(
-                [
-                    'partprobe',
-                    data,
-                    ],
-                )
+            update_partition('-a', data, 'created')
             command(
                 [
                     # wait for udev event queue to clear
@@ -1500,33 +1503,7 @@ def main_prepare(args):
         prepare_lock.release()  # noqa
 
         if stat.S_ISBLK(dmode):
-            # try to make sure the kernel refreshes the table.  note
-            # that if this gets ebusy, we are probably racing with
-            # udev because it already updated it.. ignore failure here.
-
-            # On RHEL and CentOS distros, calling partprobe forces a reboot of
-            # the server. Since we are not resizing partitons so we rely on
-            # calling partx
-            if platform_distro().startswith(('centos', 'red')):
-                LOG.info('calling partx on prepared device %s', args.data)
-                LOG.info('re-reading known partitions will display errors')
-
-                command(
-                    [
-                        'partx',
-                        '-a',
-                        args.data,
-                        ],
-                    )
-
-            else:
-                LOG.debug('Calling partprobe on prepared device %s', args.data)
-                command(
-                    [
-                        'partprobe',
-                        args.data,
-                        ],
-                    )
+            update_partition('-a', args.data, 'prepared')
 
     except Error as e:
         if journal_dm_keypath:
@@ -1918,7 +1895,8 @@ def activate(
         raise Error('No OSD uuid assigned.')
     LOG.debug('OSD uuid is %s', fsid)
 
-    keyring = activate_key_template.format(cluster=cluster)
+    keyring = activate_key_template.format(cluster=cluster,
+                                           statedir=STATEDIR)
 
     osd_id = get_osd_id(path)
     if osd_id is None:
@@ -2657,7 +2635,7 @@ def parse_args():
         help='path to block device or directory',
         )
     activate_parser.set_defaults(
-        activate_key_template=STATEDIR + '/bootstrap-osd/{cluster}.keyring',
+        activate_key_template='{statedir}/bootstrap-osd/{cluster}.keyring',
         func=main_activate,
         )
 
@@ -2681,7 +2659,7 @@ def parse_args():
         choices=INIT_SYSTEMS,
         )
     activate_journal_parser.set_defaults(
-        activate_key_template=STATEDIR + '/bootstrap-osd/{cluster}.keyring',
+        activate_key_template='{statedir}/bootstrap-osd/{cluster}.keyring',
         func=main_activate_journal,
         )
 
@@ -2700,7 +2678,7 @@ def parse_args():
         choices=INIT_SYSTEMS,
         )
     activate_all_parser.set_defaults(
-        activate_key_template=STATEDIR + '/bootstrap-osd/{cluster}.keyring',
+        activate_key_template='{statedir}/bootstrap-osd/{cluster}.keyring',
         func=main_activate_all,
         )
 
diff --git a/src/ceph.in b/src/ceph.in
index 82c9085..c5b97ef 100755
--- a/src/ceph.in
+++ b/src/ceph.in
@@ -841,4 +841,8 @@ def main():
     return 0
 
 if __name__ == '__main__':
-    sys.exit(main())
+    retval = main()
+    # shutdown explicitly; Rados() does not
+    if cluster_handle:
+        cluster_handle.shutdown()
+    sys.exit(retval)
diff --git a/src/ceph_mon.cc b/src/ceph_mon.cc
index 80b17a1..1b52f58 100644
--- a/src/ceph_mon.cc
+++ b/src/ceph_mon.cc
@@ -419,6 +419,26 @@ int main(int argc, const char **argv)
     return 0;
   }
 
+  {
+    // check fs stats. don't start if it's critically close to full.
+    ceph_data_stats_t stats;
+    int err = get_fs_stats(stats, g_conf->mon_data.c_str());
+    if (err < 0) {
+      cerr << "error checking monitor data's fs stats: " << cpp_strerror(err)
+           << std::endl;
+      exit(-err);
+    }
+    if (stats.avail_percent <= g_conf->mon_data_avail_crit) {
+      cerr << "error: monitor data filesystem reached concerning levels of"
+           << " available storage space (available: "
+           << stats.avail_percent << "% " << prettybyte_t(stats.byte_avail)
+           << ")\nyou may adjust 'mon data avail crit' to a lower value"
+           << " to make this go away (default: " << g_conf->mon_data_avail_crit
+           << "%)\n" << std::endl;
+      exit(ENOSPC);
+    }
+  }
+
   // we fork early to prevent leveldb's environment static state from
   // screwing us over
   Preforker prefork;
diff --git a/src/civetweb/civetweb.h b/src/civetweb/civetweb.h
index a6ca3e7..5da8a73 100644
--- a/src/civetweb/civetweb.h
+++ b/src/civetweb/civetweb.h
@@ -24,7 +24,7 @@
 #define CIVETWEB_HEADER_INCLUDED
 
 #ifndef CIVETWEB_VERSION
-#define CIVETWEB_VERSION "1.6"
+#define CIVETWEB_VERSION "1.7"
 #endif
 
 #ifndef CIVETWEB_API
@@ -77,7 +77,7 @@ struct mg_request_info {
 
 /* This structure needs to be passed to mg_start(), to let civetweb know
    which callbacks to invoke. For detailed description, see
-   https://github.com/sunsetbrew/civetweb/blob/master/docs/UserManual.md */
+   https://github.com/bel2125/civetweb/blob/master/docs/UserManual.md */
 struct mg_callbacks {
     /* Called when civetweb has received new HTTP request.
        If callback returns non-zero,
@@ -94,6 +94,10 @@ struct mg_callbacks {
        non-zero, civetweb does not log anything. */
     int  (*log_message)(const struct mg_connection *, const char *message);
 
+    /* Called when civetweb is about to log access. If callback returns
+       non-zero, civetweb does not log anything. */
+    int  (*log_access)(const struct mg_connection *, const char *message);
+
     /* Called when civetweb initializes SSL library. */
     int  (*init_ssl)(void *ssl_context, void *user_data);
 
@@ -176,7 +180,7 @@ struct mg_callbacks {
      };
      struct mg_context *ctx = mg_start(&my_func, NULL, options);
 
-   Refer to https://github.com/sunsetbrew/civetweb/blob/master/docs/UserManual.md
+   Refer to https://github.com/bel2125/civetweb/blob/master/docs/UserManual.md
    for the list of valid option and their possible values.
 
    Return:
@@ -330,8 +334,18 @@ CIVETWEB_API int mg_websocket_write(struct mg_connection* conn, int opcode,
    Invoke this before mg_write or mg_printf when communicating with a
    websocket if your code has server-initiated communication as well as
    communication in direct response to a message. */
-CIVETWEB_API void mg_lock(struct mg_connection* conn);
-CIVETWEB_API void mg_unlock(struct mg_connection* conn);
+CIVETWEB_API void mg_lock_connection(struct mg_connection* conn);
+CIVETWEB_API void mg_unlock_connection(struct mg_connection* conn);
+
+#if defined(MG_LEGACY_INTERFACE)
+#define mg_lock mg_lock_connection
+#define mg_unlock mg_unlock_connection
+#endif
+
+/* Lock server context.  This lock may be used to protect resources
+   that are shared between different connection/worker threads. */
+CIVETWEB_API void mg_lock_context(struct mg_context* ctx);
+CIVETWEB_API void mg_unlock_context(struct mg_context* ctx);
 
 
 /* Opcodes, from http://tools.ietf.org/html/rfc6455 */
diff --git a/src/civetweb/include/civetweb.h b/src/civetweb/include/civetweb.h
index a6ca3e7..5da8a73 100644
--- a/src/civetweb/include/civetweb.h
+++ b/src/civetweb/include/civetweb.h
@@ -24,7 +24,7 @@
 #define CIVETWEB_HEADER_INCLUDED
 
 #ifndef CIVETWEB_VERSION
-#define CIVETWEB_VERSION "1.6"
+#define CIVETWEB_VERSION "1.7"
 #endif
 
 #ifndef CIVETWEB_API
@@ -77,7 +77,7 @@ struct mg_request_info {
 
 /* This structure needs to be passed to mg_start(), to let civetweb know
    which callbacks to invoke. For detailed description, see
-   https://github.com/sunsetbrew/civetweb/blob/master/docs/UserManual.md */
+   https://github.com/bel2125/civetweb/blob/master/docs/UserManual.md */
 struct mg_callbacks {
     /* Called when civetweb has received new HTTP request.
        If callback returns non-zero,
@@ -94,6 +94,10 @@ struct mg_callbacks {
        non-zero, civetweb does not log anything. */
     int  (*log_message)(const struct mg_connection *, const char *message);
 
+    /* Called when civetweb is about to log access. If callback returns
+       non-zero, civetweb does not log anything. */
+    int  (*log_access)(const struct mg_connection *, const char *message);
+
     /* Called when civetweb initializes SSL library. */
     int  (*init_ssl)(void *ssl_context, void *user_data);
 
@@ -176,7 +180,7 @@ struct mg_callbacks {
      };
      struct mg_context *ctx = mg_start(&my_func, NULL, options);
 
-   Refer to https://github.com/sunsetbrew/civetweb/blob/master/docs/UserManual.md
+   Refer to https://github.com/bel2125/civetweb/blob/master/docs/UserManual.md
    for the list of valid option and their possible values.
 
    Return:
@@ -330,8 +334,18 @@ CIVETWEB_API int mg_websocket_write(struct mg_connection* conn, int opcode,
    Invoke this before mg_write or mg_printf when communicating with a
    websocket if your code has server-initiated communication as well as
    communication in direct response to a message. */
-CIVETWEB_API void mg_lock(struct mg_connection* conn);
-CIVETWEB_API void mg_unlock(struct mg_connection* conn);
+CIVETWEB_API void mg_lock_connection(struct mg_connection* conn);
+CIVETWEB_API void mg_unlock_connection(struct mg_connection* conn);
+
+#if defined(MG_LEGACY_INTERFACE)
+#define mg_lock mg_lock_connection
+#define mg_unlock mg_unlock_connection
+#endif
+
+/* Lock server context.  This lock may be used to protect resources
+   that are shared between different connection/worker threads. */
+CIVETWEB_API void mg_lock_context(struct mg_context* ctx);
+CIVETWEB_API void mg_unlock_context(struct mg_context* ctx);
 
 
 /* Opcodes, from http://tools.ietf.org/html/rfc6455 */
diff --git a/src/civetweb/include/civetweb_conf.h b/src/civetweb/include/civetweb_conf.h
new file mode 100644
index 0000000..578143f
--- /dev/null
+++ b/src/civetweb/include/civetweb_conf.h
@@ -0,0 +1,6 @@
+#ifndef CIVETWEB_CONF_H
+#define CIVETWEB_CONF_H
+
+#define USE_IPV6 1
+
+#endif
diff --git a/src/civetweb/src/civetweb.c b/src/civetweb/src/civetweb.c
index 4aa8a02..3567df3 100644
--- a/src/civetweb/src/civetweb.c
+++ b/src/civetweb/src/civetweb.c
@@ -20,6 +20,8 @@
  * THE SOFTWARE.
  */
 
+#define RGW 1
+
 #if defined(_WIN32)
 #if !defined(_CRT_SECURE_NO_WARNINGS)
 #define _CRT_SECURE_NO_WARNINGS /* Disable deprecation warning in VS2005 */
@@ -181,6 +183,10 @@ typedef long off_t;
 #define sleep(x) Sleep((x) * 1000)
 #define rmdir(x) _rmdir(x)
 
+#if defined(USE_LUA) && defined(USE_WEBSOCKET)
+#define USE_TIMERS
+#endif
+
 #if !defined(va_copy)
 #define va_copy(x, y) x = y
 #endif /* !va_copy MINGW #defines va_copy */
@@ -309,6 +315,40 @@ typedef int SOCKET;
 
 #endif /* End of Windows and UNIX specific includes */
 
+#ifdef _WIN32
+static CRITICAL_SECTION global_log_file_lock;
+static DWORD pthread_self(void)
+{
+    return GetCurrentThreadId();
+}
+
+int pthread_key_create(pthread_key_t *key, void (*_must_be_zero)(void*) /* destructor function not supported for windows */)
+{
+    assert(_must_be_zero == NULL);
+    if ((key!=0) && (_must_be_zero == NULL)) {
+        *key = TlsAlloc();
+        return (*key != TLS_OUT_OF_INDEXES) ? 0 : -1;
+    }
+    return -2;
+}
+
+int pthread_key_delete(pthread_key_t key)
+{
+    return TlsFree(key) ? 0 : 1;
+}
+
+int pthread_setspecific(pthread_key_t key, void * value)
+{
+    return TlsSetValue(key, value) ? 0 : 1;
+}
+
+void *pthread_getspecific(pthread_key_t key)
+{
+    return TlsGetValue(key);
+}
+#endif /* _WIN32 */
+
+
 #include "civetweb.h"
 
 #define PASSWORDS_FILE_NAME ".htpasswd"
@@ -320,23 +360,30 @@ typedef int SOCKET;
 #endif
 #define ARRAY_SIZE(array) (sizeof(array) / sizeof(array[0]))
 
-#ifdef DEBUG_TRACE
-#undef DEBUG_TRACE
-#define DEBUG_TRACE(x)
-#else
+#if !defined(DEBUG_TRACE)
 #if defined(DEBUG)
-#define DEBUG_TRACE(x) do { \
-  flockfile(stdout); \
-  printf("*** %lu.%p.%s.%d: ", \
-         (unsigned long) time(NULL), (void *) pthread_self(), \
-         __func__, __LINE__); \
-  printf x; \
-  putchar('\n'); \
-  fflush(stdout); \
-  funlockfile(stdout); \
-} while (0)
+
+static void DEBUG_TRACE_FUNC(const char *func, unsigned line, PRINTF_FORMAT_STRING(const char *fmt), ...) PRINTF_ARGS(3, 4);
+
+static void DEBUG_TRACE_FUNC(const char *func, unsigned line, const char *fmt, ...) {
+
+  va_list args;
+  flockfile(stdout);
+  printf("*** %lu.%p.%s.%u: ",
+         (unsigned long) time(NULL), (void *) pthread_self(),
+         func, line);
+  va_start(args, fmt);
+  vprintf(fmt, args);
+  va_end(args);
+  putchar('\n');
+  fflush(stdout);
+  funlockfile(stdout);
+}
+
+#define DEBUG_TRACE(...) DEBUG_TRACE_FUNC(__func__, __LINE__, __VA_ARGS__) /* format string travels inside __VA_ARGS__, so a call with no extra arguments leaves no trailing comma */
+
 #else
-#define DEBUG_TRACE(x)
+#define DEBUG_TRACE(fmt, ...)
 #endif /* DEBUG */
 #endif /* DEBUG_TRACE */
 
@@ -357,7 +404,7 @@ static void * mg_malloc_ex(size_t size, const char * file, unsigned line) {
         memory = (void *)(((char*)data)+sizeof(size_t));
     }
 
-    sprintf(mallocStr, "MEM: %p %5u alloc   %7u %4u --- %s:%u\n", memory, size, totalMemUsed, blockCount, file, line);
+    sprintf(mallocStr, "MEM: %p %5lu alloc   %7lu %4lu --- %s:%u\n", memory, (unsigned long)size, totalMemUsed, blockCount, file, line);
 #if defined(_WIN32)
     OutputDebugStringA(mallocStr);
 #else
@@ -385,7 +432,7 @@ static void mg_free_ex(void * memory, const char * file, unsigned line) {
         size = *(size_t*)data;
         totalMemUsed -= size;
         blockCount--;
-        sprintf(mallocStr, "MEM: %p %5u free    %7u %4u --- %s:%u\n", memory, size, totalMemUsed, blockCount, file, line);
+        sprintf(mallocStr, "MEM: %p %5lu free    %7lu %4lu --- %s:%u\n", memory, (unsigned long)size, totalMemUsed, blockCount, file, line);
 #if defined(_WIN32)
         OutputDebugStringA(mallocStr);
 #else
@@ -400,23 +447,25 @@ static void * mg_realloc_ex(void * memory, size_t newsize, const char * file, un
 
     char mallocStr[256];
     void * data;
+    void * _realloc;
     size_t oldsize;
 
     if (newsize) {
         if (memory) {
             data = (void *)(((char*)memory)-sizeof(size_t));
             oldsize = *(size_t*)data;
-            data = realloc(data, newsize+sizeof(size_t));
-            if (data) {
+            _realloc = realloc(data, newsize+sizeof(size_t));
+            if (_realloc) {
+                data = _realloc;
                 totalMemUsed -= oldsize;
-                sprintf(mallocStr, "MEM: %p %5u r-free  %7u %4u --- %s:%u\n", memory, oldsize, totalMemUsed, blockCount, file, line);
+                sprintf(mallocStr, "MEM: %p %5lu r-free  %7lu %4lu --- %s:%u\n", memory, (unsigned long)oldsize, totalMemUsed, blockCount, file, line);
 #if defined(_WIN32)
                 OutputDebugStringA(mallocStr);
 #else
                 DEBUG_TRACE("%s", mallocStr);
 #endif
                 totalMemUsed += newsize;
-                sprintf(mallocStr, "MEM: %p %5u r-alloc %7u %4u --- %s:%u\n", memory, newsize, totalMemUsed, blockCount, file, line);
+                sprintf(mallocStr, "MEM: %p %5lu r-alloc %7lu %4lu --- %s:%u\n", memory, (unsigned long)newsize, totalMemUsed, blockCount, file, line);
 #if defined(_WIN32)
                 OutputDebugStringA(mallocStr);
 #else
@@ -430,6 +479,7 @@ static void * mg_realloc_ex(void * memory, size_t newsize, const char * file, un
 #else
                 DEBUG_TRACE("MEM: realloc failed\n");
 #endif
+                return _realloc;
             }
         } else {
             data = mg_malloc_ex(newsize, file, line);
@@ -454,43 +504,24 @@ static __inline void * mg_realloc(void * a, size_t b)  {return realloc(a, b);}
 static __inline void   mg_free(void * a)               {free(a);}
 #endif
 
+/* The following lines are just meant as a reminder to use the mg-functions for memory management */
+#ifdef malloc
+    #undef malloc
+#endif
+#ifdef calloc
+    #undef calloc
+#endif
+#ifdef realloc
+    #undef realloc
+#endif
+#ifdef free
+    #undef free
+#endif
 #define malloc  DO_NOT_USE_THIS_FUNCTION__USE_mg_malloc
 #define calloc  DO_NOT_USE_THIS_FUNCTION__USE_mg_calloc
 #define realloc DO_NOT_USE_THIS_FUNCTION__USE_mg_realloc
 #define free    DO_NOT_USE_THIS_FUNCTION__USE_mg_free
 
-#ifdef _WIN32
-static CRITICAL_SECTION global_log_file_lock;
-static DWORD pthread_self(void)
-{
-    return GetCurrentThreadId();
-}
-
-int pthread_key_create(pthread_key_t *key, void (*_must_be_zero)(void*) /* destructor function not supported for windows */)
-{
-    assert(_must_be_zero == NULL);
-    if ((key!=0) && (_must_be_zero == NULL)) {
-        *key = TlsAlloc();
-        return (*key != TLS_OUT_OF_INDEXES) ? 0 : -1;
-    }
-    return -2;
-}
-
-int pthread_key_delete(pthread_key_t key)
-{
-    return TlsFree(key) ? 0 : 1;
-}
-
-int pthread_setspecific(pthread_key_t key, void * value)
-{
-    return TlsSetValue(key, value) ? 0 : 1;
-}
-
-void *pthread_getspecific(pthread_key_t key)
-{
-    return TlsGetValue(key);
-}
-#endif /* _WIN32 */
 
 #define MD5_STATIC static
 #include "md5.h"
@@ -668,6 +699,7 @@ enum {
     GLOBAL_PASSWORDS_FILE, INDEX_FILES, ENABLE_KEEP_ALIVE, ACCESS_CONTROL_LIST,
     EXTRA_MIME_TYPES, LISTENING_PORTS, DOCUMENT_ROOT, SSL_CERTIFICATE,
     NUM_THREADS, RUN_AS_USER, REWRITE, HIDE_FILES, REQUEST_TIMEOUT,
+    DECODE_URL,
 
 #if defined(USE_LUA)
     LUA_PRELOAD_FILE, LUA_SCRIPT_EXTENSIONS, LUA_SERVER_PAGE_EXTENSIONS,
@@ -714,6 +746,7 @@ static struct mg_option config_options[] = {
     {"url_rewrite_patterns",        12345,                     NULL},
     {"hide_files_patterns",         12345,                     NULL},
     {"request_timeout_ms",          CONFIG_TYPE_NUMBER,        "30000"},
+    {"decode_url",                  CONFIG_TYPE_BOOLEAN,       "yes"},
 
 #if defined(USE_LUA)
     {"lua_preload_file",            CONFIG_TYPE_FILE,          NULL},
@@ -753,60 +786,64 @@ struct mg_context {
     in_port_t *listening_ports;
     int num_listening_sockets;
 
-    volatile int num_threads;  /* Number of threads */
-    pthread_mutex_t mutex;     /* Protects (max|num)_threads */
-    pthread_cond_t  cond;      /* Condvar for tracking workers terminations */
+    volatile int num_threads;       /* Number of threads */
+    pthread_mutex_t thread_mutex;   /* Protects (max|num)_threads */
+    pthread_cond_t thread_cond;     /* Condvar for tracking workers terminations */
 
     struct socket queue[MGSQLEN];   /* Accepted sockets */
-    volatile int sq_head;      /* Head of the socket queue */
-    volatile int sq_tail;      /* Tail of the socket queue */
-    pthread_cond_t sq_full;    /* Signaled when socket is produced */
-    pthread_cond_t sq_empty;   /* Signaled when socket is consumed */
-    pthread_t masterthreadid;  /* The master thread ID. */
-    int workerthreadcount;     /* The amount of worker threads. */
-    pthread_t *workerthreadids;/* The worker thread IDs. */
+    volatile int sq_head;           /* Head of the socket queue */
+    volatile int sq_tail;           /* Tail of the socket queue */
+    pthread_cond_t sq_full;         /* Signaled when socket is produced */
+    pthread_cond_t sq_empty;        /* Signaled when socket is consumed */
+    pthread_t masterthreadid;       /* The master thread ID */
+    int workerthreadcount;          /* The amount of worker threads. */
+    pthread_t *workerthreadids;     /* The worker thread IDs */
 
-    unsigned long start_time;  /* Server start time, used for authentication */
-    unsigned long nonce_count; /* Used nonces, used for authentication */
+    unsigned long start_time;       /* Server start time, used for authentication */
+    pthread_mutex_t nonce_mutex;    /* Protects nonce_count */
+    unsigned long nonce_count;      /* Used nonces, used for authentication */
 
-    char *systemName;          /* What operating system is running */
+    char *systemName;               /* What operating system is running */
 
     /* linked list of uri handlers */
     struct mg_request_handler_info *request_handlers;
 
 #if defined(USE_LUA) && defined(USE_WEBSOCKET)
     /* linked list of shared lua websockets */
-    struct mg_shared_lua_websocket *shared_lua_websockets;
+    struct mg_shared_lua_websocket_list *shared_lua_websockets;
+#endif
+
+#ifdef USE_TIMERS
+    struct timers * timers;
 #endif
 };
 
 struct mg_connection {
     struct mg_request_info request_info;
     struct mg_context *ctx;
-    SSL *ssl;                   /* SSL descriptor */
-    SSL_CTX *client_ssl_ctx;    /* SSL context for client connections */
-    struct socket client;       /* Connected client */
-    time_t birth_time;          /* Time when request was received */
-    int64_t num_bytes_sent;     /* Total bytes sent to client */
-    int64_t content_len;        /* Content-Length header value */
-    int64_t consumed_content;   /* How many bytes of content have been read */
-    char *buf;                  /* Buffer for received data */
-    char *path_info;            /* PATH_INFO part of the URL */
-    int must_close;             /* 1 if connection must be closed */
-    int in_error_handler;       /* 1 if in handler for user defined error pages */
-    int buf_size;               /* Buffer size */
-    int request_len;            /* Size of the request + headers in a buffer */
-    int data_len;               /* Total size of data in a buffer */
-    int status_code;            /* HTTP reply status code, e.g. 200 */
-    int throttle;               /* Throttling, bytes/sec. <= 0 means no
-                                   throttle */
-    time_t last_throttle_time;  /* Last time throttled data was sent */
-    int64_t last_throttle_bytes;/* Bytes sent this second */
-    pthread_mutex_t mutex;      /* Used by mg_lock/mg_unlock to ensure atomic
-                                   transmissions for websockets */
+    SSL *ssl;                       /* SSL descriptor */
+    SSL_CTX *client_ssl_ctx;        /* SSL context for client connections */
+    struct socket client;           /* Connected client */
+    time_t birth_time;              /* Time when request was received */
+    int64_t num_bytes_sent;         /* Total bytes sent to client */
+    int64_t content_len;            /* Content-Length header value */
+    int64_t consumed_content;       /* How many bytes of content have been read */
+    char *buf;                      /* Buffer for received data */
+    char *path_info;                /* PATH_INFO part of the URL */
+    int must_close;                 /* 1 if connection must be closed */
+    int in_error_handler;           /* 1 if in handler for user defined error pages */
+    int buf_size;                   /* Buffer size */
+    int request_len;                /* Size of the request + headers in a buffer */
+    int data_len;                   /* Total size of data in a buffer */
+    int status_code;                /* HTTP reply status code, e.g. 200 */
+    int throttle;                   /* Throttling, bytes/sec. <= 0 means no throttle */
+    time_t last_throttle_time;      /* Last time throttled data was sent */
+    int64_t last_throttle_bytes;    /* Bytes sent this second */
+    pthread_mutex_t mutex;          /* Used by mg_lock_connection/mg_unlock_connection to ensure atomic transmissions for websockets */
 #if defined(USE_LUA) && defined(USE_WEBSOCKET)
-    void * lua_websocket_state; /* Lua_State for a websocket connection */
+    void * lua_websocket_state;     /* Lua_State for a websocket connection */
 #endif
+    int is_chunked;                 /* transfer-encoding is chunked */
 };
 
 static pthread_key_t sTlsKey;  /* Thread local storage index */
@@ -1253,7 +1290,6 @@ static int match_prefix(const char *pattern, int pattern_len, const char *str)
     }
 
     i = j = 0;
-    res = -1;
     for (; i < pattern_len; i++, j++) {
         if (pattern[i] == '?' && str[j] != '\0') {
             continue;
@@ -1298,6 +1334,11 @@ static int should_keep_alive(const struct mg_connection *conn)
     return 1;
 }
 
+static int should_decode_url(const struct mg_connection *conn)
+{
+    return (mg_strcasecmp(conn->ctx->config[DECODE_URL], "yes") == 0);
+}
+
 static const char *suggest_connection_header(const struct mg_connection *conn)
 {
     return should_keep_alive(conn) ? "keep-alive" : "close";
@@ -1381,7 +1422,7 @@ static void send_http_error(struct mg_connection *conn, int status,
             len += mg_vsnprintf(conn, buf + len, sizeof(buf) - len, fmt, ap);
             va_end(ap);
         }
-        DEBUG_TRACE(("[%s]", buf));
+        DEBUG_TRACE("[%s]", buf);
 
         mg_printf(conn, "HTTP/1.1 %d %s\r\n"
                         "Content-Length: %d\r\n"
@@ -1408,7 +1449,18 @@ static int pthread_mutex_destroy(pthread_mutex_t *mutex)
 
 static int pthread_mutex_lock(pthread_mutex_t *mutex)
 {
-    return WaitForSingleObject(*mutex, INFINITE) == WAIT_OBJECT_0? 0 : -1;
+    return WaitForSingleObject(*mutex, INFINITE) == WAIT_OBJECT_0 ? 0 : -1;
+}
+
+static int pthread_mutex_trylock(pthread_mutex_t *mutex)
+{
+    switch (WaitForSingleObject(*mutex, 0)) {
+        case WAIT_OBJECT_0:
+            return 0;
+        case WAIT_TIMEOUT:
+            return -2; /* EBUSY */
+    }
+    return -1;
 }
 
 static int pthread_mutex_unlock(pthread_mutex_t *mutex)
@@ -1853,7 +1905,7 @@ static int mg_join_thread(pthread_t threadid)
         int err;
 
         err = GetLastError();
-        DEBUG_TRACE(("WaitForSingleObject() failed, error %d", err));
+        DEBUG_TRACE("WaitForSingleObject() failed, error %d", err);
     } else {
         if (dwevent == WAIT_OBJECT_0) {
             CloseHandle(threadid);
@@ -1959,7 +2011,7 @@ static pid_t spawn_process(struct mg_connection *conn, const char *prog,
     mg_snprintf(conn, cmdline, sizeof(cmdline), "%s%s\"%s\\%s\"",
                 interp, interp[0] == '\0' ? "" : " ", full_dir, prog);
 
-    DEBUG_TRACE(("Running [%s]", cmdline));
+    DEBUG_TRACE("Running [%s]", cmdline);
     if (CreateProcessA(NULL, cmdline, NULL, NULL, TRUE,
                        CREATE_NEW_PROCESS_GROUP, envblk, NULL, &si, &pi) == 0) {
         mg_cry(conn, "%s: CreateProcess(%s): %ld",
@@ -2002,9 +2054,9 @@ static int mg_stat(struct mg_connection *conn, const char *path,
 static void set_close_on_exec(int fd, struct mg_connection *conn /* may be null */)
 {
     if (fcntl(fd, F_SETFD, FD_CLOEXEC) != 0) {
-        if (conn)
-            mg_cry(conn, "%s: fcntl(F_SETFD FD_CLOEXEC) failed: %s",
-                   __func__, strerror(ERRNO));
+        if (conn) {
+            mg_cry(conn, "%s: fcntl(F_SETFD FD_CLOEXEC) failed: %s", __func__, strerror(ERRNO));
+        }
     }
 }
 
@@ -2031,8 +2083,7 @@ int mg_start_thread(mg_thread_func_t func, void *param)
 
 /* Start a thread storing the thread context. */
 
-static int mg_start_thread_with_id(mg_thread_func_t func, void *param,
-                                   pthread_t *threadidptr)
+static int mg_start_thread_with_id(mg_thread_func_t func, void *param, pthread_t *threadidptr)
 {
     pthread_t thread_id;
     pthread_attr_t attr;
@@ -2128,8 +2179,7 @@ static int set_non_blocking_mode(SOCKET sock)
 
 /* Write data to the IO channel - opened file descriptor, socket or SSL
    descriptor. Return number of bytes written. */
-static int64_t push(FILE *fp, SOCKET sock, SSL *ssl, const char *buf,
-                    int64_t len)
+static int64_t push(FILE *fp, SOCKET sock, SSL *ssl, const char *buf, int64_t len)
 {
     int64_t sent;
     int n, k;
@@ -2207,7 +2257,25 @@ static int pull_all(FILE *fp, struct mg_connection *conn, char *buf, int len)
     return nread;
 }
 
-int mg_read(struct mg_connection *conn, void *buf, size_t len)
+static void fast_forward_request(struct mg_connection *conn)
+{
+    char buf[MG_BUF_LEN];
+    int to_read, nread;
+
+    while (conn->consumed_content < conn->content_len) {
+        to_read = sizeof(buf);
+        if ((int64_t) to_read > conn->content_len - conn->consumed_content) {
+            to_read = (int) (conn->content_len - conn->consumed_content);
+        }
+
+	nread = mg_read(conn, buf, to_read);
+        if (nread <= 0) {
+            break;
+        }
+    }
+}
+
+int mg_read_inner(struct mg_connection *conn, void *buf, size_t len)
 {
     int64_t n, buffered_len, nread;
     const char *body;
@@ -2248,6 +2316,43 @@ int mg_read(struct mg_connection *conn, void *buf, size_t len)
     return nread;
 }
 
+static int mg_getc(struct mg_connection *conn) {
+    char c;
+    conn->content_len++;
+    if ( mg_read_inner(conn,&c,1) <= 0 ) return EOF;
+    return c;
+}
+
+int mg_read(struct mg_connection *conn, void *buf, size_t len) {
+    if ( conn->is_chunked ) {
+        if (conn->content_len <= 0 ) conn->content_len = 0;
+        if (conn->consumed_content < conn->content_len) return mg_read_inner(conn,buf,len);
+        int i = 0;
+        char str[64];
+        while (1) {
+            int c = mg_getc(conn);
+	    if (c == EOF) return EOF;
+            if ( ! ( c == '\n' || c == '\r' ) ) {
+                str[i++] = c;
+                break;
+            }
+        }
+        for (; i < (int)sizeof(str); i++) {
+            int c = mg_getc(conn);
+            if ( c == EOF ) return -1;
+            str[i] = (char) c;
+            if ( i > 0 && str[i] == '\n' && str[i-1] == '\r' ) break;
+        }
+        char *end = 0;
+        long chunkSize = strtol(str,&end,16);
+        if ( end != str+(i-1) ) return -1;
+        if ( chunkSize == 0 ) return 0;
+        conn->content_len += chunkSize;
+    }
+    return mg_read_inner(conn,buf,len);
+}
+
+
 int mg_write(struct mg_connection *conn, const void *buf, size_t len)
 {
     time_t now;
@@ -2381,7 +2486,7 @@ int mg_url_decode(const char *src, int src_len, char *dst,
 #define HEXTOI(x) (isdigit(x) ? x - '0' : x - 'W')
 
     for (i = j = 0; i < src_len && j < dst_len - 1; i++, j++) {
-        if (src[i] == '%' && i < src_len - 2 &&
+        if (i < src_len - 2 && src[i] == '%' &&
             isxdigit(* (const unsigned char *) (src + i + 1)) &&
             isxdigit(* (const unsigned char *) (src + i + 2))) {
             a = tolower(* (const unsigned char *) (src + i + 1));
@@ -2656,10 +2761,11 @@ static int get_request_len(const char *buf, int buflen)
 {
     const char *s, *e;
     int len = 0;
+    int in_content = 0;
 
-    for (s = buf, e = s + buflen - 1; len <= 0 && s < e; s++)
+    for (s = buf, e = s + buflen - 1; len <= 0 && s < e; s++) {
         /* Control characters are not allowed but >=128 is. */
-        if (!isprint(* (const unsigned char *) s) && *s != '\r' &&
+        if (!in_content && !isprint(* (const unsigned char *) s) && *s != '\r' &&
             *s != '\n' && * (const unsigned char *) s < 128) {
             len = -1;
             break;  /* [i_a] abort scan as soon as one malformed character is
@@ -2670,8 +2776,14 @@ static int get_request_len(const char *buf, int buflen)
         } else if (s[0] == '\n' && &s[1] < e &&
                    s[1] == '\r' && s[2] == '\n') {
             len = (int) (s - buf) + 3;
+	    in_content = 0;
         }
 
+	if (!in_content && *s == ':') {
+	    in_content = 1;
+	}
+    }
+
     return len;
 }
 
@@ -2724,6 +2836,7 @@ static time_t parse_date_string(const char *datetime)
     return result;
 }
 
+#ifndef RGW
 /* Protect against directory disclosure attack by removing '..',
    excessive '/' and '\' characters */
 static void remove_double_dots_and_double_slashes(char *s)
@@ -2747,6 +2860,7 @@ static void remove_double_dots_and_double_slashes(char *s)
     }
     *p = '\0';
 }
+#endif
 
 static const struct {
     const char *extension;
@@ -2976,9 +3090,11 @@ static void open_auth_file(struct mg_connection *conn, const char *path,
         }
     } else {
         /* Try to find .htpasswd in requested directory. */
-        for (p = path, e = p + strlen(p) - 1; e > p; e--)
-            if (e[0] == '/')
+        for (p = path, e = p + strlen(p) - 1; e > p; e--) {
+            if (e[0] == '/') {
                 break;
+            }
+        }
         mg_snprintf(conn, name, sizeof(name), "%.*s%c%s",
                     (int) (e - p), p, '/', PASSWORDS_FILE_NAME);
         if (!mg_fopen(conn, name, "r", filep)) {
@@ -3108,33 +3224,98 @@ static char *mg_fgets(char *buf, size_t size, struct file *filep, char **p)
     }
 }
 
-/* Authorize against the opened passwords file. Return 1 if authorized. */
-static int authorize(struct mg_connection *conn, struct file *filep)
-{
+struct read_auth_file_struct {
+    struct mg_connection *conn;
     struct ah ah;
-    char line[256], f_user[256] = "", ha1[256] = "", f_domain[256] = "", buf[MG_BUF_LEN], *p;
+    char *domain;
+    char buf[256+256+40];
+    char *f_user;
+    char *f_domain;
+    char *f_ha1;
+};
 
-    if (!parse_auth_header(conn, buf, sizeof(buf), &ah)) {
-        return 0;
-    }
+static int read_auth_file(struct file *filep, struct read_auth_file_struct * workdata)
+{
+    char *p;
+    int is_authorized = 0;
+    struct file fp;
+    int l;
 
     /* Loop over passwords file */
     p = (char *) filep->membuf;
-    while (mg_fgets(line, sizeof(line), filep, &p) != NULL) {
-        if (sscanf(line, "%255[^:]:%255[^:]:%255s", f_user, f_domain, ha1) != 3) {
+    while (mg_fgets(workdata->buf, sizeof(workdata->buf), filep, &p) != NULL) {
+
+        l = strlen(workdata->buf);
+        while (l>0) {
+            if (isspace(workdata->buf[l-1]) || iscntrl(workdata->buf[l-1])) {
+                l--;
+                workdata->buf[l] = 0;
+            } else break;
+        }
+        if (l<1) continue;
+
+        workdata->f_user = workdata->buf;
+
+        if (workdata->f_user[0]==':') {
+            /* user names may not contain a ':' and may not be empty,
+               so lines starting with ':' may be used for a special purpose */
+            if (workdata->f_user[1]=='#') {
+                /* :# is a comment */
+                continue;
+            } else if (!strncmp(workdata->f_user+1,"include=",8)) {
+                if (mg_fopen(workdata->conn, workdata->f_user+9, "r", &fp)) {
+                    is_authorized = read_auth_file(&fp, workdata);
+                    mg_fclose(&fp);
+                } else {
+                    mg_cry(workdata->conn, "%s: cannot open authorization file: %s", __func__, workdata->buf);
+                }
+                continue;
+            }
+            /* everything is invalid for the moment (might change in the future) */
+            mg_cry(workdata->conn, "%s: syntax error in authorization file: %s", __func__, workdata->buf);
             continue;
         }
-        f_user[255]=0;
-        f_domain[255]=0;
-        ha1[255]=0;
 
-        if (!strcmp(ah.user, f_user) &&
-            !strcmp(conn->ctx->config[AUTHENTICATION_DOMAIN], f_domain))
-            return check_password(conn->request_info.request_method, ha1, ah.uri,
-                                  ah.nonce, ah.nc, ah.cnonce, ah.qop, ah.response);
+        workdata->f_domain = strchr(workdata->f_user, ':');
+        if (workdata->f_domain == NULL) {
+            mg_cry(workdata->conn, "%s: syntax error in authorization file: %s", __func__, workdata->buf);
+            continue;
+        }
+        *(workdata->f_domain) = 0;
+        (workdata->f_domain)++;
+
+        workdata->f_ha1 = strchr(workdata->f_domain, ':');
+        if (workdata->f_ha1 == NULL) {
+            mg_cry(workdata->conn, "%s: syntax error in authorization file: %s", __func__, workdata->buf);
+            continue;
+        }
+        *(workdata->f_ha1) = 0;
+        (workdata->f_ha1)++;
+
+        if (!strcmp(workdata->ah.user, workdata->f_user) && !strcmp(workdata->domain, workdata->f_domain)) {
+            return check_password(workdata->conn->request_info.request_method, workdata->f_ha1, workdata->ah.uri,
+                                  workdata->ah.nonce, workdata->ah.nc, workdata->ah.cnonce, workdata->ah.qop, workdata->ah.response);
+        }
     }
 
-    return 0;
+    return is_authorized;
+}
+
+/* Authorize against the opened passwords file. Return 1 if authorized. */
+static int authorize(struct mg_connection *conn, struct file *filep)
+{
+    struct read_auth_file_struct workdata;
+    char buf[MG_BUF_LEN];
+
+    memset(&workdata,0,sizeof(workdata));
+    workdata.conn = conn;
+
+    if (!parse_auth_header(conn, buf, sizeof(buf), &workdata.ah)) {
+        return 0;
+    }
+    workdata.domain = conn->ctx->config[AUTHENTICATION_DOMAIN];
+
+    return read_auth_file(filep, &workdata);
 }
 
 /* Return 1 if request is authorised, 0 otherwise. */
@@ -3172,16 +3353,16 @@ static int check_authorization(struct mg_connection *conn, const char *path)
 
 static void send_authorization_request(struct mg_connection *conn)
 {
-    char date[64];
-    time_t curtime = time(NULL);
-    unsigned long nonce = (unsigned long)(conn->ctx->start_time);
-
-    (void)pthread_mutex_lock(&conn->ctx->mutex);
+    char date[64];
+    time_t curtime = time(NULL);
+    unsigned long nonce = (unsigned long)(conn->ctx->start_time);
+
+    (void)pthread_mutex_lock(&conn->ctx->nonce_mutex);
     nonce += conn->ctx->nonce_count;
     ++conn->ctx->nonce_count;
-    (void)pthread_mutex_unlock(&conn->ctx->mutex);
-
-    nonce ^= (unsigned long)(conn->ctx);
+    (void)pthread_mutex_unlock(&conn->ctx->nonce_mutex);
+
+    nonce ^= (unsigned long)(conn->ctx);
     conn->status_code = 401;
     conn->must_close = 1;
 
@@ -3215,8 +3396,8 @@ static int is_authorized_for_put(struct mg_connection *conn)
 int mg_modify_passwords_file(const char *fname, const char *domain,
                              const char *user, const char *pass)
 {
-    int found;
-    char line[512], u[512] = "", d[512] ="", ha1[33], tmp[PATH_MAX+1];
+    int found, i;
+    char line[512], u[512] = "", d[512] ="", ha1[33], tmp[PATH_MAX+8];
     FILE *fp, *fp2;
 
     found = 0;
@@ -3227,6 +3408,25 @@ int mg_modify_passwords_file(const char *fname, const char *domain,
         pass = NULL;
     }
 
+    /* Other arguments must not be empty */
+    if (fname == NULL || domain == NULL || user == NULL) return 0;
+
+    /* Using the given file format, user name and domain must not contain ':' */
+    if (strchr(user, ':') != NULL) return 0;
+    if (strchr(domain, ':') != NULL) return 0;
+
+    /* Do not allow control characters like newline in user name and domain.
+       Do not allow excessively long names either. */
+    for (i=0; i<255 && user[i]!=0; i++) {
+        if (iscntrl(user[i])) return 0;
+    }
+    if (user[i]) return 0;
+    for (i=0; i<255 && domain[i]!=0; i++) {
+        if (iscntrl(domain[i])) return 0;
+    }
+    if (domain[i]) return 0;
+
+    /* Create a temporary file name */
     (void) snprintf(tmp, sizeof(tmp) - 1, "%s.tmp", fname);
     tmp[sizeof(tmp) - 1] = 0;
 
@@ -3822,6 +4022,7 @@ static int parse_http_message(char *buf, int len, struct mg_request_info *ri)
 
         /* HTTP message could be either HTTP request or HTTP response, e.g.
            "GET / HTTP/1.0 ...." or  "HTTP/1.0 200 OK ..." */
+#ifndef RGW
         is_request = is_valid_http_method(ri->request_method);
         if ((is_request && memcmp(ri->http_version, "HTTP/", 5) != 0) ||
             (!is_request && memcmp(ri->request_method, "HTTP/", 5) != 0)) {
@@ -3832,6 +4033,17 @@ static int parse_http_message(char *buf, int len, struct mg_request_info *ri)
             }
             parse_http_headers(&buf, ri);
         }
+#else
+	is_request = (memcmp(ri->http_version, "HTTP/", 5) == 0);
+	if (is_request) {
+	    ri->http_version += 5;
+	}
+	if (is_request || memcmp(ri->request_method, "HTTP/", 5) == 0) {
+            parse_http_headers(&buf, ri);
+	} else {
+            request_length = -1;
+	}
+#endif
     }
     return request_length;
 }
@@ -3928,7 +4140,7 @@ static int forward_body_data(struct mg_connection *conn, FILE *fp,
     expect = mg_get_header(conn, "Expect");
     assert(fp != NULL);
 
-    if (conn->content_len == -1) {
+    if (conn->content_len == -1 && !conn->is_chunked) {
         send_http_error(conn, 411, "Length Required", "%s", "");
     } else if (expect != NULL && mg_strcasecmp(expect, "100-continue")) {
         send_http_error(conn, 417, "Expectation Failed", "%s", "");
@@ -4235,7 +4447,7 @@ static void handle_cgi_request(struct mg_connection *conn, const char *prog)
        Do not send anything back to client, until we buffer in all
        HTTP headers. */
     data_len = 0;
-    buf = mg_malloc(buflen);
+    buf = (char *)mg_malloc(buflen);
     if (buf == NULL) {
         send_http_error(conn, 500, http_500_error,
                         "Not enough memory for buffer (%u bytes)",
@@ -4343,7 +4555,7 @@ static int put_dir(struct mg_connection *conn, const char *path)
         buf[len] = '\0';
 
         /* Try to create intermediate directory */
-        DEBUG_TRACE(("mkdir(%s)", buf));
+        DEBUG_TRACE("mkdir(%s)", buf);
         if (!mg_stat(conn, buf, &file) && mg_mkdir(buf, 0755) != 0) {
             res = -1;
             break;
@@ -4362,7 +4574,7 @@ static void mkcol(struct mg_connection *conn, const char *path)
 {
     int rc, body_len;
     struct de de;
-    char date[64];
+    char date[64];
     time_t curtime = time(NULL);
 
     memset(&de.file, 0, sizeof(de.file));
@@ -4413,7 +4625,7 @@ static void put_file(struct mg_connection *conn, const char *path)
     const char *range;
     int64_t r1, r2;
     int rc;
-    char date[64];
+    char date[64];
     time_t curtime = time(NULL);
 
     conn->status_code = mg_stat(conn, path, &file) ? 200 : 201;
@@ -4687,7 +4899,7 @@ static void handle_propfind(struct mg_connection *conn, const char *path,
                             struct file *filep)
 {
     const char *depth = mg_get_header(conn, "Depth");
-    char date[64];
+    char date[64];
     time_t curtime = time(NULL);
 
     gmt_time_string(date, sizeof(date), &curtime);
@@ -4717,16 +4929,30 @@ static void handle_propfind(struct mg_connection *conn, const char *path,
     conn->num_bytes_sent += mg_printf(conn, "%s\n", "</d:multistatus>");
 }
 
-void mg_lock(struct mg_connection* conn)
+void mg_lock_connection(struct mg_connection* conn)
 {
     (void) pthread_mutex_lock(&conn->mutex);
 }
 
-void mg_unlock(struct mg_connection* conn)
+void mg_unlock_connection(struct mg_connection* conn)
 {
     (void) pthread_mutex_unlock(&conn->mutex);
 }
 
+void mg_lock_context(struct mg_context* ctx)
+{
+    (void) pthread_mutex_lock(&ctx->nonce_mutex);
+}
+
+void mg_unlock_context(struct mg_context* ctx)
+{
+    (void) pthread_mutex_unlock(&ctx->nonce_mutex);
+}
+
+#if defined(USE_TIMERS)
+#include "timer.inl"
+#endif /* USE_TIMERS */
+
 #ifdef USE_LUA
 #include "mod_lua.inl"
 #endif /* USE_LUA */
@@ -5015,9 +5241,9 @@ static void read_websocket(struct mg_connection *conn)
 
             /* Copy the mask before we shift the queue and destroy it */
             if (mask_len > 0) {
-                *(uint32_t*)mask = *(uint32_t*)(buf + header_len - mask_len);
+                memcpy(mask, buf + header_len - mask_len, sizeof(mask));
             } else {
-                *(uint32_t*)mask = 0;
+                memset(mask, 0, sizeof(mask));
             }
 
             /* Read frame payload from the first message in the queue into
@@ -5075,7 +5301,7 @@ static void read_websocket(struct mg_connection *conn)
                  !conn->ctx->callbacks.websocket_data(conn, mop, data, data_len)) ||
 #ifdef USE_LUA
                 (conn->lua_websocket_state &&
-                 !lua_websocket_data(conn, mop, data, data_len)) ||
+                 !lua_websocket_data(conn, conn->lua_websocket_state, mop, data, data_len)) ||
 #endif
                 (buf[0] & 0xf) == WEBSOCKET_OPCODE_CONNECTION_CLOSE) {  /* Opcode == 8, connection close */
                 break;
@@ -5130,10 +5356,10 @@ int mg_websocket_write(struct mg_connection* conn, int opcode, const char* data,
        but mongoose's mg_printf/mg_write is not (because of the loop in
        push(), although that is only a problem if the packet is large or
        outgoing buffer is full). */
-    (void) mg_lock(conn);
+    (void) mg_lock_connection(conn);
     retval = mg_write(conn, header, headerLen);
     retval = mg_write(conn, data, dataLen);
-    mg_unlock(conn);
+    mg_unlock_connection(conn);
 
     return retval;
 }
@@ -5142,7 +5368,7 @@ static void handle_websocket_request(struct mg_connection *conn, const char *pat
 {
     const char *version = mg_get_header(conn, "Sec-WebSocket-Version");
 #ifdef USE_LUA
-    int lua_websock, shared_lua_websock = 0;
+    int lua_websock = 0;
     /* TODO: A websocket script may be shared between several clients, allowing them to communicate
              directly instead of writing to a data base and polling the data base. */
 #endif
@@ -5155,17 +5381,17 @@ static void handle_websocket_request(struct mg_connection *conn, const char *pat
         /* The C callback is called before Lua and may prevent Lua from handling the websocket. */
     } else {
 #ifdef USE_LUA
-        lua_websock = conn->ctx->config[LUA_WEBSOCKET_EXTENSIONS] ?
-                          match_prefix(conn->ctx->config[LUA_WEBSOCKET_EXTENSIONS],
+        if (conn->ctx->config[LUA_WEBSOCKET_EXTENSIONS]) {
+            lua_websock = match_prefix(conn->ctx->config[LUA_WEBSOCKET_EXTENSIONS],
                                        (int)strlen(conn->ctx->config[LUA_WEBSOCKET_EXTENSIONS]),
-                                       path) : 0;
+                                       path);
+        }
 
-        if (lua_websock || shared_lua_websock) {
-            /* TODO */ shared_lua_websock = 0;
-            conn->lua_websocket_state = lua_websocket_new(path, conn, !!shared_lua_websock);
+        if (lua_websock) {
+            conn->lua_websocket_state = lua_websocket_new(path, conn);
             if (conn->lua_websocket_state) {
                 send_websocket_handshake(conn);
-                if (lua_websocket_ready(conn)) {
+                if (lua_websocket_ready(conn, conn->lua_websocket_state)) {
                     read_websocket(conn);
                 }
             }
@@ -5295,6 +5521,7 @@ int mg_upload(struct mg_connection *conn, const char *destination_dir)
         assert(len >= 0 && len <= (int) sizeof(buf));
         while ((n = mg_read(conn, buf + len, sizeof(buf) - len)) > 0) {
             len += n;
+            assert(len <= (int) sizeof(buf));
         }
         if ((headers_len = get_request_len(buf, len)) <= 0) {
             break;
@@ -5422,7 +5649,7 @@ static void redirect_to_https_port(struct mg_connection *conn, int ssl_index)
 
 void mg_set_request_handler(struct mg_context *ctx, const char *uri, mg_request_handler handler, void *cbdata)
 {
-    struct mg_request_handler_info *tmp_rh, *lastref = 0;
+    struct mg_request_handler_info *tmp_rh, *lastref = NULL;
     size_t urilen = strlen(uri);
 
     /* first see it the uri exists */
@@ -5507,10 +5734,10 @@ static int use_request_handler(struct mg_connection *conn)
 
             return tmp_rh->handler(conn, tmp_rh->cbdata);
         }
-
-        /* try for pattern match */
-        if (match_prefix(tmp_rh->uri, tmp_rh->uri_len, uri) > 0) {
-           return tmp_rh->handler(conn, tmp_rh->cbdata);
+
+        /* try for pattern match */
+        if (match_prefix(tmp_rh->uri, tmp_rh->uri_len, uri) > 0) {
+           return tmp_rh->handler(conn, tmp_rh->cbdata);
         }
 
     }
@@ -5528,21 +5755,28 @@ static void handle_request(struct mg_connection *conn)
     char path[PATH_MAX];
     int uri_len, ssl_index, is_script_resource;
     struct file file = STRUCT_FILE_INITIALIZER;
-    char date[64];
+    char date[64];
     time_t curtime = time(NULL);
 
     if ((conn->request_info.query_string = strchr(ri->uri, '?')) != NULL) {
         * ((char *) conn->request_info.query_string++) = '\0';
     }
     uri_len = (int) strlen(ri->uri);
-    mg_url_decode(ri->uri, uri_len, (char *) ri->uri, uri_len + 1, 0);
+
+    if (should_decode_url(conn)) {
+      mg_url_decode(ri->uri, uri_len, (char *) ri->uri, uri_len + 1, 0);
+    }
+
+#ifndef RGW
     remove_double_dots_and_double_slashes((char *) ri->uri);
+#endif
+
     path[0] = '\0';
     convert_uri_to_file_name(conn, path, sizeof(path), &file, &is_script_resource);
     conn->throttle = set_throttle(conn->ctx->config[THROTTLE],
                                   get_remote_ip(conn), ri->uri);
 
-    DEBUG_TRACE(("%s", ri->uri));
+    DEBUG_TRACE("%s", ri->uri);
     /* Perform redirect and auth checks before calling begin_request() handler.
        Otherwise, begin_request() would need to perform auth checks and
        redirects. */
@@ -5555,6 +5789,7 @@ static void handle_request(struct mg_connection *conn)
     } else if (conn->ctx->callbacks.begin_request != NULL &&
                conn->ctx->callbacks.begin_request(conn)) {
         /* Do nothing, callback has served the request */
+	fast_forward_request(conn);
 #if defined(USE_WEBSOCKET)
     } else if (is_websocket_request(conn)) {
         handle_websocket_request(conn, path, is_script_resource);
@@ -5630,7 +5865,7 @@ static void handle_file_based_request(struct mg_connection *conn, const char *pa
                             (int)strlen(conn->ctx->config[LUA_SERVER_PAGE_EXTENSIONS]),
                             path) > 0) {
         /* Lua server page: an SSI like page containing mostly plain html code plus some tags with server generated contents. */
-        handle_lsp_request(conn, path, &file, NULL);
+        handle_lsp_request(conn, path, file, NULL);
     } else if (match_prefix(conn->ctx->config[LUA_SCRIPT_EXTENSIONS],
                             (int)strlen(conn->ctx->config[LUA_SCRIPT_EXTENSIONS]),
                             path) > 0) {
@@ -5663,9 +5898,9 @@ static void close_all_listening_sockets(struct mg_context *ctx)
         ctx->listening_sockets[i].sock = INVALID_SOCKET;
     }
     mg_free(ctx->listening_sockets);
-    ctx->listening_sockets=0;
+    ctx->listening_sockets = NULL;
     mg_free(ctx->listening_ports);
-    ctx->listening_ports=0;
+    ctx->listening_ports = NULL;
 }
 
 static int is_valid_port(unsigned int port)
@@ -5695,7 +5930,7 @@ static int parse_port_string(const struct vec *vec, struct socket *so)
         so->lsa.sin.sin_addr.s_addr = htonl((a << 24) | (b << 16) | (c << 8) | d);
         so->lsa.sin.sin_port = htons((uint16_t) port);
 #if defined(USE_IPV6)
-    } else if (sscanf(vec->ptr, "[%49[^]]]:%d%n", buf, &port, &len) == 2 &&
+    } else if (sscanf(vec->ptr, "[%49[^]]]:%u%n", buf, &port, &len) == 2 &&
                inet_pton(AF_INET6, buf, &so->lsa.sin6.sin6_addr)) {
         /* IPv6 address, e.g. [3ffe:2a00:100:7031::1]:8080 */
         so->lsa.sin6.sin6_family = AF_INET6;
@@ -5776,6 +6011,7 @@ static int set_ports_option(struct mg_context *ctx)
                           sizeof(ctx->listening_ports[0]))) == NULL) {
             closesocket(so.sock);
             so.sock = INVALID_SOCKET;
+            mg_free(ptr);
             success = 0;
         }
         else {
@@ -5795,15 +6031,14 @@ static int set_ports_option(struct mg_context *ctx)
     return success;
 }
 
-static void log_header(const struct mg_connection *conn, const char *header,
-                       FILE *fp)
+static const char* header_val(const struct mg_connection *conn, const char *header)
 {
     const char *header_value;
 
     if ((header_value = mg_get_header(conn, header)) == NULL) {
-        (void) fprintf(fp, "%s", " -");
+        return "-";
     } else {
-        (void) fprintf(fp, " \"%s\"", header_value);
+        return header_value;
     }
 }
 
@@ -5814,10 +6049,15 @@ static void log_access(const struct mg_connection *conn)
     char date[64], src_addr[IP_ADDR_STR_LEN];
     struct tm *tm;
 
+    const char *referer;
+    const char *user_agent;
+
+    char buf[4096];
+
     fp = conn->ctx->config[ACCESS_LOG_FILE] == NULL ?  NULL :
          fopen(conn->ctx->config[ACCESS_LOG_FILE], "a+");
 
-    if (fp == NULL)
+    if (fp == NULL && conn->ctx->callbacks.log_message == NULL)
         return;
 
     tm = localtime(&conn->birth_time);
@@ -5829,21 +6069,30 @@ static void log_access(const struct mg_connection *conn)
     }
 
     ri = &conn->request_info;
-    flockfile(fp);
 
     sockaddr_to_string(src_addr, sizeof(src_addr), &conn->client.rsa);
-    fprintf(fp, "%s - %s [%s] \"%s %s HTTP/%s\" %d %" INT64_FMT,
+    referer = header_val(conn, "Referer");
+    user_agent = header_val(conn, "User-Agent");
+
+    snprintf(buf, sizeof(buf), "%s - %s [%s] \"%s %s HTTP/%s\" %d %" INT64_FMT " %s %s",
             src_addr, ri->remote_user == NULL ? "-" : ri->remote_user, date,
             ri->request_method ? ri->request_method : "-",
             ri->uri ? ri->uri : "-", ri->http_version,
-            conn->status_code, conn->num_bytes_sent);
-    log_header(conn, "Referer", fp);
-    log_header(conn, "User-Agent", fp);
-    fputc('\n', fp);
-    fflush(fp);
+            conn->status_code, conn->num_bytes_sent,
+	    referer, user_agent);
 
-    funlockfile(fp);
-    fclose(fp);
+    if (conn->ctx->callbacks.log_access) {
+        conn->ctx->callbacks.log_access(conn, buf);
+    }
+
+    if (fp) {
+        flockfile(fp);
+        fprintf(fp, "%s", buf);
+        fputc('\n', fp);
+        fflush(fp);
+        funlockfile(fp);
+        fclose(fp);
+    }
 }
 
 /* Verify given socket address against the ACL.
@@ -6072,6 +6321,7 @@ static void reset_per_request_attributes(struct mg_connection *conn)
     conn->num_bytes_sent = conn->consumed_content = 0;
     conn->status_code = -1;
     conn->must_close = conn->request_len = conn->throttle = 0;
+    conn->is_chunked = 0;
 }
 
 static void close_socket_gracefully(struct mg_connection *conn)
@@ -6116,7 +6366,8 @@ static void close_connection(struct mg_connection *conn)
 {
 #if defined(USE_LUA) && defined(USE_WEBSOCKET)
     if (conn->lua_websocket_state) {
-        lua_websocket_close(conn);
+        lua_websocket_close(conn, conn->lua_websocket_state);
+        conn->lua_websocket_state = NULL;
     }
 #endif
 
@@ -6124,7 +6375,7 @@ static void close_connection(struct mg_connection *conn)
     if (conn->ctx->callbacks.connection_close != NULL)
         conn->ctx->callbacks.connection_close(conn);
 
-    mg_lock(conn);
+    mg_lock_connection(conn);
 
     conn->must_close = 1;
 
@@ -6141,7 +6392,7 @@ static void close_connection(struct mg_connection *conn)
         conn->client.sock = INVALID_SOCKET;
     }
 
-    mg_unlock(conn);
+    mg_unlock_connection(conn);
 }
 
 void mg_close_connection(struct mg_connection *conn)
@@ -6156,7 +6407,7 @@ void mg_close_connection(struct mg_connection *conn)
     mg_free(conn);
 }
 
-struct mg_connection *mg_connect(const char *host, int port, int use_ssl,
+static struct mg_connection *mg_connect(const char *host, int port, int use_ssl,
                                  char *ebuf, size_t ebuf_len)
 {
     static struct mg_context fake_ctx;
@@ -6169,13 +6420,11 @@ struct mg_connection *mg_connect(const char *host, int port, int use_ssl,
                        mg_calloc(1, sizeof(*conn) + MAX_REQUEST_SIZE)) == NULL) {
         snprintf(ebuf, ebuf_len, "calloc(): %s", strerror(ERRNO));
         closesocket(sock);
-        sock = INVALID_SOCKET;
 #ifndef NO_SSL
     } else if (use_ssl && (conn->client_ssl_ctx =
                                SSL_CTX_new(SSLv23_client_method())) == NULL) {
         snprintf(ebuf, ebuf_len, "SSL_CTX_new error");
         closesocket(sock);
-        sock = INVALID_SOCKET;
         mg_free(conn);
         conn = NULL;
 #endif /* NO_SSL */
@@ -6213,11 +6462,12 @@ static int is_valid_uri(const char *uri)
     return uri[0] == '/' || (uri[0] == '*' && uri[1] == '\0');
 }
 
-static int getreq(struct mg_connection *conn, char *ebuf, size_t ebuf_len)
+static int getreq(struct mg_connection *conn, char *ebuf, size_t ebuf_len, int *err)
 {
     const char *cl;
 
     ebuf[0] = '\0';
+    *err = 0;
     reset_per_request_attributes(conn);
     conn->request_len = read_request(NULL, conn, conn->buf, conn->buf_size,
                                      &conn->data_len);
@@ -6225,16 +6475,30 @@ static int getreq(struct mg_connection *conn, char *ebuf, size_t ebuf_len)
 
     if (conn->request_len == 0 && conn->data_len == conn->buf_size) {
         snprintf(ebuf, ebuf_len, "%s", "Request Too Large");
+	*err = 400;
+	return 0;
     } else if (conn->request_len <= 0) {
         snprintf(ebuf, ebuf_len, "%s", "Client closed connection");
+	return 0;
     } else if (parse_http_message(conn->buf, conn->buf_size,
                                   &conn->request_info) <= 0) {
         snprintf(ebuf, ebuf_len, "Bad request: [%.*s]", conn->data_len, conn->buf);
+	*err = 400;
+	return 0;
     } else {
         /* Message is a valid request or response */
-        if ((cl = get_header(&conn->request_info, "Content-Length")) != NULL) {
+        if (( cl = get_header(&conn->request_info, "Transfer-encoding")) != NULL && strcmp(cl,"chunked") == 0) {
+            conn->is_chunked = 1;
+            conn->content_len = 0;
+	} else if ((cl = get_header(&conn->request_info, "Content-Length")) != NULL) {
             /* Request/response has content length set */
-            conn->content_len = strtoll(cl, NULL, 10);
+	    char *endptr;
+            conn->content_len = strtoll(cl, &endptr, 10);
+	    if (endptr == cl) {
+                snprintf(ebuf, ebuf_len, "%s", "Bad Request");
+		*err = 400;
+	        return 0;
+	    }
         } else if (!mg_strcasecmp(conn->request_info.request_method, "POST") ||
                    !mg_strcasecmp(conn->request_info.request_method, "PUT")) {
             /* POST or PUT request without content length set */
@@ -6248,7 +6512,7 @@ static int getreq(struct mg_connection *conn, char *ebuf, size_t ebuf_len)
         }
         conn->birth_time = time(NULL);
     }
-    return ebuf[0] == '\0';
+    return 1;
 }
 
 struct mg_connection *mg_download(const char *host, int port, int use_ssl,
@@ -6264,7 +6528,8 @@ struct mg_connection *mg_download(const char *host, int port, int use_ssl,
     } else if (mg_vprintf(conn, fmt, ap) <= 0) {
         snprintf(ebuf, ebuf_len, "%s", "Error sending request");
     } else {
-        getreq(conn, ebuf, ebuf_len);
+	int err;
+        getreq(conn, ebuf, ebuf_len, &err);
     }
     if (ebuf[0] != '\0' && conn != NULL) {
         mg_close_connection(conn);
@@ -6282,14 +6547,16 @@ static void process_new_connection(struct mg_connection *conn)
     char ebuf[100];
 
     keep_alive_enabled = !strcmp(conn->ctx->config[ENABLE_KEEP_ALIVE], "yes");
-    keep_alive = 0;
 
     /* Important: on new connection, reset the receiving buffer. Credit goes
        to crule42. */
     conn->data_len = 0;
     do {
-        if (!getreq(conn, ebuf, sizeof(ebuf))) {
-            send_http_error(conn, 500, "Server Error", "%s", ebuf);
+	int err;
+        if (!getreq(conn, ebuf, sizeof(ebuf), &err)) {
+            if (err > 0) {
+              send_http_error(conn, err, "Bad Request", "%s", ebuf);
+	    }
             conn->must_close = 1;
         } else if (!is_valid_uri(conn->request_info.uri)) {
             snprintf(ebuf, sizeof(ebuf), "Invalid URI: [%s]", ri->uri);
@@ -6336,12 +6603,12 @@ static void process_new_connection(struct mg_connection *conn)
 /* Worker threads take accepted socket from the queue */
 static int consume_socket(struct mg_context *ctx, struct socket *sp)
 {
-    (void) pthread_mutex_lock(&ctx->mutex);
-    DEBUG_TRACE(("going idle"));
+    (void) pthread_mutex_lock(&ctx->thread_mutex);
+    DEBUG_TRACE("going idle");
 
     /* If the queue is empty, wait. We're idle at this point. */
     while (ctx->sq_head == ctx->sq_tail && ctx->stop_flag == 0) {
-        pthread_cond_wait(&ctx->sq_full, &ctx->mutex);
+        pthread_cond_wait(&ctx->sq_full, &ctx->thread_mutex);
     }
 
     /* If we're stopping, sq_head may be equal to sq_tail. */
@@ -6349,7 +6616,7 @@ static int consume_socket(struct mg_context *ctx, struct socket *sp)
         /* Copy socket from the queue and increment tail */
         *sp = ctx->queue[ctx->sq_tail % ARRAY_SIZE(ctx->queue)];
         ctx->sq_tail++;
-        DEBUG_TRACE(("grabbed socket %d, going busy", sp->sock));
+        DEBUG_TRACE("grabbed socket %d, going busy", sp->sock);
 
         /* Wrap pointers if needed */
         while (ctx->sq_tail > (int) ARRAY_SIZE(ctx->queue)) {
@@ -6359,7 +6626,7 @@ static int consume_socket(struct mg_context *ctx, struct socket *sp)
     }
 
     (void) pthread_cond_signal(&ctx->sq_empty);
-    (void) pthread_mutex_unlock(&ctx->mutex);
+    (void) pthread_mutex_unlock(&ctx->thread_mutex);
 
     return !ctx->stop_flag;
 }
@@ -6417,19 +6684,19 @@ static void *worker_thread_run(void *thread_func_param)
     }
 
     /* Signal master that we're done with connection and exiting */
-    (void) pthread_mutex_lock(&ctx->mutex);
+    (void) pthread_mutex_lock(&ctx->thread_mutex);
     ctx->num_threads--;
-    (void) pthread_cond_signal(&ctx->cond);
+    (void) pthread_cond_signal(&ctx->thread_cond);
     assert(ctx->num_threads >= 0);
-    (void) pthread_mutex_unlock(&ctx->mutex);
+    (void) pthread_mutex_unlock(&ctx->thread_mutex);
 
-    pthread_setspecific(sTlsKey, 0);
+    pthread_setspecific(sTlsKey, NULL);
 #if defined(_WIN32) && !defined(__SYMBIAN32__)
     CloseHandle(tls.pthread_cond_helper_mutex);
 #endif
     mg_free(conn);
 
-    DEBUG_TRACE(("exiting"));
+    DEBUG_TRACE("exiting");
     return NULL;
 }
 
@@ -6452,23 +6719,23 @@ static void *worker_thread(void *thread_func_param)
 /* Master thread adds accepted socket to a queue */
 static void produce_socket(struct mg_context *ctx, const struct socket *sp)
 {
-    (void) pthread_mutex_lock(&ctx->mutex);
+    (void) pthread_mutex_lock(&ctx->thread_mutex);
 
     /* If the queue is full, wait */
     while (ctx->stop_flag == 0 &&
            ctx->sq_head - ctx->sq_tail >= (int) ARRAY_SIZE(ctx->queue)) {
-        (void) pthread_cond_wait(&ctx->sq_empty, &ctx->mutex);
+        (void) pthread_cond_wait(&ctx->sq_empty, &ctx->thread_mutex);
     }
 
     if (ctx->sq_head - ctx->sq_tail < (int) ARRAY_SIZE(ctx->queue)) {
         /* Copy socket to the queue and increment head */
         ctx->queue[ctx->sq_head % ARRAY_SIZE(ctx->queue)] = *sp;
         ctx->sq_head++;
-        DEBUG_TRACE(("queued socket %d", sp->sock));
+        DEBUG_TRACE("queued socket %d", sp->sock);
     }
 
     (void) pthread_cond_signal(&ctx->sq_full);
-    (void) pthread_mutex_unlock(&ctx->mutex);
+    (void) pthread_mutex_unlock(&ctx->thread_mutex);
 }
 
 static int set_sock_timeout(SOCKET sock, int milliseconds)
@@ -6500,7 +6767,7 @@ static void accept_new_connection(const struct socket *listener,
         so.sock = INVALID_SOCKET;
     } else {
         /* Put so socket structure into the queue */
-        DEBUG_TRACE(("Accepted socket %d", (int) so.sock));
+        DEBUG_TRACE("Accepted socket %d", (int) so.sock);
         set_close_on_exec(so.sock, fc(ctx));
         so.is_ssl = listener->is_ssl;
         so.ssl_redir = listener->ssl_redir;
@@ -6581,7 +6848,7 @@ static void master_thread_run(void *thread_func_param)
         }
     }
     mg_free(pfd);
-    DEBUG_TRACE(("stopping workers"));
+    DEBUG_TRACE("stopping workers");
 
     /* Stop signal received: somebody called mg_stop. Quit. */
     close_all_listening_sockets(ctx);
@@ -6590,11 +6857,11 @@ static void master_thread_run(void *thread_func_param)
     pthread_cond_broadcast(&ctx->sq_full);
 
     /* Wait until all threads finish */
-    (void) pthread_mutex_lock(&ctx->mutex);
+    (void) pthread_mutex_lock(&ctx->thread_mutex);
     while (ctx->num_threads > 0) {
-        (void) pthread_cond_wait(&ctx->cond, &ctx->mutex);
+        (void) pthread_cond_wait(&ctx->thread_cond, &ctx->thread_mutex);
     }
-    (void) pthread_mutex_unlock(&ctx->mutex);
+    (void) pthread_mutex_unlock(&ctx->thread_mutex);
 
     /* Join all worker threads to avoid leaking threads. */
     workerthreadcount = ctx->workerthreadcount;
@@ -6605,12 +6872,12 @@ static void master_thread_run(void *thread_func_param)
 #if !defined(NO_SSL)
     uninitialize_ssl(ctx);
 #endif
-    DEBUG_TRACE(("exiting"));
+    DEBUG_TRACE("exiting");
 
 #if defined(_WIN32) && !defined(__SYMBIAN32__)
     CloseHandle(tls.pthread_cond_helper_mutex);
 #endif
-    pthread_setspecific(sTlsKey, 0);
+    pthread_setspecific(sTlsKey, NULL);
 
     /* Signal mg_stop() that we're done.
        WARNING: This must be the very last thing this
@@ -6619,7 +6886,6 @@ static void master_thread_run(void *thread_func_param)
 }
 
 /* Threads have different return types on Windows and Unix. */
-
 #ifdef _WIN32
 static unsigned __stdcall master_thread(void *thread_func_param)
 {
@@ -6642,12 +6908,19 @@ static void free_context(struct mg_context *ctx)
     if (ctx == NULL)
         return;
 
-    /* All threads exited, no sync is needed. Destroy mutex and condvars */
-    (void) pthread_mutex_destroy(&ctx->mutex);
-    (void) pthread_cond_destroy(&ctx->cond);
+    /* All threads exited, no sync is needed. Destroy thread mutex and condvars */
+    (void) pthread_mutex_destroy(&ctx->thread_mutex);
+    (void) pthread_cond_destroy(&ctx->thread_cond);
     (void) pthread_cond_destroy(&ctx->sq_empty);
     (void) pthread_cond_destroy(&ctx->sq_full);
 
+    /* Destroy other context global data structures mutex */
+    (void) pthread_mutex_destroy(&ctx->nonce_mutex);
+
+#if defined(USE_TIMERS)
+    timers_exit(ctx);
+#endif
+
     /* Deallocate config parameters */
     for (i = 0; i < NUM_OPTIONS; i++) {
         if (ctx->config[i] != NULL)
@@ -6710,20 +6983,20 @@ void mg_stop(struct mg_context *ctx)
 #endif /* _WIN32 && !__SYMBIAN32__ */
 }
 
-void get_system_name(char **sysName)
+static void get_system_name(char **sysName)
 {
 #if defined(_WIN32)
 #if !defined(__SYMBIAN32__)
     char name[128];
-    DWORD dwVersion = 0;
-    DWORD dwMajorVersion = 0;
-    DWORD dwMinorVersion = 0;
+    DWORD dwVersion = 0;
+    DWORD dwMajorVersion = 0;
+    DWORD dwMinorVersion = 0;
     DWORD dwBuild = 0;
 
-    dwVersion = GetVersion();
-
-    dwMajorVersion = (DWORD)(LOBYTE(LOWORD(dwVersion)));
-    dwMinorVersion = (DWORD)(HIBYTE(LOWORD(dwVersion)));
+    dwVersion = GetVersion();
+
+    dwMajorVersion = (DWORD)(LOBYTE(LOWORD(dwVersion)));
+    dwMinorVersion = (DWORD)(HIBYTE(LOWORD(dwVersion)));
     dwBuild = ((dwVersion < 0x80000000) ? (DWORD)(HIWORD(dwVersion)) : 0);
 
     sprintf(name, "Windows %d.%d", dwMajorVersion, dwMinorVersion);
@@ -6745,7 +7018,7 @@ struct mg_context *mg_start(const struct mg_callbacks *callbacks,
 {
     struct mg_context *ctx;
     const char *name, *value, *default_value;
-    int i;
+    int i, ok;
     int workerthreadcount;
 
 #if defined(_WIN32) && !defined(__SYMBIAN32__)
@@ -6767,6 +7040,7 @@ struct mg_context *mg_start(const struct mg_callbacks *callbacks,
 
     if (sTlsInit==0) {
         if (0 != pthread_key_create(&sTlsKey, NULL)) {
+            /* Fatal error - abort start. However, this situation should never occur in practice. */
             mg_cry(fc(ctx), "Cannot initialize thread local storage");
             mg_free(ctx);
             return NULL;
@@ -6774,11 +7048,23 @@ struct mg_context *mg_start(const struct mg_callbacks *callbacks,
         sTlsInit++;
     }
 
+    ok =  0==pthread_mutex_init(&ctx->thread_mutex, NULL);
+    ok &= 0==pthread_cond_init(&ctx->thread_cond, NULL);
+    ok &= 0==pthread_cond_init(&ctx->sq_empty, NULL);
+    ok &= 0==pthread_cond_init(&ctx->sq_full, NULL);
+    ok &= 0==pthread_mutex_init(&ctx->nonce_mutex, NULL);
+    if (!ok) {
+        /* Fatal error - abort start. However, this situation should never occur in practice. */
+        mg_cry(fc(ctx), "Cannot initialize thread synchronization objects");
+        mg_free(ctx);
+        return NULL;
+    }
+
     if (callbacks) {
         ctx->callbacks = *callbacks;
     }
     ctx->user_data = user_data;
-    ctx->request_handlers = 0;
+    ctx->request_handlers = NULL;
 
 #if defined(USE_LUA) && defined(USE_WEBSOCKET)
     ctx->shared_lua_websockets = 0;
@@ -6799,7 +7085,7 @@ struct mg_context *mg_start(const struct mg_callbacks *callbacks,
             mg_free(ctx->config[i]);
         }
         ctx->config[i] = mg_strdup(value);
-        DEBUG_TRACE(("[%s] -> [%s]", name, value));
+        DEBUG_TRACE("[%s] -> [%s]", name, value);
     }
 
     /* Set default value if needed */
@@ -6833,11 +7119,6 @@ struct mg_context *mg_start(const struct mg_callbacks *callbacks,
     (void) signal(SIGPIPE, SIG_IGN);
 #endif /* !_WIN32 && !__SYMBIAN32__ */
 
-    (void) pthread_mutex_init(&ctx->mutex, NULL);
-    (void) pthread_cond_init(&ctx->cond, NULL);
-    (void) pthread_cond_init(&ctx->sq_empty, NULL);
-    (void) pthread_cond_init(&ctx->sq_full, NULL);
-
     workerthreadcount = atoi(ctx->config[NUM_THREADS]);
 
     if (workerthreadcount > MAX_WORKER_THREADS) {
@@ -6848,7 +7129,7 @@ struct mg_context *mg_start(const struct mg_callbacks *callbacks,
 
     if (workerthreadcount > 0) {
         ctx->workerthreadcount = workerthreadcount;
-        ctx->workerthreadids = mg_calloc(workerthreadcount, sizeof(pthread_t));
+        ctx->workerthreadids = (pthread_t *)mg_calloc(workerthreadcount, sizeof(pthread_t));
         if (ctx->workerthreadids == NULL) {
             mg_cry(fc(ctx), "Not enough memory for worker thread ID array");
             free_context(ctx);
@@ -6856,19 +7137,27 @@ struct mg_context *mg_start(const struct mg_callbacks *callbacks,
         }
     }
 
+#if defined(USE_TIMERS)
+    if (timers_init(ctx) != 0) {
+        mg_cry(fc(ctx), "Error creating timers");
+        free_context(ctx);
+        return NULL;
+    }
+#endif
+
     /* Start master (listening) thread */
     mg_start_thread_with_id(master_thread, ctx, &ctx->masterthreadid);
 
     /* Start worker threads */
     for (i = 0; i < workerthreadcount; i++) {
-        (void) pthread_mutex_lock(&ctx->mutex);
+        (void) pthread_mutex_lock(&ctx->thread_mutex);
         ctx->num_threads++;
-        (void) pthread_mutex_unlock(&ctx->mutex);
+        (void) pthread_mutex_unlock(&ctx->thread_mutex);
         if (mg_start_thread_with_id(worker_thread, ctx,
                                     &ctx->workerthreadids[i]) != 0) {
-            (void) pthread_mutex_lock(&ctx->mutex);
+            (void) pthread_mutex_lock(&ctx->thread_mutex);
             ctx->num_threads--;
-            (void) pthread_mutex_unlock(&ctx->mutex);
+            (void) pthread_mutex_unlock(&ctx->thread_mutex);
             mg_cry(fc(ctx), "Cannot start worker thread: %ld", (long) ERRNO);
         }
     }
diff --git a/src/client/Client.cc b/src/client/Client.cc
index 47d1c1d..6f57225 100644
--- a/src/client/Client.cc
+++ b/src/client/Client.cc
@@ -53,7 +53,7 @@ using namespace std;
 
 #include "mon/MonClient.h"
 
-#include "mds/MDSMap.h"
+#include "mds/flock.h"
 #include "osd/OSDMap.h"
 #include "mon/MonMap.h"
 
@@ -149,6 +149,7 @@ Client::Client(Messenger *m, MonClient *mc)
     logger(NULL),
     m_command_hook(this),
     timer(m->cct, client_lock),
+    switch_interrupt_cb(NULL),
     ino_invalidate_cb(NULL),
     ino_invalidate_cb_handle(NULL),
     dentry_invalidate_cb(NULL),
@@ -157,6 +158,7 @@ Client::Client(Messenger *m, MonClient *mc)
     getgroups_cb_handle(NULL),
     async_ino_invalidator(m->cct),
     async_dentry_invalidator(m->cct),
+    interrupt_finisher(m->cct),
     tick_event(NULL),
     monclient(mc), messenger(m), whoami(m->get_myname().num()),
     initialized(false), mounted(false), unmounting(false),
@@ -443,6 +445,12 @@ void Client::shutdown()
     async_dentry_invalidator.stop();
   }
 
+  if (switch_interrupt_cb) {
+    ldout(cct, 10) << "shutdown stopping interrupt finisher" << dendl;
+    interrupt_finisher.wait_for_empty();
+    interrupt_finisher.stop();
+  }
+
   objectcacher->stop();  // outside of client_lock! this does a join.
 
   client_lock.Lock();
@@ -1437,15 +1445,15 @@ int Client::make_request(MetaRequest *request,
 
 void Client::put_request(MetaRequest *request)
 {
-  if (request->get_num_ref() == 1) {
+  if (request->_put()) {
     if (request->inode())
       put_inode(request->take_inode());
     if (request->old_inode())
       put_inode(request->take_old_inode());
     if (request->other_inode())
       put_inode(request->take_other_inode());
+    delete request;
   }
-  request->_put();
 }
 
 int Client::encode_inode_release(Inode *in, MetaRequest *req,
@@ -2034,6 +2042,9 @@ void Client::send_reconnect(MetaSession *session)
       in->make_long_path(path);
       ldout(cct, 10) << "    path " << path << dendl;
 
+      bufferlist flockbl;
+      _encode_filelocks(in, flockbl);
+
       in->caps[mds]->seq = 0;  // reset seq.
       in->caps[mds]->issue_seq = 0;  // reset seq.
       in->caps[mds]->mseq = 0;  // reset seq.
@@ -2042,7 +2053,8 @@ void Client::send_reconnect(MetaSession *session)
 		 path.get_ino(), path.get_path(),   // ino
 		 in->caps_wanted(), // wanted
 		 in->caps[mds]->issued,     // issued
-		 in->snaprealm->ino);
+		 in->snaprealm->ino,
+		 flockbl);
 
       if (did_snaprealm.count(in->snaprealm->ino) == 0) {
 	ldout(cct, 10) << " snaprealm " << *in->snaprealm << dendl;
@@ -2182,6 +2194,8 @@ void Client::put_inode(Inode *in, int n)
     in->snaprealm_item.remove_myself();
     if (in == root)
       root = 0;
+    delete in->fcntl_locks;
+    delete in->flock_locks;
     delete in;
   }
 }
@@ -3659,9 +3673,10 @@ void Client::handle_cap_flush_ack(MetaSession *session, Inode *in, Cap *cap, MCl
   int mds = session->mds_num;
   int dirty = m->get_dirty();
   int cleaned = 0;
+  uint16_t flush_ack_tid = static_cast<uint16_t>(m->get_client_tid());
   for (int i = 0; i < CEPH_CAP_BITS; ++i) {
     if ((dirty & (1 << i)) &&
-	(m->get_client_tid() == in->flushing_cap_tid[i]))
+	(flush_ack_tid == in->flushing_cap_tid[i]))
       cleaned |= 1 << i;
   }
 
@@ -5869,6 +5884,8 @@ int Client::_release_fh(Fh *f)
     in->snap_cap_refs--;
   }
 
+  _release_filelocks(f);
+
   put_inode(in);
   delete f;
 
@@ -6857,6 +6874,290 @@ int Client::statfs(const char *path, struct statvfs *stbuf)
   return rval;
 }
 
+int Client::_do_filelock(Inode *in, Fh *fh, int lock_type, int op, int sleep,
+			 struct flock *fl, uint64_t owner, void *fuse_req)
+{  // Send one file-lock MetaRequest (op) for inode `in`; on success copy the answer into `fl` (get) or update the cached lock state (set). Returns make_request()'s result, or -EIO for an unknown fl->l_type.
+  ldout(cct, 10) << "_do_filelock ino " << in->ino
+		 << (lock_type == CEPH_LOCK_FCNTL ? " fcntl" : " flock")
+		 << " type " << fl->l_type << " owner " << owner
+		 << " " << fl->l_start << "~" << fl->l_len << dendl;
+
+  int lock_cmd;  // CEPH_LOCK_* command derived from fl->l_type
+  if (F_RDLCK == fl->l_type)
+    lock_cmd = CEPH_LOCK_SHARED;
+  else if (F_WRLCK == fl->l_type)
+    lock_cmd = CEPH_LOCK_EXCL;
+  else if (F_UNLCK == fl->l_type)
+    lock_cmd = CEPH_LOCK_UNLOCK;
+  else
+    return -EIO;  // rejected before any request object is allocated
+
+  if (op != CEPH_MDS_OP_SETFILELOCK || lock_cmd == CEPH_LOCK_UNLOCK)
+    sleep = 0;  // only a set-lock that acquires something keeps the caller's wait flag
+
+  /*
+   * Set the most significant bit, so that MDS knows the 'owner'
+   * is sufficient to identify the owner of lock. (old code uses
+   * both 'owner' and 'pid')
+   */
+  owner |= (1ULL << 63);
+
+  MetaRequest *req = new MetaRequest(op);
+  filepath path;
+  in->make_nosnap_relative_path(path);
+  req->set_filepath(path);
+  req->set_inode(in);
+
+  req->head.args.filelock_change.rule = lock_type;
+  req->head.args.filelock_change.type = lock_cmd;
+  req->head.args.filelock_change.owner = owner;
+  req->head.args.filelock_change.pid = fl->l_pid;
+  req->head.args.filelock_change.start = fl->l_start;
+  req->head.args.filelock_change.length = fl->l_len;
+  req->head.args.filelock_change.wait = sleep;
+
+  int ret;
+  bufferlist bl;  // reply payload; decoded below for CEPH_MDS_OP_GETFILELOCK
+
+  if (sleep && switch_interrupt_cb && fuse_req) {
+    // enable interrupt
+    switch_interrupt_cb(fuse_req, req->get());  // req->get() takes an extra reference that is handed to the callback as its data
+
+    ret = make_request(req, -1, -1, NULL, NULL, -1, &bl);
+
+    // disable interrupt
+    switch_interrupt_cb(fuse_req, NULL);
+    put_request(req);  // drop the extra reference taken by req->get() above
+  } else {
+    ret = make_request(req, -1, -1, NULL, NULL, -1, &bl);
+  }
+
+  if (ret == 0) {
+    if (op == CEPH_MDS_OP_GETFILELOCK) {
+      ceph_filelock filelock;
+      bufferlist::iterator p = bl.begin();
+      ::decode(filelock, p);
+
+      if (CEPH_LOCK_SHARED == filelock.type)
+	fl->l_type = F_RDLCK;
+      else if (CEPH_LOCK_EXCL == filelock.type)
+	fl->l_type = F_WRLCK;
+      else
+	fl->l_type = F_UNLCK;  // any other decoded type is reported to the caller as F_UNLCK
+
+      fl->l_whence = SEEK_SET;
+      fl->l_start = filelock.start;
+      fl->l_len = filelock.length;
+      fl->l_pid = filelock.pid;
+    } else if (op == CEPH_MDS_OP_SETFILELOCK) {
+      ceph_lock_state_t *lock_state;  // first the inode-level state, then (if fh is set) the per-handle state
+      if (lock_type == CEPH_LOCK_FCNTL) {
+	if (!in->fcntl_locks)
+	  in->fcntl_locks = new ceph_lock_state_t(cct);  // allocated on first use
+	lock_state = in->fcntl_locks;
+      } else if (lock_type == CEPH_LOCK_FLOCK) {
+	if (!in->flock_locks)
+	  in->flock_locks = new ceph_lock_state_t(cct);  // allocated on first use
+	lock_state = in->flock_locks;
+      } else
+	assert(0);  // lock_type is expected to be CEPH_LOCK_FCNTL or CEPH_LOCK_FLOCK
+      _update_lock_state(fl, owner, lock_state);
+
+      if (fh) {  // callers may pass fh == NULL to update only the inode-level state
+	if (lock_type == CEPH_LOCK_FCNTL) {
+	  if (!fh->fcntl_locks)
+	    fh->fcntl_locks = new ceph_lock_state_t(cct);
+	  lock_state = fh->fcntl_locks;
+	} else {
+	  if (!fh->flock_locks)
+	    fh->flock_locks = new ceph_lock_state_t(cct);
+	  lock_state = fh->flock_locks;
+	}
+	_update_lock_state(fl, owner, lock_state);
+      }
+    } else
+      assert(0);  // only the get and set file-lock ops are handled here
+  }
+  return ret;
+}
+
+int Client::_interrupt_filelock(MetaRequest *req)
+{  // Send an unlock request that uses the *_INTR rule matching req's file-lock rule; returns make_request()'s result.
+  Inode *in = req->inode();
+
+  int lock_type;  // interrupt variant of the rule carried by req
+  if (req->head.args.filelock_change.rule == CEPH_LOCK_FLOCK)
+    lock_type = CEPH_LOCK_FLOCK_INTR;
+  else if (req->head.args.filelock_change.rule == CEPH_LOCK_FCNTL)
+    lock_type = CEPH_LOCK_FCNTL_INTR;
+  else
+    assert(0);  // req is expected to carry a FLOCK or FCNTL rule
+
+  MetaRequest *intr_req = new MetaRequest(CEPH_MDS_OP_SETFILELOCK);
+  filepath path;
+  in->make_nosnap_relative_path(path);
+  intr_req->set_filepath(path);
+  intr_req->set_inode(in);
+  intr_req->head.args.filelock_change = req->head.args.filelock_change;  // copy all fields of the original request, then override rule and type
+  intr_req->head.args.filelock_change.rule = lock_type;
+  intr_req->head.args.filelock_change.type = CEPH_LOCK_UNLOCK;
+
+  return make_request(intr_req, -1, -1, NULL, NULL, -1);
+}
+
+void Client::_encode_filelocks(Inode *in, bufferlist& bl)
+{  // Append the inode's held fcntl locks and then its held flock locks to bl, each list preceded by its count.
+  if (!in->fcntl_locks && !in->flock_locks)
+    return;  // no lock state on this inode: bl is left untouched
+
+  unsigned nr_fcntl_locks = in->fcntl_locks ? in->fcntl_locks->held_locks.size() : 0;
+  ::encode(nr_fcntl_locks, bl);  // the count is written even when it is 0
+  if (nr_fcntl_locks) {
+    ceph_lock_state_t* lock_state = in->fcntl_locks;
+    for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
+	p != lock_state->held_locks.end();
+	++p)
+      ::encode(p->second, bl);
+  }
+
+  unsigned nr_flock_locks = in->flock_locks ? in->flock_locks->held_locks.size() : 0;
+  ::encode(nr_flock_locks, bl);  // the count is written even when it is 0
+  if (nr_flock_locks) {
+    ceph_lock_state_t* lock_state = in->flock_locks;
+    for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
+	p != lock_state->held_locks.end();
+	++p)
+      ::encode(p->second, bl);
+  }
+
+  ldout(cct, 10) << "_encode_filelocks ino " << in->ino << ", " << nr_fcntl_locks
+		 << " fcntl locks, " << nr_flock_locks << " flock locks" <<  dendl;
+}
+
+void Client::_release_filelocks(Fh *fh)
+{  // Delete fh's per-handle lock state and send an unlock request for every lock that state recorded as held.
+  if (!fh->fcntl_locks && !fh->flock_locks)
+    return;  // this handle never recorded a lock
+
+  Inode *in = fh->inode;
+  ldout(cct, 10) << "_release_filelocks " << fh << " ino " << in->ino << dendl;
+
+  list<pair<int, ceph_filelock> > to_release;  // (lock rule, lock) pairs copied out before the state objects are deleted
+
+  if (fh->fcntl_locks) {
+    ceph_lock_state_t* lock_state = fh->fcntl_locks;
+    for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
+	p != lock_state->held_locks.end();
+	++p)
+      to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FCNTL, p->second));
+    delete fh->fcntl_locks;  // NOTE(review): pointer is not reset to NULL here; confirm no caller reads it afterwards
+  }
+  if (fh->flock_locks) {
+    ceph_lock_state_t* lock_state = fh->flock_locks;
+    for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
+	p != lock_state->held_locks.end();
+	++p)
+      to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FLOCK, p->second));
+    delete fh->flock_locks;  // NOTE(review): pointer is not reset to NULL here; confirm no caller reads it afterwards
+  }
+
+  if (to_release.empty())
+    return;
+
+  struct flock fl;  // reused for every unlock request; only start/len/pid change per lock
+  memset(&fl, 0, sizeof(fl));
+  fl.l_whence = SEEK_SET;
+  fl.l_type = F_UNLCK;
+
+  for (list<pair<int, ceph_filelock> >::iterator p = to_release.begin();
+       p != to_release.end();
+       ++p) {
+    fl.l_start = p->second.start;
+    fl.l_len = p->second.length;
+    fl.l_pid = p->second.pid;
+    _do_filelock(in, NULL, p->first, CEPH_MDS_OP_SETFILELOCK, 0, &fl, p->second.owner);  // fh is passed as NULL and the return value is ignored
+  }
+}
+
+void Client::_update_lock_state(struct flock *fl, uint64_t owner,
+				ceph_lock_state_t *lock_state)
+{  // Apply the lock or unlock described by fl and owner to lock_state.
+  int lock_cmd;  // CEPH_LOCK_* command derived from fl->l_type
+  if (F_RDLCK == fl->l_type)
+    lock_cmd = CEPH_LOCK_SHARED;
+  else if (F_WRLCK == fl->l_type)
+    lock_cmd = CEPH_LOCK_EXCL;
+  else
+    lock_cmd = CEPH_LOCK_UNLOCK;  // every other l_type is treated as an unlock
+
+  ceph_filelock filelock;
+  filelock.start = fl->l_start;
+  filelock.length = fl->l_len;
+  filelock.client = 0;
+  // see comment in _do_filelock()
+  filelock.owner = owner | (1ULL << 63);
+  filelock.pid = fl->l_pid;
+  filelock.type = lock_cmd;
+
+  if (filelock.type == CEPH_LOCK_UNLOCK) {
+    list<ceph_filelock> activated_locks;  // filled by remove_lock() and discarded here
+    lock_state->remove_lock(filelock, activated_locks);
+  } else {
+    bool r = lock_state->add_lock(filelock, false, false);
+    assert(r);  // add_lock() is expected to succeed here
+  }
+}
+
+int Client::_getlk(Fh *fh, struct flock *fl, uint64_t owner)
+{  // Query fcntl-style lock information for fh's inode; the result is written into fl by _do_filelock().
+  Inode *in = fh->inode;
+  ldout(cct, 10) << "_getlk " << fh << " ino " << in->ino << dendl;
+  int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_GETFILELOCK, 0, fl, owner);  // sleep is 0 and fuse_req keeps its default
+  return ret;
+}
+
+int Client::_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep, void *fuse_req)
+{  // Set or clear an fcntl-style lock on fh's inode; sleep and fuse_req are forwarded unchanged to _do_filelock().
+  Inode *in = fh->inode;
+  ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << dendl;
+  int ret =  _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_SETFILELOCK, sleep, fl, owner, fuse_req);
+  ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << " result=" << ret << dendl;
+  return ret;  // result of _do_filelock()
+}
+
+int Client::_flock(Fh *fh, int cmd, uint64_t owner, void *fuse_req)
+{  // Translate a flock() cmd (LOCK_SH / LOCK_EX / LOCK_UN, optionally with LOCK_NB) into a CEPH_LOCK_FLOCK set-lock request.
+  Inode *in = fh->inode;
+  ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << dendl;
+
+  int sleep = !(cmd & LOCK_NB);  // wait unless the caller asked for non-blocking
+  cmd &= ~LOCK_NB;  // strip the flag so the switch below sees only the operation
+
+  int type;
+  switch (cmd) {
+    case LOCK_SH:
+      type = F_RDLCK;
+      break;
+    case LOCK_EX:
+      type = F_WRLCK;
+      break;
+    case LOCK_UN:
+      type = F_UNLCK;
+      break;
+    default:
+      return -EINVAL;  // any other cmd value is rejected
+  }
+
+  struct flock fl;
+  memset(&fl, 0, sizeof(fl));  // l_start, l_len and l_pid stay 0
+  fl.l_type = type;
+  fl.l_whence = SEEK_SET;
+
+  int ret =  _do_filelock(in, fh, CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK, sleep, &fl, owner, fuse_req);
+  ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << " result=" << ret << dendl;
+  return ret;
+}
+
 int Client::ll_statfs(Inode *in, struct statvfs *stbuf)
 {
   /* Since the only thing this does is wrap a call to statfs, and
@@ -6887,6 +7188,16 @@ void Client::ll_register_dentry_invalidate_cb(client_dentry_callback_t cb, void
   async_dentry_invalidator.start();
 }
 
+void Client::ll_register_switch_interrupt_cb(client_switch_interrupt_callback_t cb)
+{  // Store cb as switch_interrupt_cb and start interrupt_finisher; runs under client_lock.
+  Mutex::Locker l(client_lock);
+  ldout(cct, 10) << "ll_register_switch_interrupt_cb cb " << (void*)cb << dendl;
+  if (cb == NULL)
+    return;  // a NULL callback is logged and otherwise ignored
+  switch_interrupt_cb = cb;
+  interrupt_finisher.start();  // NOTE(review): start() runs on every non-NULL registration; confirm this is called at most once
+}
+
 void Client::ll_register_getgroups_cb(client_getgroups_callback_t cb, void *handle)
 {
   Mutex::Locker l(client_lock);
@@ -8739,6 +9050,59 @@ int Client::ll_release(Fh *fh)
   return 0;
 }
 
+int Client::ll_getlk(Fh *fh, struct flock *fl, uint64_t owner)
+{  // Low-level entry point: takes client_lock, logs and traces the call, then returns _getlk()'s result.
+  Mutex::Locker lock(client_lock);
+
+  ldout(cct, 3) << "ll_getlk (fh)" << fh << " " << fh->inode->ino << dendl;
+  tout(cct) << "ll_getk (fh)" << (unsigned long)fh << std::endl;  // NOTE(review): trace tag is spelled ll_getk, not ll_getlk; confirm whether trace consumers depend on it before changing
+
+  return _getlk(fh, fl, owner);
+}
+
+int Client::ll_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep, void *fuse_req)
+{  // Low-level entry point: takes client_lock, logs and traces the call, then returns _setlk()'s result.
+  Mutex::Locker lock(client_lock);
+
+  ldout(cct, 3) << "ll_setlk  (fh) " << fh << " " << fh->inode->ino << dendl;
+  tout(cct) << "ll_setk (fh)" << (unsigned long)fh << std::endl;  // NOTE(review): trace tag is spelled ll_setk, not ll_setlk; confirm whether trace consumers depend on it before changing
+
+  return _setlk(fh, fl, owner, sleep, fuse_req);
+}
+
+int Client::ll_flock(Fh *fh, int cmd, uint64_t owner, void *fuse_req)
+{  // Low-level entry point: takes client_lock, logs and traces the call, then returns _flock()'s result.
+  Mutex::Locker lock(client_lock);
+
+  ldout(cct, 3) << "ll_flock  (fh) " << fh << " " << fh->inode->ino << dendl;
+  tout(cct) << "ll_flock (fh)" << (unsigned long)fh << std::endl;
+
+  return _flock(fh, cmd, owner, fuse_req);
+}
+
+class C_Client_RequestInterrupt : public Context  {  // Context that, when finished, asks the client to interrupt the file-lock request it holds.
+private:
+  Client *client;
+  MetaRequest *req;  // kept referenced from construction until finish() has run
+public:
+  C_Client_RequestInterrupt(Client *c, MetaRequest *r) : client(c), req(r) {
+    req->get();  // take a reference; released by put_request() in finish()
+  }
+  void finish(int r) {  // r is not used
+    Mutex::Locker l(client->client_lock);
+    assert(req->head.op == CEPH_MDS_OP_SETFILELOCK);  // only set-file-lock requests are expected here
+    client->_interrupt_filelock(req);  // return value is ignored
+    client->put_request(req);
+  }
+};
+
+void Client::ll_interrupt(void *d)
+{  // Queue an interrupt for the request d points to; the work itself runs later on interrupt_finisher.
+  MetaRequest *req = static_cast<MetaRequest*>(d);  // presumably the pointer passed to switch_interrupt_cb in _do_filelock(); confirm against the fuse glue
+  ldout(cct, 3) << "ll_interrupt tid " << req->get_tid() << dendl;
+  tout(cct) << "ll_interrupt tid " << req->get_tid() << std::endl;
+  interrupt_finisher.queue(new C_Client_RequestInterrupt(this, req));  // client_lock is not taken here; the queued context takes it in finish()
+}
 
 // =========================================
 // layout
diff --git a/src/client/Client.h b/src/client/Client.h
index e31e90a..8e1741d 100644
--- a/src/client/Client.h
+++ b/src/client/Client.h
@@ -119,6 +119,7 @@ struct CapSnap;
 
 struct MetaSession;
 struct MetaRequest;
+class ceph_lock_state_t;
 
 
 typedef void (*client_ino_callback_t)(void *handle, vinodeno_t ino, int64_t off, int64_t len);
@@ -127,6 +128,7 @@ typedef void (*client_dentry_callback_t)(void *handle, vinodeno_t dirino,
 					 vinodeno_t ino, string& name);
 
 typedef int (*client_getgroups_callback_t)(void *handle, uid_t uid, gid_t **sgids);
+typedef void(*client_switch_interrupt_callback_t)(void *req, void *data);
 
 // ========================================================
 // client interface
@@ -214,6 +216,8 @@ class Client : public Dispatcher {
 
   SafeTimer timer;
 
+  client_switch_interrupt_callback_t switch_interrupt_cb;
+
   client_ino_callback_t ino_invalidate_cb;
   void *ino_invalidate_cb_handle;
 
@@ -225,6 +229,7 @@ class Client : public Dispatcher {
 
   Finisher async_ino_invalidator;
   Finisher async_dentry_invalidator;
+  Finisher interrupt_finisher;
 
   Context *tick_event;
   utime_t last_cap_renew;
@@ -374,6 +379,7 @@ protected:
   friend class C_Client_CacheInvalidate;  // calls ino_invalidate_cb
   friend class C_Client_DentryInvalidate;  // calls dentry_invalidate_cb
   friend class C_Block_Sync; // Calls block map and protected helpers
+  friend class C_Client_RequestInterrupt;
 
   //int get_cache_size() { return lru.lru_get_size(); }
   //void set_cache_size(int m) { lru.lru_set_max(m); }
@@ -604,6 +610,9 @@ private:
   int _fsync(Fh *fh, bool syncdataonly);
   int _sync_fs();
   int _fallocate(Fh *fh, int mode, int64_t offset, int64_t length);
+  int _getlk(Fh *fh, struct flock *fl, uint64_t owner);
+  int _setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep, void *fuse_req=NULL);
+  int _flock(Fh *fh, int cmd, uint64_t owner, void *fuse_req=NULL);
 
   int get_or_create(Inode *dir, const char* name,
 		    Dentry **pdn, bool expect_null=false);
@@ -613,6 +622,12 @@ private:
   vinodeno_t _get_vino(Inode *in);
   inodeno_t _get_inodeno(Inode *in);
 
+  int _do_filelock(Inode *in, Fh *fh, int lock_type, int op, int sleep,
+		   struct flock *fl, uint64_t owner, void *fuse_req=NULL);
+  int _interrupt_filelock(MetaRequest *req);
+  void _encode_filelocks(Inode *in, bufferlist& bl);
+  void _release_filelocks(Fh *fh);
+  void _update_lock_state(struct flock *fl, uint64_t owner, ceph_lock_state_t *lock_state);
 public:
   int mount(const std::string &mount_root);
   void unmount();
@@ -818,6 +833,10 @@ public:
   int ll_fsync(Fh *fh, bool syncdataonly);
   int ll_fallocate(Fh *fh, int mode, loff_t offset, loff_t length);
   int ll_release(Fh *fh);
+  int ll_getlk(Fh *fh, struct flock *fl, uint64_t owner);
+  int ll_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep, void *fuse_req);
+  int ll_flock(Fh *fh, int cmd, uint64_t owner, void *fuse_req);
+  void ll_interrupt(void *d);
   int ll_get_stripe_osd(struct Inode *in, uint64_t blockno,
 			ceph_file_layout* layout);
   uint64_t ll_get_internal_offset(struct Inode *in, uint64_t blockno);
@@ -825,11 +844,11 @@ public:
   int ll_num_osds(void);
   int ll_osdaddr(int osd, uint32_t *addr);
   int ll_osdaddr(int osd, char* buf, size_t size);
-  void ll_register_ino_invalidate_cb(client_ino_callback_t cb, void *handle);
 
+  void ll_register_ino_invalidate_cb(client_ino_callback_t cb, void *handle);
   void ll_register_dentry_invalidate_cb(client_dentry_callback_t cb, void *handle);
-
   void ll_register_getgroups_cb(client_getgroups_callback_t cb, void *handle);
+  void ll_register_switch_interrupt_cb(client_switch_interrupt_callback_t cb);
 };
 
 #endif
diff --git a/src/client/Fh.h b/src/client/Fh.h
index 083ccd1..237a6d8 100644
--- a/src/client/Fh.h
+++ b/src/client/Fh.h
@@ -5,6 +5,7 @@
 
 class Inode;
 class Cond;
+class ceph_lock_state_t;
 
 // file handle for any open file state
 
@@ -23,8 +24,13 @@ struct Fh {
   loff_t consec_read_bytes;
   int nr_consec_read;
 
+  // file lock
+  ceph_lock_state_t *fcntl_locks;
+  ceph_lock_state_t *flock_locks;
+
   Fh() : inode(0), pos(0), mds(0), mode(0), flags(0), pos_locked(false),
-	 last_pos(0), consec_read_bytes(0), nr_consec_read(0) {}
+	 last_pos(0), consec_read_bytes(0), nr_consec_read(0),
+	 fcntl_locks(NULL), flock_locks(NULL)  {}
 };
 
 
diff --git a/src/client/Inode.h b/src/client/Inode.h
index 221a91a..91ba2fc 100644
--- a/src/client/Inode.h
+++ b/src/client/Inode.h
@@ -17,7 +17,8 @@ struct MetaSession;
 class Dentry;
 class Dir;
 struct SnapRealm;
-class Inode;
+struct Inode;
+class ceph_lock_state_t;
 
 struct Cap {
   MetaSession *session;
@@ -210,6 +211,10 @@ class Inode {
     ll_ref -= n;
   }
 
+  // file locks
+  ceph_lock_state_t *fcntl_locks;
+  ceph_lock_state_t *flock_locks;
+
   Inode(CephContext *cct_, vinodeno_t vino, ceph_file_layout *newlayout)
     : cct(cct_), ino(vino.ino), snapid(vino.snapid),
       rdev(0), mode(0), uid(0), gid(0), nlink(0),
@@ -224,8 +229,8 @@ class Inode {
       snaprealm(0), snaprealm_item(this), snapdir_parent(0),
       oset((void *)this, newlayout->fl_pg_pool, ino),
       reported_size(0), wanted_max_size(0), requested_max_size(0),
-      _ref(0), ll_ref(0), 
-      dir(0), dn_set()
+      _ref(0), ll_ref(0), dir(0), dn_set(),
+      fcntl_locks(NULL), flock_locks(NULL)
   {
     memset(&dir_layout, 0, sizeof(dir_layout));
     memset(&layout, 0, sizeof(layout));
diff --git a/src/client/MetaRequest.h b/src/client/MetaRequest.h
index 45a90dc..6f82b5c 100644
--- a/src/client/MetaRequest.h
+++ b/src/client/MetaRequest.h
@@ -9,6 +9,7 @@
 #include "msg/msg_types.h"
 #include "include/xlist.h"
 #include "include/filepath.h"
+#include "include/atomic.h"
 #include "mds/mdstypes.h"
 
 #include "common/Mutex.h"
@@ -47,7 +48,7 @@ public:
   __u32    sent_on_mseq;       // mseq at last submission of this request
   int      num_fwd;            // # of times i've been forwarded
   int      retry_attempt;
-  int      ref;
+  atomic_t ref;
   
   MClientReply *reply;         // the reply
   bool kick;
@@ -126,17 +127,14 @@ public:
   Dentry *old_dentry();
 
   MetaRequest* get() {
-    ++ref;
+    ref.inc();
     return this;
   }
 
   /// psuedo-private put method; use Client::put_request()
-  void _put() {
-    if (--ref == 0)
-      delete this;
-  }
-  int get_num_ref() {
-    return ref;
+  bool _put() {
+    int v = ref.dec();
+    return v == 0;
   }
 
   // normal fields
diff --git a/src/client/fuse_ll.cc b/src/client/fuse_ll.cc
index 7f419c3..4733912 100644
--- a/src/client/fuse_ll.cc
+++ b/src/client/fuse_ll.cc
@@ -636,6 +636,69 @@ static void fuse_ll_statfs(fuse_req_t req, fuse_ino_t ino)
   cfuse->iput(in); // iput required
 }
 
+static void fuse_ll_getlk(fuse_req_t req, fuse_ino_t ino,
+			  struct fuse_file_info *fi, struct flock *lock)
+{
+  CephFuse::Handle *cfuse = (CephFuse::Handle *)fuse_req_userdata(req);
+  Fh *fh = (Fh*)fi->fh;
+
+  int r = cfuse->client->ll_getlk(fh, lock, fi->lock_owner);
+  if (r == 0)
+    fuse_reply_lock(req, lock);
+  else
+    fuse_reply_err(req, -r);
+}
+
+static void fuse_ll_setlk(fuse_req_t req, fuse_ino_t ino,
+		          struct fuse_file_info *fi, struct flock *lock, int sleep)
+{
+  CephFuse::Handle *cfuse = (CephFuse::Handle *)fuse_req_userdata(req);
+  Fh *fh = (Fh*)fi->fh;
+
+  // must use multithread if operation may block
+  if (!cfuse->client->cct->_conf->fuse_multithreaded &&
+      sleep && lock->l_type != F_UNLCK) {
+    fuse_reply_err(req, EDEADLK);
+    return;
+  }
+
+  int r = cfuse->client->ll_setlk(fh, lock, fi->lock_owner, sleep, req);
+  fuse_reply_err(req, -r);
+}
+
+static void fuse_ll_interrupt(fuse_req_t req, void* data)
+{
+  CephFuse::Handle *cfuse = (CephFuse::Handle *)fuse_req_userdata(req);
+  cfuse->client->ll_interrupt(data);
+}
+
+static void switch_interrupt_cb(void *req, void* data)
+{
+  if (data)
+    fuse_req_interrupt_func((fuse_req_t)req, fuse_ll_interrupt, data);
+  else
+    fuse_req_interrupt_func((fuse_req_t)req, NULL, NULL);
+}
+
+#if FUSE_VERSION >= FUSE_MAKE_VERSION(2, 9)
+static void fuse_ll_flock(fuse_req_t req, fuse_ino_t ino,
+		          struct fuse_file_info *fi, int cmd)
+{
+  CephFuse::Handle *cfuse = (CephFuse::Handle *)fuse_req_userdata(req);
+  Fh *fh = (Fh*)fi->fh;
+
+  // must use multithread if operation may block
+  if (!cfuse->client->cct->_conf->fuse_multithreaded &&
+      !(cmd & (LOCK_NB | LOCK_UN))) {
+    fuse_reply_err(req, EDEADLK);
+    return;
+  }
+
+  int r = cfuse->client->ll_flock(fh, cmd, fi->lock_owner, req);
+  fuse_reply_err(req, -r);
+}
+#endif
+
 #if 0
 static int getgroups_cb(void *handle, uid_t uid, gid_t **sgids)
 {
@@ -742,8 +805,8 @@ const static struct fuse_lowlevel_ops fuse_ll_oper = {
  removexattr: fuse_ll_removexattr,
  access: fuse_ll_access,
  create: fuse_ll_create,
- getlk: 0,
- setlk: 0,
+ getlk: fuse_ll_getlk,
+ setlk: fuse_ll_setlk,
  bmap: 0,
 #if FUSE_VERSION >= FUSE_MAKE_VERSION(2, 8)
 #ifdef FUSE_IOCTL_COMPAT
@@ -752,13 +815,15 @@ const static struct fuse_lowlevel_ops fuse_ll_oper = {
  ioctl: 0,
 #endif
  poll: 0,
-#if FUSE_VERSION > FUSE_MAKE_VERSION(2, 9)
+#endif
+#if FUSE_VERSION >= FUSE_MAKE_VERSION(2, 9)
  write_buf: 0,
  retrieve_reply: 0,
  forget_multi: 0,
- flock: 0,
- fallocate: fuse_ll_fallocate
+ flock: fuse_ll_flock,
 #endif
+#if FUSE_VERSION > FUSE_MAKE_VERSION(2, 9)
+ fallocate: fuse_ll_fallocate
 #endif
 };
 
@@ -859,6 +924,8 @@ int CephFuse::Handle::init(int argc, const char *argv[])
 
   fuse_session_add_chan(se, ch);
 
+  client->ll_register_switch_interrupt_cb(switch_interrupt_cb);
+
   /*
    * this is broken:
    *
diff --git a/src/common/Makefile.am b/src/common/Makefile.am
index 69e5ad3..29a3135 100644
--- a/src/common/Makefile.am
+++ b/src/common/Makefile.am
@@ -83,7 +83,8 @@ libcommon_la_SOURCES += \
 	osd/HitSet.cc \
 	mds/MDSMap.cc \
 	mds/inode_backtrace.cc \
-	mds/mdstypes.cc 
+	mds/mdstypes.cc \
+	mds/flock.cc
 
 # inject crc in common
 libcommon_crc_la_SOURCES = \
diff --git a/src/common/Thread.cc b/src/common/Thread.cc
index 7be0013..a962e06 100644
--- a/src/common/Thread.cc
+++ b/src/common/Thread.cc
@@ -51,7 +51,8 @@ void *Thread::entry_wrapper()
   int p = ceph_gettid(); // may return -ENOSYS on other platforms
   if (p > 0)
     pid = p;
-  if (ioprio_class >= 0 &&
+  if (pid &&
+      ioprio_class >= 0 &&
       ioprio_priority >= 0) {
     ceph_ioprio_set(IOPRIO_WHO_PROCESS,
 		    pid,
diff --git a/src/common/Thread.h b/src/common/Thread.h
index 95f63b4..8173ca5 100644
--- a/src/common/Thread.h
+++ b/src/common/Thread.h
@@ -41,6 +41,7 @@ class Thread {
 
  public:
   const pthread_t &get_thread_id();
+  pid_t get_pid() const { return pid; }
   bool is_started();
   bool am_self();
   int kill(int signal);
diff --git a/src/common/TrackedOp.cc b/src/common/TrackedOp.cc
index ddb2f91..5a76f64 100644
--- a/src/common/TrackedOp.cc
+++ b/src/common/TrackedOp.cc
@@ -121,10 +121,10 @@ void OpTracker::unregister_inflight_op(TrackedOp *i)
   // caller checks;
   assert(tracking_enabled);
 
+  Mutex::Locker locker(ops_in_flight_lock);
   i->request->clear_data();
   i->request->clear_payload();
 
-  Mutex::Locker locker(ops_in_flight_lock);
   assert(i->xitem.get_list() == &ops_in_flight);
   utime_t now = ceph_clock_now(cct);
   i->xitem.remove_myself();
diff --git a/src/common/WorkQueue.cc b/src/common/WorkQueue.cc
index 42f402f..0f8bc9d 100644
--- a/src/common/WorkQueue.cc
+++ b/src/common/WorkQueue.cc
@@ -271,6 +271,10 @@ void ThreadPool::set_ioprio(int cls, int priority)
   for (set<WorkThread*>::iterator p = _threads.begin();
        p != _threads.end();
        ++p) {
+    ldout(cct,10) << __func__ 
+		  << " class " << cls << " priority " << priority
+		  << " pid " << (*p)->get_pid()
+		  << dendl;
     int r = (*p)->set_ioprio(cls, priority);
     if (r < 0)
       lderr(cct) << " set_ioprio got " << cpp_strerror(r) << dendl;
diff --git a/src/common/ceph_crypto.h b/src/common/ceph_crypto.h
index c553594..686efb4 100644
--- a/src/common/ceph_crypto.h
+++ b/src/common/ceph_crypto.h
@@ -78,9 +78,11 @@ namespace ceph {
 	assert(s == SECSuccess);
       }
       void Update (const byte *input, size_t length) {
-	SECStatus s;
-	s = PK11_DigestOp(ctx, input, length);
-	assert(s == SECSuccess);
+        if (length) {
+	  SECStatus s;
+	  s = PK11_DigestOp(ctx, input, length);
+	  assert(s == SECSuccess);
+        }
       }
       void Final (byte *digest) {
 	SECStatus s;
diff --git a/src/common/config.cc b/src/common/config.cc
index 23bfe35..fc47083 100644
--- a/src/common/config.cc
+++ b/src/common/config.cc
@@ -947,7 +947,7 @@ int md_config_t::set_val_raw(const char *val, const config_option *opt)
 }
 
 static const char *CONF_METAVARIABLES[] =
-  { "cluster", "type", "name", "host", "num", "id", "pid" };
+  { "cluster", "type", "name", "host", "num", "id", "pid", "cctid" };
 static const int NUM_CONF_METAVARIABLES =
       (sizeof(CONF_METAVARIABLES) / sizeof(CONF_METAVARIABLES[0]));
 
@@ -1059,6 +1059,8 @@ bool md_config_t::expand_meta(std::string &origval,
 	  out += name.get_id().c_str();
 	else if (var == "pid")
 	  out += stringify(getpid());
+	else if (var == "cctid")
+	  out += stringify((unsigned long long)this);
 	else
 	  assert(0); // unreachable
 	expanded = true;
diff --git a/src/common/config_opts.h b/src/common/config_opts.h
index fe00c76..0307441 100644
--- a/src/common/config_opts.h
+++ b/src/common/config_opts.h
@@ -92,6 +92,7 @@ SUBSYS(finisher, 1, 1)
 SUBSYS(heartbeatmap, 1, 5)
 SUBSYS(perfcounter, 1, 5)
 SUBSYS(rgw, 1, 5)                 // log level for the Rados gateway
+SUBSYS(civetweb, 1, 10)
 SUBSYS(javaclient, 1, 5)
 SUBSYS(asok, 1, 5)
 SUBSYS(throttle, 1, 1)
@@ -283,7 +284,7 @@ OPTION(fuse_default_permissions, OPT_BOOL, true)
 OPTION(fuse_big_writes, OPT_BOOL, true)
 OPTION(fuse_atomic_o_trunc, OPT_BOOL, true)
 OPTION(fuse_debug, OPT_BOOL, false)
-OPTION(fuse_multithreaded, OPT_BOOL, false)
+OPTION(fuse_multithreaded, OPT_BOOL, true)
 
 OPTION(crush_location, OPT_STR, "")       // whitespace-separated list of key=value pairs describing crush location
 
@@ -489,6 +490,9 @@ OPTION(osd_heartbeat_interval, OPT_INT, 6)       // (seconds) how often we ping
 OPTION(osd_heartbeat_grace, OPT_INT, 20)         // (seconds) how long before we decide a peer has failed
 OPTION(osd_heartbeat_min_peers, OPT_INT, 10)     // minimum number of peers
 
+// max number of parallel snap trims/pg
+OPTION(osd_pg_max_concurrent_snap_trims, OPT_U64, 2)
+
 // minimum number of peers tha tmust be reachable to mark ourselves
 // back up after being wrongly marked down.
 OPTION(osd_heartbeat_min_healthy_ratio, OPT_FLOAT, .33)
@@ -766,6 +770,7 @@ OPTION(rgw_cache_lru_size, OPT_INT, 10000)   // num of entries in rgw cache
 OPTION(rgw_socket_path, OPT_STR, "")   // path to unix domain socket, if not specified, rgw will not run as external fcgi
 OPTION(rgw_host, OPT_STR, "")  // host for radosgw, can be an IP, default is 0.0.0.0
 OPTION(rgw_port, OPT_STR, "")  // port to listen, format as "8080" "5000", if not specified, rgw will not run external fcgi
+OPTION(rgw_fcgi_explicit_free, OPT_BOOL, true) // whether to call FCGX_Free explicitly on every complete request
 OPTION(rgw_dns_name, OPT_STR, "")
 OPTION(rgw_script_uri, OPT_STR, "") // alternative value for SCRIPT_URI if not set in request
 OPTION(rgw_request_uri, OPT_STR,  "") // alternative value for REQUEST_URI if not set in request
diff --git a/src/common/crc32c_intel_fast_asm.S b/src/common/crc32c_intel_fast_asm.S
index 4ca5d65..2189684 100644
--- a/src/common/crc32c_intel_fast_asm.S
+++ b/src/common/crc32c_intel_fast_asm.S
@@ -662,3 +662,5 @@ global %1_slver
 %endmacro
 ;;;       func            core, ver, snum
 slversion crc32_iscsi_00, 00,   02,  0014
+; inform linker that this doesn't require executable stack
+section .note.GNU-stack noalloc noexec nowrite progbits
diff --git a/src/common/crc32c_intel_fast_zero_asm.S b/src/common/crc32c_intel_fast_zero_asm.S
index b7246f2..34b7f48 100644
--- a/src/common/crc32c_intel_fast_zero_asm.S
+++ b/src/common/crc32c_intel_fast_zero_asm.S
@@ -644,3 +644,5 @@ global %1_slver
 %endmacro
 ;;;       func            core, ver, snum
 slversion crc32_iscsi_zero_00, 00,   02,  0014
+; inform linker that this doesn't require executable stack
+section .note.GNU-stack noalloc noexec nowrite progbits
diff --git a/src/common/hobject.cc b/src/common/hobject.cc
index ecc8cfd..28cb86a 100644
--- a/src/common/hobject.cc
+++ b/src/common/hobject.cc
@@ -238,10 +238,10 @@ void ghobject_t::decode(bufferlist::iterator& bl)
 void ghobject_t::dump(Formatter *f) const
 {
   hobj.dump(f);
-  if (generation != NO_GEN) {
+  if (generation != NO_GEN)
     f->dump_int("generation", generation);
+  if (shard_id != ghobject_t::NO_SHARD)
     f->dump_int("shard_id", shard_id);
-  }
 }
 
 void ghobject_t::generate_test_instances(list<ghobject_t*>& o)
diff --git a/src/common/io_priority.cc b/src/common/io_priority.cc
index b9eeae8..be4dc2a 100644
--- a/src/common/io_priority.cc
+++ b/src/common/io_priority.cc
@@ -41,8 +41,8 @@ int ceph_ioprio_set(int whence, int who, int ioprio)
 
 int ceph_ioprio_string_to_class(const std::string& s)
 {
-  std::string l;
-  std::transform(s.begin(), s.end(), l.begin(), ::tolower);
+  std::string l = s;
+  std::transform(l.begin(), l.end(), l.begin(), ::tolower);
 
   if (l == "idle")
     return IOPRIO_CLASS_IDLE;
diff --git a/src/common/util.cc b/src/common/util.cc
index ab417be..212384b 100644
--- a/src/common/util.cc
+++ b/src/common/util.cc
@@ -18,6 +18,10 @@
 #include "common/errno.h"
 #include "common/strtol.h"
 
+#ifdef HAVE_SYS_VFS_H
+#include <sys/vfs.h>
+#endif
+
 // test if an entire buf is zero in 8-byte chunks
 bool buf_is_zero(const char *buf, size_t len)
 {
@@ -104,3 +108,21 @@ int64_t unit_to_bytesize(string val, ostream *pss)
   }
   return (r * (1LL << modifier));
 }
+
+int get_fs_stats(ceph_data_stats_t &stats, const char *path)
+{
+  if (!path)
+    return -EINVAL;
+
+  struct statfs stbuf;
+  int err = ::statfs(path, &stbuf);
+  if (err < 0) {
+    return -errno;
+  }
+
+  stats.byte_total = stbuf.f_blocks * stbuf.f_bsize;
+  stats.byte_used = (stbuf.f_blocks - stbuf.f_bfree) * stbuf.f_bsize;
+  stats.byte_avail = stbuf.f_bavail * stbuf.f_bsize;
+  stats.avail_percent = (((float)stats.byte_avail/stats.byte_total)*100);
+  return 0;
+}
diff --git a/src/crush/CrushCompiler.cc b/src/crush/CrushCompiler.cc
index b52a55a..33ed1db 100644
--- a/src/crush/CrushCompiler.cc
+++ b/src/crush/CrushCompiler.cc
@@ -191,6 +191,8 @@ int CrushCompiler::decompile(ostream &out)
     out << "tunable chooseleaf_descend_once " << crush.get_chooseleaf_descend_once() << "\n";
   if (crush.get_chooseleaf_vary_r() != 0)
     out << "tunable chooseleaf_vary_r " << crush.get_chooseleaf_vary_r() << "\n";
+  if (crush.get_straw_calc_version() != 0)
+    out << "tunable straw_calc_version " << crush.get_straw_calc_version() << "\n";
 
   out << "\n# devices\n";
   for (int i=0; i<crush.get_max_devices(); i++) {
@@ -368,6 +370,8 @@ int CrushCompiler::parse_tunable(iter_t const& i)
     crush.set_chooseleaf_descend_once(val);
   else if (name == "chooseleaf_vary_r")
     crush.set_chooseleaf_vary_r(val);
+  else if (name == "straw_calc_version")
+    crush.set_straw_calc_version(val);
   else {
     err << "tunable " << name << " not recognized" << std::endl;
     return -1;
diff --git a/src/crush/CrushTester.cc b/src/crush/CrushTester.cc
index 23f1a7d..3b45e53 100644
--- a/src/crush/CrushTester.cc
+++ b/src/crush/CrushTester.cc
@@ -487,18 +487,18 @@ int CrushTester::test()
           vector<int> out;
 
           if (use_crush) {
-            if (output_statistics)
-              err << "CRUSH"; // prepend CRUSH to placement output
+            if (output_mappings)
+	      err << "CRUSH"; // prepend CRUSH to placement output
             crush.do_rule(r, x, out, nr, weight);
           } else {
-            if (output_statistics)
-              err << "RNG"; // prepend RNG to placement output to denote simulation
+            if (output_mappings)
+	      err << "RNG"; // prepend RNG to placement output to denote simulation
             // test our new monte carlo placement generator
             random_placement(r, out, nr, weight);
           }
 
-          if (output_statistics)
-            err << " rule " << r << " x " << x << " " << out << std::endl;
+	  if (output_mappings)
+	    err << " rule " << r << " x " << x << " " << out << std::endl;
 
           if (output_data_file)
             write_integer_indexed_vector_data_string(tester_data.placement_information, x, out);
@@ -539,14 +539,14 @@ int CrushTester::test()
 
       if (output_statistics)
         for (unsigned i = 0; i < per.size(); i++) {
-          if (output_utilization && num_batches > 1){
+          if (output_utilization) {
             if (num_objects_expected[i] > 0 && per[i] > 0) {
               err << "  device " << i << ":\t"
                   << "\t" << " stored " << ": " << per[i]
                   << "\t" << " expected " << ": " << num_objects_expected[i]
                   << std::endl;
             }
-          } else if (output_utilization_all && num_batches > 1) {
+          } else if (output_utilization_all) {
             err << "  device " << i << ":\t"
                 << "\t" << " stored " << ": " << per[i]
                 << "\t" << " expected " << ": " << num_objects_expected[i]
diff --git a/src/crush/CrushTester.h b/src/crush/CrushTester.h
index df5a157..8de70f9 100644
--- a/src/crush/CrushTester.h
+++ b/src/crush/CrushTester.h
@@ -27,6 +27,7 @@ class CrushTester {
   bool output_utilization;
   bool output_utilization_all;
   bool output_statistics;
+  bool output_mappings;
   bool output_bad_mappings;
   bool output_choose_tries;
 
@@ -176,6 +177,7 @@ public:
       output_utilization(false),
       output_utilization_all(false),
       output_statistics(false),
+      output_mappings(false),
       output_bad_mappings(false),
       output_choose_tries(false),
       output_data_file(false),
@@ -226,6 +228,13 @@ public:
     return output_statistics;
   }
 
+  void set_output_mappings(bool b) {
+    output_mappings = b;
+  }
+  bool get_output_mappings() const {
+    return output_mappings;
+  }
+
   void set_output_bad_mappings(bool b) {
     output_bad_mappings = b;
   }
diff --git a/src/crush/CrushWrapper.cc b/src/crush/CrushWrapper.cc
index 31da4f5..805a0c6 100644
--- a/src/crush/CrushWrapper.cc
+++ b/src/crush/CrushWrapper.cc
@@ -150,10 +150,10 @@ int CrushWrapper::remove_item(CephContext *cct, int item, bool unlink_only)
     for (unsigned i=0; i<b->size; ++i) {
       int id = b->items[i];
       if (id == item) {
-	adjust_item_weight(cct, item, 0);
 	ldout(cct, 5) << "remove_item removing item " << item
 		      << " from bucket " << b->id << dendl;
-	crush_bucket_remove_item(b, item);
+	crush_bucket_remove_item(crush, b, item);
+	adjust_item_weight(cct, b->id, b->weight);
 	ret = 0;
       }
     }
@@ -171,8 +171,8 @@ bool CrushWrapper::_search_item_exists(int item) const
     if (!crush->buckets[i])
       continue;
     crush_bucket *b = crush->buckets[i];
-    for (unsigned i=0; i<b->size; ++i) {
-      if (b->items[i] == item)
+    for (unsigned j=0; j<b->size; ++j) {
+      if (b->items[j] == item)
 	return true;
     }
   }
@@ -197,9 +197,9 @@ int CrushWrapper::_remove_item_under(CephContext *cct, int item, int ancestor, b
   for (unsigned i=0; i<b->size; ++i) {
     int id = b->items[i];
     if (id == item) {
-      adjust_item_weight(cct, item, 0);
       ldout(cct, 5) << "_remove_item_under removing item " << item << " from bucket " << b->id << dendl;
-      crush_bucket_remove_item(b, item);
+      crush_bucket_remove_item(crush, b, item);
+      adjust_item_weight(cct, b->id, b->weight);
       ret = 0;
     } else if (id < 0) {
       int r = remove_item_under(cct, item, id, unlink_only);
@@ -459,6 +459,8 @@ int CrushWrapper::insert_item(CephContext *cct, int item, float weight, string n
 
   int cur = item;
 
+  // create locations if locations don't exist and add child in location with 0 weight
+  // the more detail in the insert_item method declaration in CrushWrapper.h
   for (map<int,string>::iterator p = type_map.begin(); p != type_map.end(); ++p) {
     // ignore device type
     if (p->first == 0)
@@ -518,17 +520,17 @@ int CrushWrapper::insert_item(CephContext *cct, int item, float weight, string n
 
     ldout(cct, 5) << "insert_item adding " << cur << " weight " << weight
 		  << " to bucket " << id << dendl;
-    int r = crush_bucket_add_item(b, cur, 0);
+    int r = crush_bucket_add_item(crush, b, cur, 0);
     assert (!r);
+    break;
+  }
 
-    // now that we've added the (0-weighted) item and any parent buckets, adjust the weight.
-    adjust_item_weightf(cct, item, weight);
-
+  // adjust the item's weight in location
+  if(adjust_item_weightf_in_loc(cct, item, weight, loc) > 0) {
     if (item >= crush->max_devices) {
       crush->max_devices = item + 1;
       ldout(cct, 5) << "insert_item max_devices now " << crush->max_devices << dendl;
     }
-
     return 0;
   }
 
@@ -585,7 +587,7 @@ int CrushWrapper::create_or_move_item(CephContext *cct, int item, float weight,
   if (check_item_loc(cct, item, loc, &old_iweight)) {
     ldout(cct, 5) << "create_or_move_item " << item << " already at " << loc << dendl;
   } else {
-    if (item_exists(item)) {
+    if (_search_item_exists(item)) {
       weight = get_item_weightf(item);
       ldout(cct, 10) << "create_or_move_item " << item << " exists with weight " << weight << dendl;
       remove_item(cct, item, true);
@@ -620,7 +622,7 @@ int CrushWrapper::update_item(CephContext *cct, int item, float weight, string n
     if (old_iweight != iweight) {
       ldout(cct, 5) << "update_item " << item << " adjusting weight "
 		    << ((float)old_iweight/(float)0x10000) << " -> " << weight << dendl;
-      adjust_item_weight(cct, item, iweight);
+      adjust_item_weight_in_loc(cct, item, iweight, loc);
       ret = 1;
     }
     if (get_item_name(item) != name) {
@@ -641,7 +643,7 @@ int CrushWrapper::update_item(CephContext *cct, int item, float weight, string n
   return ret;
 }
 
-int CrushWrapper::get_item_weight(int id)
+int CrushWrapper::get_item_weight(int id) const
 {
   for (int bidx = 0; bidx < crush->max_buckets; bidx++) {
     crush_bucket *b = crush->buckets[bidx];
@@ -654,6 +656,24 @@ int CrushWrapper::get_item_weight(int id)
   return -ENOENT;
 }
 
+int CrushWrapper::get_item_weight_in_loc(int id, const map<string,string> &loc)
+{
+  for (map<string,string>::const_iterator l = loc.begin(); l != loc.end(); l++) {
+    int bid = get_item_id(l->second);
+    if (!bucket_exists(bid))
+      continue;
+    crush_bucket *b = get_bucket(bid);
+    if ( b == NULL)
+      continue;
+    for (unsigned int i = 0; i < b->size; i++) {
+      if (b->items[i] == id) {
+	return crush_get_bucket_item_weight(b, i);
+      }
+    }
+  }
+  return -ENOENT;
+}
+
 int CrushWrapper::adjust_item_weight(CephContext *cct, int id, int weight)
 {
   ldout(cct, 5) << "adjust_item_weight " << id << " weight " << weight << dendl;
@@ -664,7 +684,7 @@ int CrushWrapper::adjust_item_weight(CephContext *cct, int id, int weight)
       continue;
     for (unsigned i = 0; i < b->size; i++) {
       if (b->items[i] == id) {
-	int diff = crush_bucket_adjust_item_weight(b, id, weight);
+	int diff = crush_bucket_adjust_item_weight(crush, b, id, weight);
 	ldout(cct, 5) << "adjust_item_weight " << id << " diff " << diff << " in bucket " << bidx << dendl;
 	adjust_item_weight(cct, -1 - bidx, b->weight);
 	changed++;
@@ -676,7 +696,33 @@ int CrushWrapper::adjust_item_weight(CephContext *cct, int id, int weight)
   return changed;
 }
 
-bool CrushWrapper::check_item_present(int id)
+int CrushWrapper::adjust_item_weight_in_loc(CephContext *cct, int id, int weight, const map<string,string>& loc)
+{
+  ldout(cct, 5) << "adjust_item_weight_in_loc " << id << " weight " << weight << " in " << loc << dendl;
+  int changed = 0;
+
+  for (map<string,string>::const_iterator l = loc.begin(); l != loc.end(); l++) {
+    int bid = get_item_id(l->second);
+    if (!bucket_exists(bid))
+      continue;
+    crush_bucket *b = get_bucket(bid);
+    if ( b == NULL)
+      continue;
+    for (unsigned int i = 0; i < b->size; i++) {
+      if (b->items[i] == id) {
+	int diff = crush_bucket_adjust_item_weight(crush, b, id, weight);
+	ldout(cct, 5) << "adjust_item_weight_in_loc " << id << " diff " << diff << " in bucket " << bid << dendl;
+	adjust_item_weight(cct, bid, b->weight);
+	changed++;
+      }
+    }
+  }
+  if (!changed)
+    return -ENOENT;
+  return changed;
+}
+
+bool CrushWrapper::check_item_present(int id) const
 {
   bool found = false;
 
@@ -778,20 +824,18 @@ int CrushWrapper::add_simple_ruleset(string name, string root_name,
     return -EINVAL;
   }
 
-  int ruleset = 0;
-  for (int i = 0; i < get_max_rules(); i++) {
-    if (rule_exists(i) &&
-	get_rule_mask_ruleset(i) >= ruleset) {
-      ruleset = get_rule_mask_ruleset(i) + 1;
-    }
+  int rno = -1;
+  for (rno = 0; rno < get_max_rules(); rno++) {
+    if (!rule_exists(rno) && !ruleset_exists(rno))
+       break;
   }
-
   int steps = 3;
   if (mode == "indep")
     steps = 4;
   int min_rep = mode == "firstn" ? 1 : 3;
   int max_rep = mode == "firstn" ? 10 : 20;
-  crush_rule *rule = crush_make_rule(steps, ruleset, rule_type, min_rep, max_rep);
+  //set the ruleset the same as rule_id(rno)
+  crush_rule *rule = crush_make_rule(steps, rno, rule_type, min_rep, max_rep);
   assert(rule);
   int step = 0;
   if (mode == "indep")
@@ -810,7 +854,12 @@ int CrushWrapper::add_simple_ruleset(string name, string root_name,
 			CRUSH_CHOOSE_N,
 			0);
   crush_rule_set_step(rule, step++, CRUSH_RULE_EMIT, 0, 0);
-  int rno = crush_add_rule(crush, rule, -1);
+
+  int ret = crush_add_rule(crush, rule, rno);
+  if(ret < 0) {
+    *err << "failed to add rule " << rno << " because " << cpp_strerror(ret);
+    return ret;
+  }
   set_rule_name(rno, name);
   have_rmaps = false;
   return rno;
@@ -965,6 +1014,7 @@ void CrushWrapper::encode(bufferlist& bl, bool lean) const
   ::encode(crush->choose_total_tries, bl);
   ::encode(crush->chooseleaf_descend_once, bl);
   ::encode(crush->chooseleaf_vary_r, bl);
+  ::encode(crush->straw_calc_version, bl);
 }
 
 static void decode_32_or_64_string_map(map<int32_t,string>& m, bufferlist::iterator& blp)
@@ -1048,6 +1098,9 @@ void CrushWrapper::decode(bufferlist::iterator& blp)
     if (!blp.end()) {
       ::decode(crush->chooseleaf_vary_r, blp);
     }
+    if (!blp.end()) {
+      ::decode(crush->straw_calc_version, blp);
+    }
     finalize();
   }
   catch (...) {
@@ -1231,6 +1284,8 @@ void CrushWrapper::dump_tunables(Formatter *f) const
   f->dump_int("choose_local_fallback_tries", get_choose_local_fallback_tries());
   f->dump_int("choose_total_tries", get_choose_total_tries());
   f->dump_int("chooseleaf_descend_once", get_chooseleaf_descend_once());
+  f->dump_int("chooseleaf_vary_r", get_chooseleaf_vary_r());
+  f->dump_int("straw_calc_version", get_straw_calc_version());
 
   // be helpful about it
   if (has_firefly_tunables())
@@ -1246,6 +1301,9 @@ void CrushWrapper::dump_tunables(Formatter *f) const
 
   f->dump_int("require_feature_tunables", (int)has_nondefault_tunables());
   f->dump_int("require_feature_tunables2", (int)has_nondefault_tunables2());
+  f->dump_int("require_feature_tunables3", (int)has_nondefault_tunables3());
+  f->dump_int("has_v2_rules", (int)has_v2_rules());
+  f->dump_int("has_v3_rules", (int)has_v3_rules());
 }
 
 void CrushWrapper::dump_rules(Formatter *f) const
diff --git a/src/crush/CrushWrapper.h b/src/crush/CrushWrapper.h
index d5d4f4f..9fac2fe 100644
--- a/src/crush/CrushWrapper.h
+++ b/src/crush/CrushWrapper.h
@@ -52,24 +52,23 @@ using namespace std;
 class CrushWrapper {
   mutable Mutex mapper_lock;
 public:
-  struct crush_map *crush;
   std::map<int32_t, string> type_map; /* bucket/device type names */
   std::map<int32_t, string> name_map; /* bucket/device names */
   std::map<int32_t, string> rule_name_map;
 
-  /* reverse maps */
-  bool have_rmaps;
-  std::map<string, int> type_rmap, name_rmap, rule_name_rmap;
-
 private:
-  void build_rmaps() {
+  struct crush_map *crush;
+  /* reverse maps */
+  mutable bool have_rmaps;
+  mutable std::map<string, int> type_rmap, name_rmap, rule_name_rmap;
+  void build_rmaps() const {
     if (have_rmaps) return;
     build_rmap(type_map, type_rmap);
     build_rmap(name_map, name_rmap);
     build_rmap(rule_name_map, rule_name_rmap);
     have_rmaps = true;
   }
-  void build_rmap(const map<int, string> &f, std::map<string, int> &r) {
+  void build_rmap(const map<int, string> &f, std::map<string, int> &r) const {
     r.clear();
     for (std::map<int, string>::const_iterator p = f.begin(); p != f.end(); ++p)
       r[p->second] = p->first;
@@ -88,6 +87,8 @@ public:
       crush_destroy(crush);
   }
 
+  crush_map *get_crush_map() { return crush; }
+
   /* building */
   void create() {
     if (crush)
@@ -124,12 +125,15 @@ public:
 
   void set_tunables_legacy() {
     set_tunables_argonaut();
+    crush->straw_calc_version = 0;
   }
   void set_tunables_optimal() {
     set_tunables_firefly();
+    crush->straw_calc_version = 1;
   }
   void set_tunables_default() {
     set_tunables_bobtail();
+    crush->straw_calc_version = 1;
   }
 
   int get_choose_local_tries() const {
@@ -167,13 +171,21 @@ public:
     crush->chooseleaf_vary_r = n;
   }
 
+  int get_straw_calc_version() const {
+    return crush->straw_calc_version;
+  }
+  void set_straw_calc_version(int n) {
+    crush->straw_calc_version = n;
+  }
+
   bool has_argonaut_tunables() const {
     return
       crush->choose_local_tries == 2 &&
       crush->choose_local_fallback_tries == 5 &&
       crush->choose_total_tries == 19 &&
       crush->chooseleaf_descend_once == 0 &&
-      crush->chooseleaf_vary_r == 0;
+      crush->chooseleaf_vary_r == 0 &&
+      crush->straw_calc_version == 0;
   }
   bool has_bobtail_tunables() const {
     return
@@ -181,7 +193,8 @@ public:
       crush->choose_local_fallback_tries == 0 &&
       crush->choose_total_tries == 50 &&
       crush->chooseleaf_descend_once == 1 &&
-      crush->chooseleaf_vary_r == 0;
+      crush->chooseleaf_vary_r == 0 &&
+      crush->straw_calc_version == 0;
   }
   bool has_firefly_tunables() const {
     return
@@ -189,7 +202,8 @@ public:
       crush->choose_local_fallback_tries == 0 &&
       crush->choose_total_tries == 50 &&
       crush->chooseleaf_descend_once == 1 &&
-      crush->chooseleaf_vary_r == 1;
+      crush->chooseleaf_vary_r == 1 &&
+      crush->straw_calc_version == 0;
   }
 
   bool has_optimal_tunables() const {
@@ -223,7 +237,7 @@ public:
   int get_num_type_names() const {
     return type_map.size();
   }
-  int get_type_id(const string& name) {
+  int get_type_id(const string& name) const {
     build_rmaps();
     if (type_rmap.count(name))
       return type_rmap[name];
@@ -242,14 +256,14 @@ public:
   }
 
   // item/bucket names
-  bool name_exists(const string& name) {
+  bool name_exists(const string& name) const {
     build_rmaps();
     return name_rmap.count(name);
   }
   bool item_exists(int i) {
     return name_map.count(i);
   }
-  int get_item_id(const string& name) {
+  int get_item_id(const string& name) const {
     build_rmaps();
     if (name_rmap.count(name))
       return name_rmap[name];
@@ -271,11 +285,11 @@ public:
   }
 
   // rule names
-  bool rule_exists(string name) {
+  bool rule_exists(string name) const {
     build_rmaps();
     return rule_name_rmap.count(name);
   }
-  int get_rule_id(string name) {
+  int get_rule_id(string name) const {
     build_rmaps();
     if (rule_name_rmap.count(name))
       return rule_name_rmap[name];
@@ -542,19 +556,27 @@ public:
    * @param id item id to check
    * @return weight of item
    */
-  int get_item_weight(int id);
-  float get_item_weightf(int id) {
+  int get_item_weight(int id) const;
+  float get_item_weightf(int id) const {
     return (float)get_item_weight(id) / (float)0x10000;
   }
+  int get_item_weight_in_loc(int id, const map<string,string> &loc);
+  float get_item_weightf_in_loc(int id, const map<string,string> &loc) {
+    return (float)get_item_weight_in_loc(id, loc) / (float)0x10000;
+  }
 
   int adjust_item_weight(CephContext *cct, int id, int weight);
   int adjust_item_weightf(CephContext *cct, int id, float weight) {
     return adjust_item_weight(cct, id, (int)(weight * (float)0x10000));
   }
+  int adjust_item_weight_in_loc(CephContext *cct, int id, int weight, const map<string,string>& loc);
+  int adjust_item_weightf_in_loc(CephContext *cct, int id, float weight, const map<string,string>& loc) {
+    return adjust_item_weight_in_loc(cct, id, (int)(weight * (float)0x10000), loc);
+  }
   void reweight(CephContext *cct);
 
   /// check if item id is present in the map hierarchy
-  bool check_item_present(int id);
+  bool check_item_present(int id) const;
 
 
   /*** devices ***/
@@ -745,9 +767,6 @@ private:
     crush_bucket *b = get_bucket(item);
     unsigned bucket_weight = b->weight;
 
-    // zero out the bucket weight
-    adjust_item_weight(cct, item, 0);
-
     // get where the bucket is located
     pair<string, string> bucket_location = get_immediate_parent(item);
 
@@ -758,8 +777,12 @@ private:
     crush_bucket *parent_bucket = get_bucket(parent_id);
 
     if (!IS_ERR(parent_bucket)) {
+      // zero out the bucket weight
+      crush_bucket_adjust_item_weight(crush, parent_bucket, item, 0);
+      adjust_item_weight(cct, parent_bucket->id, parent_bucket->weight);
+
       // remove the bucket from the parent
-      crush_bucket_remove_item(parent_bucket, item);
+      crush_bucket_remove_item(crush, parent_bucket, item);
     } else if (PTR_ERR(parent_bucket) != -ENOENT) {
       return PTR_ERR(parent_bucket);
     }
@@ -839,7 +862,7 @@ public:
 		 int *items, int *weights, int *idout) {
     if (type == 0)
       return -EINVAL;
-    crush_bucket *b = crush_make_bucket(alg, hash, type, size, items, weights);
+    crush_bucket *b = crush_make_bucket(crush, alg, hash, type, size, items, weights);
     assert(b);
     return crush_add_bucket(crush, bucketno, b, idout);
   }
@@ -880,9 +903,9 @@ public:
 
   bool ruleset_exists(int ruleset) const {
     for (size_t i = 0; i < crush->max_rules; ++i) {
-     if (crush->rules[i]->mask.ruleset == ruleset) {
-       return true;
-     }
+      if (rule_exists(i) && crush->rules[i]->mask.ruleset == ruleset) {
+	return true;
+      }
     }
 
     return false;
diff --git a/src/crush/builder.c b/src/crush/builder.c
index eff0bf6..f081562 100644
--- a/src/crush/builder.c
+++ b/src/crush/builder.c
@@ -11,6 +11,8 @@
 #include "builder.h"
 #include "hash.h"
 
+#define dprintk(args...) /* printf(args) */
+
 #define BUG_ON(x) assert(!(x))
 
 struct crush_map *crush_create()
@@ -27,6 +29,7 @@ struct crush_map *crush_create()
 	m->choose_total_tries = 19;
 	m->chooseleaf_descend_once = 0;
 	m->chooseleaf_vary_r = 0;
+	m->straw_calc_version = 0;
 	return m;
 }
 
@@ -63,7 +66,7 @@ int crush_add_rule(struct crush_map *map, struct crush_rule *rule, int ruleno)
 		for (r=0; r < map->max_rules; r++)
 			if (map->rules[r] == 0)
 				break;
-		assert(r <= INT_MAX);
+		assert(r < CRUSH_MAX_RULES);
 	}
 	else
 		r = ruleno;
@@ -72,6 +75,8 @@ int crush_add_rule(struct crush_map *map, struct crush_rule *rule, int ruleno)
 		/* expand array */
 		int oldsize;
 		void *_realloc = NULL;
+		if (map->max_rules +1 > CRUSH_MAX_RULES)
+			return -ENOSPC;
 		oldsize = map->max_rules;
 		map->max_rules = r+1;
 		if ((_realloc = realloc(map->rules, map->max_rules * sizeof(map->rules[0]))) == NULL) {
@@ -263,7 +268,7 @@ crush_make_list_bucket(int hash, int type, int size,
 
 		w += weights[i];
 		bucket->sum_weights[i] = w;
-		/*printf("pos %d item %d weight %d sum %d\n",
+		/*dprintk("pos %d item %d weight %d sum %d\n",
 		  i, items[i], weights[i], bucket->sum_weights[i]);*/
 	}
 
@@ -304,6 +309,10 @@ static int parent(int n)
 
 static int calc_depth(int size)
 {
+	if (size == 0) {
+		return 0;
+	}
+
 	int depth = 1;
 	int t = size - 1;
 	while (t) {
@@ -332,6 +341,16 @@ crush_make_tree_bucket(int hash, int type, int size,
 	bucket->h.type = type;
 	bucket->h.size = size;
 
+	if (size == 0) {
+		bucket->h.items = NULL;
+		bucket->h.perm = NULL;
+		bucket->h.weight = 0;
+		bucket->node_weights = NULL;
+		bucket->num_nodes = 0;
+		/* printf("size 0 depth 0 nodes 0\n"); */
+		return bucket;
+	}
+
 	bucket->h.items = malloc(sizeof(__s32)*size);
         if (!bucket->h.items)
                 goto err;
@@ -342,7 +361,7 @@ crush_make_tree_bucket(int hash, int type, int size,
 	/* calc tree depth */
 	depth = calc_depth(size);
 	bucket->num_nodes = 1 << depth;
-	printf("size %d depth %d nodes %d\n", size, depth, bucket->num_nodes);
+	dprintk("size %d depth %d nodes %d\n", size, depth, bucket->num_nodes);
 
         bucket->node_weights = malloc(sizeof(__u32)*bucket->num_nodes);
         if (!bucket->node_weights)
@@ -354,7 +373,7 @@ crush_make_tree_bucket(int hash, int type, int size,
 	for (i=0; i<size; i++) {
 		bucket->h.items[i] = items[i];
 		node = crush_calc_tree_node(i);
-		printf("item %d node %d weight %d\n", i, node, weights[i]);
+		dprintk("item %d node %d weight %d\n", i, node, weights[i]);
 		bucket->node_weights[node] = weights[i];
 
 		if (crush_addition_is_unsafe(bucket->h.weight, weights[i]))
@@ -368,7 +387,7 @@ crush_make_tree_bucket(int hash, int type, int size,
                                 goto err;
 
 			bucket->node_weights[node] += weights[i];
-			printf(" node %d weight %d\n", node, bucket->node_weights[node]);
+			dprintk(" node %d weight %d\n", node, bucket->node_weights[node]);
 		}
 	}
 	BUG_ON(bucket->node_weights[bucket->num_nodes/2] != bucket->h.weight);
@@ -386,7 +405,34 @@ err:
 
 /* straw bucket */
 
-int crush_calc_straw(struct crush_bucket_straw *bucket)
+/*
+ * this code was written 8 years ago.  i have a vague recollection of
+ * drawing boxes underneath bars of different lengths, where the bar
+ * length represented the probability/weight, and that there was some
+ * trial and error involved in arriving at this implementation.
+ * however, reading the code now after all this time, the intuition
+ * that motivated it is lost on me.  lame.  my only excuse is that I now
+ * know that the approach is fundamentally flawed and am not
+ * particularly motivated to reconstruct the flawed reasoning.
+ *
+ * as best as i can remember, the idea is: sort the weights, and start
+ * with the smallest.  arbitrarily scale it at 1.0 (16-bit fixed
+ * point).  look at the next larger weight, and calculate the scaling
+ * factor for that straw based on the relative difference in weight so
+ * far.  what's not clear to me now is why we are looking at wnext
+ * (the delta to the next bigger weight) for all remaining weights,
+ * and slicing things horizontally instead of considering just the
+ * next item or set of items.  or why pow() is used the way it is.
+ *
+ * note that the original version 1 of this function made special
+ * accommodation for the case where straw lengths were identical.  this
+ * is also flawed in a non-obvious way; version 2 drops the special
+ * handling and appears to work just as well.
+ *
+ * moral of the story: if you do something clever, write down why it
+ * works.
+ */
+int crush_calc_straw(struct crush_map *map, struct crush_bucket_straw *bucket)
 {
 	int *reverse;
 	int i, j, k;
@@ -422,41 +468,82 @@ int crush_calc_straw(struct crush_bucket_straw *bucket)
 
 	i=0;
 	while (i < size) {
-		/* zero weight items get 0 length straws! */
-		if (weights[reverse[i]] == 0) {
-			bucket->straws[reverse[i]] = 0;
+		if (map->straw_calc_version == 0) {
+			/* zero weight items get 0 length straws! */
+			if (weights[reverse[i]] == 0) {
+				bucket->straws[reverse[i]] = 0;
+				i++;
+				continue;
+			}
+
+			/* set this item's straw */
+			bucket->straws[reverse[i]] = straw * 0x10000;
+			dprintk("item %d at %d weight %d straw %d (%lf)\n",
+				bucket->h.items[reverse[i]],
+				reverse[i], weights[reverse[i]],
+				bucket->straws[reverse[i]], straw);
 			i++;
-			continue;
-		}
+			if (i == size)
+				break;
 
-		/* set this item's straw */
-		bucket->straws[reverse[i]] = straw * 0x10000;
-		/*printf("item %d at %d weight %d straw %d (%lf)\n",
-		       items[reverse[i]],
-		       reverse[i], weights[reverse[i]], bucket->straws[reverse[i]], straw);*/
-		i++;
-		if (i == size) break;
-
-		/* same weight as previous? */
-		if (weights[reverse[i]] == weights[reverse[i-1]]) {
-			/*printf("same as previous\n");*/
-			continue;
-		}
+			/* same weight as previous? */
+			if (weights[reverse[i]] == weights[reverse[i-1]]) {
+				dprintk("same as previous\n");
+				continue;
+			}
 
-		/* adjust straw for next guy */
-		wbelow += ((double)weights[reverse[i-1]] - lastw) * numleft;
-		for (j=i; j<size; j++)
-			if (weights[reverse[j]] == weights[reverse[i]])
+			/* adjust straw for next guy */
+			wbelow += ((double)weights[reverse[i-1]] - lastw) *
+				numleft;
+			for (j=i; j<size; j++)
+				if (weights[reverse[j]] == weights[reverse[i]])
+					numleft--;
+				else
+					break;
+			wnext = numleft * (weights[reverse[i]] -
+					   weights[reverse[i-1]]);
+			pbelow = wbelow / (wbelow + wnext);
+			dprintk("wbelow %lf  wnext %lf  pbelow %lf  numleft %d\n",
+				wbelow, wnext, pbelow, numleft);
+
+			straw *= pow((double)1.0 / pbelow, (double)1.0 /
+				     (double)numleft);
+
+			lastw = weights[reverse[i-1]];
+		} else if (map->straw_calc_version >= 1) {
+			/* zero weight items get 0 length straws! */
+			if (weights[reverse[i]] == 0) {
+				bucket->straws[reverse[i]] = 0;
+				i++;
 				numleft--;
-			else
+				continue;
+			}
+
+			/* set this item's straw */
+			bucket->straws[reverse[i]] = straw * 0x10000;
+			dprintk("item %d at %d weight %d straw %d (%lf)\n",
+				bucket->h.items[reverse[i]],
+				reverse[i], weights[reverse[i]],
+				bucket->straws[reverse[i]], straw);
+			i++;
+			if (i == size)
 				break;
-		wnext = numleft * (weights[reverse[i]] - weights[reverse[i-1]]);
-		pbelow = wbelow / (wbelow + wnext);
-		/*printf("wbelow %lf  wnext %lf  pbelow %lf\n", wbelow, wnext, pbelow);*/
 
-		straw *= pow((double)1.0 / pbelow, (double)1.0 / (double)numleft);
+			/* adjust straw for next guy */
+			wbelow += ((double)weights[reverse[i-1]] - lastw) *
+				numleft;
+			numleft--;
+			wnext = numleft * (weights[reverse[i]] -
+					   weights[reverse[i-1]]);
+			pbelow = wbelow / (wbelow + wnext);
+			dprintk("wbelow %lf  wnext %lf  pbelow %lf  numleft %d\n",
+				wbelow, wnext, pbelow, numleft);
+
+			straw *= pow((double)1.0 / pbelow, (double)1.0 /
+				     (double)numleft);
 
-		lastw = weights[reverse[i-1]];
+			lastw = weights[reverse[i-1]];
+		}
 	}
 
 	free(reverse);
@@ -464,7 +551,8 @@ int crush_calc_straw(struct crush_bucket_straw *bucket)
 }
 
 struct crush_bucket_straw *
-crush_make_straw_bucket(int hash, 
+crush_make_straw_bucket(struct crush_map *map,
+			int hash,
 			int type,
 			int size,
 			int *items,
@@ -502,7 +590,7 @@ crush_make_straw_bucket(int hash,
 		bucket->item_weights[i] = weights[i];
 	}
 
-        if (crush_calc_straw(bucket) < 0)
+        if (crush_calc_straw(map, bucket) < 0)
                 goto err;
 
 	return bucket;
@@ -518,7 +606,8 @@ err:
 
 
 struct crush_bucket*
-crush_make_bucket(int alg, int hash, int type, int size,
+crush_make_bucket(struct crush_map *map,
+		  int alg, int hash, int type, int size,
 		  int *items,
 		  int *weights)
 {
@@ -539,7 +628,7 @@ crush_make_bucket(int alg, int hash, int type, int size,
 		return (struct crush_bucket *)crush_make_tree_bucket(hash, type, size, items, weights);
 
 	case CRUSH_BUCKET_STRAW:
-		return (struct crush_bucket *)crush_make_straw_bucket(hash, type, size, items, weights);
+		return (struct crush_bucket *)crush_make_straw_bucket(map, hash, type, size, items, weights);
 	}
 	return 0;
 }
@@ -648,27 +737,39 @@ int crush_add_tree_bucket_item(struct crush_bucket_tree *bucket, int item, int w
 	node = crush_calc_tree_node(newsize-1);
 	bucket->node_weights[node] = weight;
 
+	/* if the depth increases, we need to initialize the new root node's weight before adding the bucket item */
+	int root = bucket->num_nodes/2;
+	if (depth >= 2 && (node - 1) == root) {
+		/* the new item is the first node in the right subtree, so
+		* the root node's initial weight is the left subtree's weight
+		*/
+		bucket->node_weights[root] = bucket->node_weights[root/2];
+	}
+
 	for (j=1; j<depth; j++) {
 		node = parent(node);
 
-                if (!crush_addition_is_unsafe(bucket->node_weights[node], weight))
+                if (crush_addition_is_unsafe(bucket->node_weights[node], weight))
                         return -ERANGE;
 
 		bucket->node_weights[node] += weight;
-                printf(" node %d weight %d\n", node, bucket->node_weights[node]);
+                dprintk(" node %d weight %d\n", node, bucket->node_weights[node]);
 	}
 
 
 	if (crush_addition_is_unsafe(bucket->h.weight, weight))
                 return -ERANGE;
 	
+	bucket->h.items[newsize-1] = item;
         bucket->h.weight += weight;
         bucket->h.size++;
 
 	return 0;
 }
 
-int crush_add_straw_bucket_item(struct crush_bucket_straw *bucket, int item, int weight)
+int crush_add_straw_bucket_item(struct crush_map *map,
+				struct crush_bucket_straw *bucket,
+				int item, int weight)
 {
 	int newsize = bucket->h.size + 1;
 	
@@ -701,13 +802,14 @@ int crush_add_straw_bucket_item(struct crush_bucket_straw *bucket, int item, int
 	if (crush_addition_is_unsafe(bucket->h.weight, weight))
                 return -ERANGE;
 
-	 bucket->h.weight += weight;
-	 bucket->h.size++;
+	bucket->h.weight += weight;
+	bucket->h.size++;
 	
-	return crush_calc_straw(bucket);
+	return crush_calc_straw(map, bucket);
 }
 
-int crush_bucket_add_item(struct crush_bucket *b, int item, int weight)
+int crush_bucket_add_item(struct crush_map *map,
+			  struct crush_bucket *b, int item, int weight)
 {
 	/* invalidate perm cache */
 	b->perm_n = 0;
@@ -720,7 +822,7 @@ int crush_bucket_add_item(struct crush_bucket *b, int item, int weight)
 	case CRUSH_BUCKET_TREE:
 		return crush_add_tree_bucket_item((struct crush_bucket_tree *)b, item, weight);
 	case CRUSH_BUCKET_STRAW:
-		return crush_add_straw_bucket_item((struct crush_bucket_straw *)b, item, weight);
+		return crush_add_straw_bucket_item(map, (struct crush_bucket_straw *)b, item, weight);
 	default:
 		return -1;
 	}
@@ -744,7 +846,10 @@ int crush_remove_uniform_bucket_item(struct crush_bucket_uniform *bucket, int it
 	for (j = i; j < bucket->h.size; j++)
 		bucket->h.items[j] = bucket->h.items[j+1];
 	newsize = --bucket->h.size;
-	bucket->h.weight -= bucket->item_weight;
+	if (bucket->item_weight < bucket->h.weight)
+		bucket->h.weight -= bucket->item_weight;
+	else
+		bucket->h.weight = 0;
 
 	if ((_realloc = realloc(bucket->h.items, sizeof(__s32)*newsize)) == NULL) {
 		return -ENOMEM;
@@ -763,7 +868,7 @@ int crush_remove_list_bucket_item(struct crush_bucket_list *bucket, int item)
 {
 	unsigned i, j;
 	int newsize;
-	int weight;
+	unsigned weight;
 
 	for (i = 0; i < bucket->h.size; i++)
 		if (bucket->h.items[i] == item)
@@ -777,7 +882,10 @@ int crush_remove_list_bucket_item(struct crush_bucket_list *bucket, int item)
 		bucket->item_weights[j] = bucket->item_weights[j+1];
 		bucket->sum_weights[j] = bucket->sum_weights[j+1] - weight;
 	}
-	bucket->h.weight -= weight;
+	if (weight < bucket->h.weight)
+		bucket->h.weight -= weight;
+	else
+		bucket->h.weight = 0;
 	newsize = --bucket->h.size;
 	
 	void *_realloc = NULL;
@@ -812,7 +920,7 @@ int crush_remove_tree_bucket_item(struct crush_bucket_tree *bucket, int item)
 
 	for (i = 0; i < bucket->h.size; i++) {
 		int node;
-		int weight;
+		unsigned weight;
 		int j;
 		int depth = calc_depth(bucket->h.size);
 
@@ -826,9 +934,12 @@ int crush_remove_tree_bucket_item(struct crush_bucket_tree *bucket, int item)
 		for (j = 1; j < depth; j++) {
 			node = parent(node);
 			bucket->node_weights[node] -= weight;
-			printf(" node %d weight %d\n", node, bucket->node_weights[node]);
+			dprintk(" node %d weight %d\n", node, bucket->node_weights[node]);
 		}
-		bucket->h.weight -= weight;
+		if (weight < bucket->h.weight)
+			bucket->h.weight -= weight;
+		else
+			bucket->h.weight = 0;
 		break;
 	}
 	if (i == bucket->h.size)
@@ -875,7 +986,8 @@ int crush_remove_tree_bucket_item(struct crush_bucket_tree *bucket, int item)
 	return 0;
 }
 
-int crush_remove_straw_bucket_item(struct crush_bucket_straw *bucket, int item)
+int crush_remove_straw_bucket_item(struct crush_map *map,
+				   struct crush_bucket_straw *bucket, int item)
 {
 	int newsize = bucket->h.size - 1;
 	unsigned i, j;
@@ -883,7 +995,10 @@ int crush_remove_straw_bucket_item(struct crush_bucket_straw *bucket, int item)
 	for (i = 0; i < bucket->h.size; i++) {
 		if (bucket->h.items[i] == item) {
 			bucket->h.size--;
-			bucket->h.weight -= bucket->item_weights[i];
+			if (bucket->item_weights[i] < bucket->h.weight)
+				bucket->h.weight -= bucket->item_weights[i];
+			else
+				bucket->h.weight = 0;
 			for (j = i; j < bucket->h.size; j++) {
 				bucket->h.items[j] = bucket->h.items[j+1];
 				bucket->item_weights[j] = bucket->item_weights[j+1];
@@ -917,10 +1032,10 @@ int crush_remove_straw_bucket_item(struct crush_bucket_straw *bucket, int item)
 		bucket->straws = _realloc;
 	}
 
-	return crush_calc_straw(bucket);
+	return crush_calc_straw(map, bucket);
 }
 
-int crush_bucket_remove_item(struct crush_bucket *b, int item)
+int crush_bucket_remove_item(struct crush_map *map, struct crush_bucket *b, int item)
 {
 	/* invalidate perm cache */
 	b->perm_n = 0;
@@ -933,7 +1048,7 @@ int crush_bucket_remove_item(struct crush_bucket *b, int item)
 	case CRUSH_BUCKET_TREE:
 		return crush_remove_tree_bucket_item((struct crush_bucket_tree *)b, item);
 	case CRUSH_BUCKET_STRAW:
-		return crush_remove_straw_bucket_item((struct crush_bucket_straw *)b, item);
+		return crush_remove_straw_bucket_item(map, (struct crush_bucket_straw *)b, item);
 	default:
 		return -1;
 	}
@@ -1002,7 +1117,9 @@ int crush_adjust_tree_bucket_item_weight(struct crush_bucket_tree *bucket, int i
 	return diff;
 }
 
-int crush_adjust_straw_bucket_item_weight(struct crush_bucket_straw *bucket, int item, int weight)
+int crush_adjust_straw_bucket_item_weight(struct crush_map *map,
+					  struct crush_bucket_straw *bucket,
+					  int item, int weight)
 {
 	unsigned idx;
 	int diff;
@@ -1018,14 +1135,16 @@ int crush_adjust_straw_bucket_item_weight(struct crush_bucket_straw *bucket, int
 	bucket->item_weights[idx] = weight;
 	bucket->h.weight += diff;
 
-	r = crush_calc_straw(bucket);
+	r = crush_calc_straw(map, bucket);
         if (r < 0)
                 return r;
 
 	return diff;
 }
 
-int crush_bucket_adjust_item_weight(struct crush_bucket *b, int item, int weight)
+int crush_bucket_adjust_item_weight(struct crush_map *map,
+				    struct crush_bucket *b,
+				    int item, int weight)
 {
 	switch (b->alg) {
 	case CRUSH_BUCKET_UNIFORM:
@@ -1038,7 +1157,8 @@ int crush_bucket_adjust_item_weight(struct crush_bucket *b, int item, int weight
 		return crush_adjust_tree_bucket_item_weight((struct crush_bucket_tree *)b,
 							    item, weight);
 	case CRUSH_BUCKET_STRAW:
-		return crush_adjust_straw_bucket_item_weight((struct crush_bucket_straw *)b,
+		return crush_adjust_straw_bucket_item_weight(map,
+							     (struct crush_bucket_straw *)b,
 							     item, weight);
 	default:
 		return -1;
@@ -1141,6 +1261,7 @@ static int crush_reweight_straw_bucket(struct crush_map *crush, struct crush_buc
 
                 bucket->h.weight += bucket->item_weights[i];
 	}
+	crush_calc_straw(crush, bucket);
 
 	return 0;
 }
diff --git a/src/crush/builder.h b/src/crush/builder.h
index 1003c35..efd7c8a 100644
--- a/src/crush/builder.h
+++ b/src/crush/builder.h
@@ -16,12 +16,12 @@ extern int crush_get_next_bucket_id(struct crush_map *map);
 extern int crush_add_bucket(struct crush_map *map,
 			    int bucketno,
 			    struct crush_bucket *bucket, int *idout);
-struct crush_bucket *crush_make_bucket(int alg, int hash, int type, int size, int *items, int *weights);
-extern int crush_bucket_add_item(struct crush_bucket *bucket, int item, int weight);
-extern int crush_bucket_adjust_item_weight(struct crush_bucket *bucket, int item, int weight);
+struct crush_bucket *crush_make_bucket(struct crush_map *map, int alg, int hash, int type, int size, int *items, int *weights);
+extern int crush_bucket_add_item(struct crush_map *map, struct crush_bucket *bucket, int item, int weight);
+extern int crush_bucket_adjust_item_weight(struct crush_map *map, struct crush_bucket *bucket, int item, int weight);
 extern int crush_reweight_bucket(struct crush_map *crush, struct crush_bucket *bucket);
 extern int crush_remove_bucket(struct crush_map *map, struct crush_bucket *bucket);
-extern int crush_bucket_remove_item(struct crush_bucket *bucket, int item);
+extern int crush_bucket_remove_item(struct crush_map *map, struct crush_bucket *bucket, int item);
 
 struct crush_bucket_uniform *
 crush_make_uniform_bucket(int hash, int type, int size,
@@ -36,7 +36,8 @@ crush_make_tree_bucket(int hash, int type, int size,
 		       int *items,    /* in leaf order */
 		       int *weights);
 struct crush_bucket_straw *
-crush_make_straw_bucket(int hash, int type, int size,
+crush_make_straw_bucket(struct crush_map *map,
+			int hash, int type, int size,
 			int *items,
 			int *weights);
 
diff --git a/src/crush/crush.h b/src/crush/crush.h
index 8bac92a..712d534 100644
--- a/src/crush/crush.h
+++ b/src/crush/crush.h
@@ -26,6 +26,8 @@
 #define CRUSH_MAGIC 0x00010000ul   /* for detecting algorithm revisions */
 
 #define CRUSH_MAX_DEPTH 10  /* max crush hierarchy depth */
+#define CRUSH_MAX_RULESET (1<<8) /*max crush ruleset number*/
+#define CRUSH_MAX_RULES	CRUSH_MAX_RULESET /*max crush rules, should be the same as max rulesets*/
 
 #define CRUSH_MAX_DEVICE_WEIGHT (100u * 0x10000u)
 #define CRUSH_MAX_BUCKET_WEIGHT (65535u * 0x10000u)
@@ -189,6 +191,12 @@ struct crush_map {
 	 * mappings line up a bit better with previous mappings. */
 	__u8 chooseleaf_vary_r;
 
+	/*
+	 * version 0 (original) of straw_calc has various flaws.  version 1
+	 * fixes a few of them.
+	 */
+	__u8 straw_calc_version;
+
 	__u32 *choose_tries;
 };
 
diff --git a/src/crush/mapper.c b/src/crush/mapper.c
index 22cde51..327668f 100644
--- a/src/crush/mapper.c
+++ b/src/crush/mapper.c
@@ -291,6 +291,7 @@ static int is_out(const struct crush_map *map,
  * @type: the type of item to choose
  * @out: pointer to output vector
  * @outpos: our position in that vector
+ * @out_size: size of the out vector
  * @tries: number of attempts to make
  * @recurse_tries: number of attempts to have recursive chooseleaf make
  * @local_retries: localized retries
@@ -305,6 +306,7 @@ static int crush_choose_firstn(const struct crush_map *map,
 			       const __u32 *weight, int weight_max,
 			       int x, int numrep, int type,
 			       int *out, int outpos,
+			       int out_size,
 			       unsigned int tries,
 			       unsigned int recurse_tries,
 			       unsigned int local_retries,
@@ -323,6 +325,7 @@ static int crush_choose_firstn(const struct crush_map *map,
 	int item = 0;
 	int itemtype;
 	int collide, reject;
+	int count = out_size;
 
 	dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d tries %d recurse_tries %d local_retries %d local_fallback_retries %d parent_r %d\n",
 		recurse_to_leaf ? "_LEAF" : "",
@@ -330,7 +333,7 @@ static int crush_choose_firstn(const struct crush_map *map,
 		tries, recurse_tries, local_retries, local_fallback_retries,
 		parent_r);
 
-	for (rep = outpos; rep < numrep; rep++) {
+	for (rep = outpos; rep < numrep && count > 0 ; rep++) {
 		/* keep trying until we get a non-out, non-colliding item */
 		ftotal = 0;
 		skip_rep = 0;
@@ -404,7 +407,7 @@ static int crush_choose_firstn(const struct crush_map *map,
 							 map->buckets[-1-item],
 							 weight, weight_max,
 							 x, outpos+1, 0,
-							 out2, outpos,
+							 out2, outpos, count,
 							 recurse_tries, 0,
 							 local_retries,
 							 local_fallback_retries,
@@ -464,6 +467,7 @@ reject:
 		dprintk("CHOOSE got %d\n", item);
 		out[outpos] = item;
 		outpos++;
+		count--;
 
 		if (map->choose_tries && ftotal <= map->choose_total_tries)
 			map->choose_tries[ftotal]++;
@@ -686,6 +690,7 @@ int crush_do_rule(const struct crush_map *map,
 	__u32 step;
 	int i, j;
 	int numrep;
+	int out_size;
 	/*
 	 * the original choose_total_tries value was off by one (it
 	 * counted "retries" and not "tries").  add one.
@@ -793,6 +798,7 @@ int crush_do_rule(const struct crush_map *map,
 						x, numrep,
 						curstep->arg2,
 						o+osize, j,
+						result_max-osize,
 						choose_tries,
 						recurse_tries,
 						choose_local_retries,
@@ -802,11 +808,13 @@ int crush_do_rule(const struct crush_map *map,
 						c+osize,
 						0);
 				} else {
+					out_size = ((numrep < (result_max-osize)) ?
+                                                    numrep : (result_max-osize));
 					crush_choose_indep(
 						map,
 						map->buckets[-1-w[i]],
 						weight, weight_max,
-						x, numrep, numrep,
+						x, out_size, numrep,
 						curstep->arg2,
 						o+osize, j,
 						choose_tries,
@@ -815,7 +823,7 @@ int crush_do_rule(const struct crush_map *map,
 						recurse_to_leaf,
 						c+osize,
 						0);
-					osize += numrep;
+					osize += out_size;
 				}
 			}
 
diff --git a/src/include/ceph_features.h b/src/include/ceph_features.h
index 6b2a5fb..c3dfcab 100644
--- a/src/include/ceph_features.h
+++ b/src/include/ceph_features.h
@@ -52,6 +52,7 @@
 #define CEPH_FEATURE_OSD_PRIMARY_AFFINITY (1ULL<<41)  /* overlap w/ tunables3 */
 #define CEPH_FEATURE_MSGR_KEEPALIVE2   (1ULL<<42)
 #define CEPH_FEATURE_OSD_POOLRESEND    (1ULL<<43)
+#define CEPH_FEATURE_OSD_SET_ALLOC_HINT (1ULL<<45)
 
 /*
  * The introduction of CEPH_FEATURE_OSD_SNAPMAPPER caused the feature
@@ -124,6 +125,7 @@ static inline unsigned long long ceph_sanitize_features(unsigned long long f) {
 	 CEPH_FEATURE_OSD_PRIMARY_AFFINITY |	\
 	 CEPH_FEATURE_MSGR_KEEPALIVE2 |	\
 	 CEPH_FEATURE_OSD_POOLRESEND |	\
+         CEPH_FEATURE_OSD_SET_ALLOC_HINT |   \
 	 0ULL)
 
 #define CEPH_FEATURES_SUPPORTED_DEFAULT  CEPH_FEATURES_ALL
diff --git a/src/include/ceph_fs.h b/src/include/ceph_fs.h
index 10b52a5..d16df62 100644
--- a/src/include/ceph_fs.h
+++ b/src/include/ceph_fs.h
@@ -505,8 +505,10 @@ struct ceph_mds_reply_dirfrag {
 	__le32 dist[];
 } __attribute__ ((packed));
 
-#define CEPH_LOCK_FCNTL    1
-#define CEPH_LOCK_FLOCK    2
+#define CEPH_LOCK_FCNTL		1
+#define CEPH_LOCK_FLOCK		2
+#define CEPH_LOCK_FCNTL_INTR	3
+#define CEPH_LOCK_FLOCK_INTR	4
 
 #define CEPH_LOCK_SHARED   1
 #define CEPH_LOCK_EXCL     2
diff --git a/src/include/util.h b/src/include/util.h
index b30132e..4e4476a 100644
--- a/src/include/util.h
+++ b/src/include/util.h
@@ -4,17 +4,41 @@
  * Ceph - scalable distributed file system
  *
  * Copyright (C) 2012 Inktank Storage, Inc.
+ * Copyright (C) 2014 Red Hat <contact at redhat.com>
  *
  * This is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License version 2.1, as published by the Free Software 
  * Foundation.  See file COPYING.
  */
+#ifndef CEPH_UTIL_H
+#define CEPH_UTIL_H
 
 // is buf~len completely zero (in 8-byte chunks)
 
+#include "common/Formatter.h"
 #include "include/types.h"
 
 bool buf_is_zero(const char *buf, size_t len);
 
 int64_t unit_to_bytesize(string val, ostream *pss);
+
+struct ceph_data_stats
+{
+  uint64_t byte_total;
+  uint64_t byte_used;
+  uint64_t byte_avail;
+  int avail_percent;
+
+  void dump(Formatter *f) const {
+    assert(f != NULL);
+    f->dump_int("total", byte_total);
+    f->dump_int("used", byte_used);
+    f->dump_int("avail", byte_avail);
+    f->dump_int("avail_percent", avail_percent);
+  }
+};
+typedef struct ceph_data_stats ceph_data_stats_t;
+
+int get_fs_stats(ceph_data_stats_t &stats, const char *path);
+#endif /* CEPH_UTIL_H */
diff --git a/src/init-radosgw.sysv b/src/init-radosgw.sysv
index dd3dbb0..2486539 100644
--- a/src/init-radosgw.sysv
+++ b/src/init-radosgw.sysv
@@ -85,10 +85,10 @@ case "$1" in
             fi
 
             if [ $SYSTEMD -eq 1 ]; then
-                systemd-run -r bash -c "ulimit -n 32768; $RADOSGW -n $name"
+                systemd-run -r sudo -u "$user" bash -c "ulimit -n 32768; $RADOSGW -n $name"
             else
-                #start-stop-daemon --start -u $user -x $RADOSGW -- -n $name
-                daemon --user="$user" "ulimit -n 32768; $RADOSGW -n $name"
+		ulimit -n 32768
+                daemon --user="$user" "$RADOSGW -n $name"
             fi
             echo "Starting $name..."
         done
diff --git a/src/librbd/AioCompletion.h b/src/librbd/AioCompletion.h
index aaccefe..e28cd6a 100644
--- a/src/librbd/AioCompletion.h
+++ b/src/librbd/AioCompletion.h
@@ -93,6 +93,10 @@ namespace librbd {
 
     void init_time(ImageCtx *i, aio_type_t t) {
       ictx = i;
+      {
+        Mutex::Locker l(ictx->aio_lock);
+        ++ictx->pending_aio;
+      }
       aio_type = t;
       start_time = ceph_clock_now(ictx->cct);
     }
@@ -114,6 +118,14 @@ namespace librbd {
 	lderr(ictx->cct) << "completed invalid aio_type: " << aio_type << dendl;
 	break;
       }
+
+      {
+        Mutex::Locker l(ictx->aio_lock);
+        assert(ictx->pending_aio != 0);
+        --ictx->pending_aio;
+        ictx->pending_aio_cond.Signal();
+      }
+
       if (complete_cb) {
 	complete_cb(rbd_comp, complete_arg);
       }
diff --git a/src/librbd/ImageCtx.cc b/src/librbd/ImageCtx.cc
index b5c2db6..8fb8e37 100644
--- a/src/librbd/ImageCtx.cc
+++ b/src/librbd/ImageCtx.cc
@@ -45,13 +45,15 @@ namespace librbd {
       snap_lock("librbd::ImageCtx::snap_lock"),
       parent_lock("librbd::ImageCtx::parent_lock"),
       refresh_lock("librbd::ImageCtx::refresh_lock"),
+      aio_lock("librbd::ImageCtx::aio_lock"),
       extra_read_flags(0),
       old_format(true),
       order(0), size(0), features(0),
       format_string(NULL),
       id(image_id), parent(NULL),
       stripe_unit(0), stripe_count(0),
-      object_cacher(NULL), writeback_handler(NULL), object_set(NULL)
+      object_cacher(NULL), writeback_handler(NULL), object_set(NULL),
+      pending_aio(0)
   {
     md_ctx.dup(p);
     data_ctx.dup(p);
@@ -586,6 +588,7 @@ namespace librbd {
     int r = flush_cache();
     if (r)
       lderr(cct) << "flush_cache returned " << r << dendl;
+    wait_for_pending_aio();
     cache_lock.Lock();
     bool unclean = object_cacher->release_set(object_set);
     cache_lock.Unlock();
@@ -655,5 +658,12 @@ namespace librbd {
 		   << ", object overlap " << len
 		   << " from image extents " << objectx << dendl;
     return len;
- }
+  }
+
+  void ImageCtx::wait_for_pending_aio() {
+    Mutex::Locker l(aio_lock);
+    while (pending_aio > 0) {
+      pending_aio_cond.Wait(aio_lock);
+    }
+  }
 }
diff --git a/src/librbd/ImageCtx.h b/src/librbd/ImageCtx.h
index 83ed044..5a0d637 100644
--- a/src/librbd/ImageCtx.h
+++ b/src/librbd/ImageCtx.h
@@ -10,6 +10,7 @@
 #include <string>
 #include <vector>
 
+#include "common/Cond.h"
 #include "common/Mutex.h"
 #include "common/RWLock.h"
 #include "common/snap_types.h"
@@ -59,7 +60,8 @@ namespace librbd {
 
     /**
      * Lock ordering:
-     * md_lock, cache_lock, snap_lock, parent_lock, refresh_lock
+     * md_lock, cache_lock, snap_lock, parent_lock, refresh_lock,
+     * aio_lock
      */
     RWLock md_lock; // protects access to the mutable image metadata that
                    // isn't guarded by other locks below
@@ -68,6 +70,7 @@ namespace librbd {
     RWLock snap_lock; // protects snapshot-related member variables:
     RWLock parent_lock; // protects parent_md and parent
     Mutex refresh_lock; // protects refresh_seq and last_refresh
+    Mutex aio_lock; // protects pending_aio and pending_aio_cond
 
     unsigned extra_read_flags;
 
@@ -89,6 +92,9 @@ namespace librbd {
     LibrbdWriteback *writeback_handler;
     ObjectCacher::ObjectSet *object_set;
 
+    Cond pending_aio_cond;
+    uint64_t pending_aio;
+
     /**
      * Either image_name or image_id must be set.
      * If id is not known, pass the empty std::string,
@@ -147,7 +153,7 @@ namespace librbd {
 			 librados::snap_t in_snap_id);
     uint64_t prune_parent_extents(vector<pair<uint64_t,uint64_t> >& objectx,
 				  uint64_t overlap);
-
+    void wait_for_pending_aio();
   };
 }
 
diff --git a/src/librbd/internal.cc b/src/librbd/internal.cc
index afa4660..598d515 100644
--- a/src/librbd/internal.cc
+++ b/src/librbd/internal.cc
@@ -419,7 +419,15 @@ namespace librbd {
     for (std::list<string>::const_iterator it = pools.begin();
 	 it != pools.end(); ++it) {
       IoCtx ioctx;
-      rados.ioctx_create(it->c_str(), ioctx);
+      r = rados.ioctx_create(it->c_str(), ioctx);
+      if (r == -ENOENT) {
+        ldout(cct, 1) << "pool " << *it << " no longer exists" << dendl;
+        continue;
+      } else if (r < 0) {
+        lderr(cct) << "Error accessing child image pool " << *it << dendl;
+        return r;
+      }
+
       set<string> image_ids;
       int r = cls_client::get_children(&ioctx, RBD_CHILDREN,
 				       parent_spec, image_ids);
@@ -633,32 +641,46 @@ namespace librbd {
     parent_spec pspec(ictx->md_ctx.get_id(), ictx->id, snap_id);
     // search all pools for children depending on this snapshot
     Rados rados(ictx->md_ctx);
-    std::list<std::string> pools;
-    rados.pool_list(pools);
-    std::set<std::string> children;
-    for (std::list<std::string>::const_iterator it = pools.begin(); it != pools.end(); ++it) {
-      IoCtx pool_ioctx;
-      r = rados.ioctx_create(it->c_str(), pool_ioctx);
-      if (r < 0) {
-	lderr(ictx->cct) << "snap_unprotect: can't create ioctx for pool "
-			 << *it << dendl;
-	goto reprotect_and_return_err;
-      }
-      r = cls_client::get_children(&pool_ioctx, RBD_CHILDREN, pspec, children);
-      // key should not exist for this parent if there is no entry
-      if (((r < 0) && (r != -ENOENT))) {
-	lderr(ictx->cct) << "can't get children for pool " << *it << dendl;
-	goto reprotect_and_return_err;
-      }
-      // if we found a child, can't unprotect
-      if (r == 0) {
-	lderr(ictx->cct) << "snap_unprotect: can't unprotect; at least " 
-	  << children.size() << " child(ren) in pool " << it->c_str() << dendl;
-	r = -EBUSY;
-	goto reprotect_and_return_err;
+
+    // protect against pools being renamed/deleted
+    bool retry_pool_check;
+    do {
+      retry_pool_check = false;
+
+      std::list<std::string> pools;
+      rados.pool_list(pools);
+      for (std::list<std::string>::const_iterator it = pools.begin(); it != pools.end(); ++it) {
+	IoCtx pool_ioctx;
+	r = rados.ioctx_create(it->c_str(), pool_ioctx);
+        if (r == -ENOENT) {
+          ldout(ictx->cct, 1) << "pool " << *it << " no longer exists" << dendl;
+          retry_pool_check = true;
+          break;
+        } else if (r < 0) {
+          lderr(ictx->cct) << "snap_unprotect: can't create ioctx for pool "
+			   << *it << dendl;
+          goto reprotect_and_return_err;
+        }
+
+	std::set<std::string> children;
+	r = cls_client::get_children(&pool_ioctx, RBD_CHILDREN, pspec, children);
+	// key should not exist for this parent if there is no entry
+	if (((r < 0) && (r != -ENOENT))) {
+	  lderr(ictx->cct) << "can't get children for pool " << *it << dendl;
+	  goto reprotect_and_return_err;
+	}
+	// if we found a child, can't unprotect
+	if (r == 0) {
+	  lderr(ictx->cct) << "snap_unprotect: can't unprotect; at least "
+			   << children.size() << " child(ren) in pool "
+			   << it->c_str() << dendl;
+	  r = -EBUSY;
+	  goto reprotect_and_return_err;
+	}
+	pool_ioctx.close();	// last one out will self-destruct
       }
-      pool_ioctx.close();	// last one out will self-destruct
-    }
+    } while(retry_pool_check);
+
     // didn't find any child in any pool, go ahead with unprotect
     r = cls_client::set_protection_status(&ictx->md_ctx,
 					  ictx->header_oid,
@@ -1260,7 +1282,6 @@ reprotect_and_return_err:
     if (r < 0) {
       lderr(ictx->cct) << "error opening parent image: " << cpp_strerror(r)
 		       << dendl;
-      close_image(ictx->parent);
       ictx->parent = NULL;
       return r;
     }
@@ -2118,10 +2139,12 @@ reprotect_and_return_err:
   void close_image(ImageCtx *ictx)
   {
     ldout(ictx->cct, 20) << "close_image " << ictx << dendl;
-    if (ictx->object_cacher)
+    if (ictx->object_cacher) {
       ictx->shutdown_cache(); // implicitly flushes
-    else
+    } else {
       flush(ictx);
+      ictx->wait_for_pending_aio();
+    }
 
     if (ictx->parent) {
       close_image(ictx->parent);
diff --git a/src/mds/CInode.h b/src/mds/CInode.h
index cb1add3..dce2dfe 100644
--- a/src/mds/CInode.h
+++ b/src/mds/CInode.h
@@ -442,6 +442,7 @@ private:
     parent(0),
     inode_auth(CDIR_AUTH_DEFAULT),
     replica_caps_wanted(0),
+    fcntl_locks(g_ceph_context), flock_locks(g_ceph_context),
     item_dirty(this), item_caps(this), item_open_file(this), item_dirty_parent(this),
     item_dirty_dirfrag_dir(this), 
     item_dirty_dirfrag_nest(this), 
diff --git a/src/mds/Dumper.cc b/src/mds/Dumper.cc
index f7f18c9..a1b393e 100644
--- a/src/mds/Dumper.cc
+++ b/src/mds/Dumper.cc
@@ -160,7 +160,7 @@ void Dumper::undump(const char *dump_file)
   inodeno_t ino = MDS_INO_LOG_OFFSET + rank;
 
   Journaler::Header h;
-  h.trimmed_pos = start;
+  h.trimmed_pos = start - (start % g_default_file_layout.fl_object_size);
   h.expire_pos = start;
   h.write_pos = start+len;
   h.magic = CEPH_FS_ONDISK_MAGIC;
@@ -175,18 +175,14 @@ void Dumper::undump(const char *dump_file)
   object_locator_t oloc(mdsmap->get_metadata_pool());
   SnapContext snapc;
 
-  bool done = false;
-  Cond cond;
-  
   cout << "writing header " << oid << std::endl;
+  C_SaferCond header_cond;
+  lock.Lock();
   objecter->write_full(oid, oloc, snapc, hbl, ceph_clock_now(g_ceph_context), 0, 
 		       NULL, 
-		       new C_SafeCond(&lock, &cond, &done));
-
-  lock.Lock();
-  while (!done)
-    cond.Wait(lock);
+		       &header_cond);
   lock.Unlock();
+  header_cond.wait();
   
   // read
   Filer filer(objecter);
@@ -198,13 +194,12 @@ void Dumper::undump(const char *dump_file)
     uint64_t l = MIN(left, 1024*1024);
     j.read_fd(fd, l);
     cout << " writing " << pos << "~" << l << std::endl;
-    filer.write(ino, &h.layout, snapc, pos, l, j, ceph_clock_now(g_ceph_context), 0, NULL, new C_SafeCond(&lock, &cond, &done));
-
+    C_SaferCond body_cond;
     lock.Lock();
-    while (!done)
-      cond.Wait(lock);
+    filer.write(ino, &h.layout, snapc, pos, l, j, ceph_clock_now(g_ceph_context), 0, NULL, &body_cond);
     lock.Unlock();
-    
+    body_cond.wait();
+
     pos += l;
     left -= l;
   }
diff --git a/src/mds/Locker.cc b/src/mds/Locker.cc
index 74305b9..19907b3 100644
--- a/src/mds/Locker.cc
+++ b/src/mds/Locker.cc
@@ -2689,6 +2689,9 @@ static uint64_t calc_bounding(uint64_t t)
   return t + 1;
 }
 
+/**
+ * m and ack might be NULL, so don't dereference them unless dirty != 0
+ */
 void Locker::_do_snap_update(CInode *in, snapid_t snap, int dirty, snapid_t follows, client_t client, MClientCaps *m, MClientCaps *ack)
 {
   dout(10) << "_do_snap_update dirty " << ccap_string(dirty)
@@ -2766,14 +2769,22 @@ void Locker::_do_snap_update(CInode *in, snapid_t snap, int dirty, snapid_t foll
 							   client, NULL, ack));
 }
 
-
+/**
+ * m might be NULL, so don't dereference it unless dirty != 0.
+ */
 void Locker::_update_cap_fields(CInode *in, int dirty, MClientCaps *m, inode_t *pi)
 {
+
+  if (dirty && m->get_ctime() > pi->ctime) {
+    dout(7) << "  ctime " << pi->ctime << " -> " << m->get_ctime()
+	    << " for " << *in << dendl;
+    pi->ctime = m->get_ctime();
+  }
+
   // file
   if (dirty & (CEPH_CAP_FILE_EXCL|CEPH_CAP_FILE_WR)) {
     utime_t atime = m->get_atime();
     utime_t mtime = m->get_mtime();
-    utime_t ctime = m->get_ctime();
     uint64_t size = m->get_size();
     version_t inline_version = m->inline_version;
     
@@ -2783,11 +2794,6 @@ void Locker::_update_cap_fields(CInode *in, int dirty, MClientCaps *m, inode_t *
 	      << " for " << *in << dendl;
       pi->mtime = mtime;
     }
-    if (ctime > pi->ctime) {
-      dout(7) << "  ctime " << pi->ctime << " -> " << ctime
-	      << " for " << *in << dendl;
-      pi->ctime = ctime;
-    }
     if (in->inode.is_file() &&   // ONLY if regular file
 	size > pi->size) {
       dout(7) << "  size " << pi->size << " -> " << size
diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
index d6cfebd..6c52fbd 100644
--- a/src/mds/MDCache.cc
+++ b/src/mds/MDCache.cc
@@ -369,6 +369,8 @@ void MDCache::create_mydir_hierarchy(C_Gather *gather)
   CDir *mydir = my->get_or_open_dirfrag(this, frag_t());
   adjust_subtree_auth(mydir, mds->whoami);   
 
+  LogSegment *ls = mds->mdlog->get_current_segment();
+
   // stray dir
   for (int i = 0; i < NUM_STRAY; ++i) {
     CInode *stray = create_system_inode(MDS_INO_STRAY(mds->whoami, i), S_IFDIR);
@@ -384,8 +386,10 @@ void MDCache::create_mydir_hierarchy(C_Gather *gather)
     mydir->fnode.fragstat.nsubdirs++;
     // save them
     straydir->mark_complete();
-    straydir->mark_dirty(straydir->pre_dirty(), mds->mdlog->get_current_segment());
+    straydir->mark_dirty(straydir->pre_dirty(), ls);
     straydir->commit(0, gather->new_sub());
+    stray->_mark_dirty_parent(ls, true);
+    stray->store_backtrace(gather->new_sub());
   }
 
   CInode *journal = create_system_inode(MDS_INO_LOG_OFFSET + mds->whoami, S_IFREG);
@@ -405,7 +409,7 @@ void MDCache::create_mydir_hierarchy(C_Gather *gather)
 
 
   mydir->mark_complete();
-  mydir->mark_dirty(mydir->pre_dirty(), mds->mdlog->get_current_segment());
+  mydir->mark_dirty(mydir->pre_dirty(), ls);
   mydir->commit(0, gather->new_sub());
 
   myin->store(gather->new_sub());
diff --git a/src/mds/Makefile.am b/src/mds/Makefile.am
index 4ee3500..dd2a2f3 100644
--- a/src/mds/Makefile.am
+++ b/src/mds/Makefile.am
@@ -4,7 +4,6 @@ libmds_la_SOURCES = \
 	mds/Dumper.cc \
 	mds/Resetter.cc \
 	mds/MDS.cc \
-	mds/flock.cc \
 	mds/locks.c \
 	mds/journal.cc \
 	mds/Server.cc \
diff --git a/src/mds/Server.cc b/src/mds/Server.cc
index 64004b2..80a0fba 100644
--- a/src/mds/Server.cc
+++ b/src/mds/Server.cc
@@ -3085,20 +3085,28 @@ void Server::handle_client_file_setlock(MDRequestRef& mdr)
   dout(0) << "handle_client_file_setlock: " << set_lock << dendl;
 
   ceph_lock_state_t *lock_state = NULL;
+  bool interrupt = false;
 
   // get the appropriate lock state
   switch (req->head.args.filelock_change.rule) {
+  case CEPH_LOCK_FLOCK_INTR:
+    interrupt = true;
+    // fall-thru
   case CEPH_LOCK_FLOCK:
     lock_state = &cur->flock_locks;
     break;
 
+  case CEPH_LOCK_FCNTL_INTR:
+    interrupt = true;
+    // fall-thru
   case CEPH_LOCK_FCNTL:
     lock_state = &cur->fcntl_locks;
     break;
 
   default:
-    dout(0) << "got unknown lock type " << set_lock.type
-	    << ", dropping request!" << dendl;
+    dout(10) << "got unknown lock type " << set_lock.type
+	     << ", dropping request!" << dendl;
+    reply_request(mdr, -EOPNOTSUPP);
     return;
   }
 
@@ -3109,16 +3117,15 @@ void Server::handle_client_file_setlock(MDRequestRef& mdr)
     if (lock_state->is_waiting(set_lock)) {
       dout(10) << " unlock removing waiting lock " << set_lock << dendl;
       lock_state->remove_waiting(set_lock);
-    } else {
+      cur->take_waiting(CInode::WAIT_FLOCK, waiters);
+    } else if (!interrupt) {
       dout(10) << " unlock attempt on " << set_lock << dendl;
       lock_state->remove_lock(set_lock, activated_locks);
       cur->take_waiting(CInode::WAIT_FLOCK, waiters);
     }
-    reply_request(mdr, 0);
-    /* For now we're ignoring the activated locks because their responses
-     * will be sent when the lock comes up again in rotation by the MDS.
-     * It's a cheap hack, but it's easy to code. */
     mds->queue_waiters(waiters);
+
+    reply_request(mdr, 0);
   } else {
     dout(10) << " lock attempt on " << set_lock << dendl;
     if (mdr->more()->flock_was_waiting &&
diff --git a/src/mds/flock.cc b/src/mds/flock.cc
index 4e825c9..2849c1c 100644
--- a/src/mds/flock.cc
+++ b/src/mds/flock.cc
@@ -44,33 +44,33 @@ void ceph_lock_state_t::remove_waiting(ceph_filelock& fl)
 bool ceph_lock_state_t::add_lock(ceph_filelock& new_lock,
                                  bool wait_on_fail, bool replay)
 {
-  dout(15) << "add_lock " << new_lock << dendl;
+  ldout(cct,15) << "add_lock " << new_lock << dendl;
   bool ret = false;
   list<multimap<uint64_t, ceph_filelock>::iterator>
     overlapping_locks, self_overlapping_locks, neighbor_locks;
 
   // first, get any overlapping locks and split them into owned-by-us and not
   if (get_overlapping_locks(new_lock, overlapping_locks, &neighbor_locks)) {
-    dout(15) << "got overlapping lock, splitting by owner" << dendl;
+    ldout(cct,15) << "got overlapping lock, splitting by owner" << dendl;
     split_by_owner(new_lock, overlapping_locks, self_overlapping_locks);
   }
   if (!overlapping_locks.empty()) { //overlapping locks owned by others :(
     if (CEPH_LOCK_EXCL == new_lock.type) {
       //can't set, we want an exclusive
-      dout(15) << "overlapping lock, and this lock is exclusive, can't set"
+      ldout(cct,15) << "overlapping lock, and this lock is exclusive, can't set"
               << dendl;
       if (wait_on_fail && !replay) {
         waiting_locks.insert(pair<uint64_t, ceph_filelock>(new_lock.start, new_lock));
       }
     } else { //shared lock, check for any exclusive locks blocking us
       if (contains_exclusive_lock(overlapping_locks)) { //blocked :(
-        dout(15) << " blocked by exclusive lock in overlapping_locks" << dendl;
+        ldout(cct,15) << " blocked by exclusive lock in overlapping_locks" << dendl;
         if (wait_on_fail && !replay) {
           waiting_locks.insert(pair<uint64_t, ceph_filelock>(new_lock.start, new_lock));
         }
       } else {
         //yay, we can insert a shared lock
-        dout(15) << "inserting shared lock" << dendl;
+        ldout(cct,15) << "inserting shared lock" << dendl;
         remove_waiting(new_lock);
         adjust_locks(self_overlapping_locks, new_lock, neighbor_locks);
         held_locks.insert(pair<uint64_t, ceph_filelock>(new_lock.start, new_lock));
@@ -80,7 +80,7 @@ bool ceph_lock_state_t::add_lock(ceph_filelock& new_lock,
   } else { //no overlapping locks except our own
     remove_waiting(new_lock);
     adjust_locks(self_overlapping_locks, new_lock, neighbor_locks);
-    dout(15) << "no conflicts, inserting " << new_lock << dendl;
+    ldout(cct,15) << "no conflicts, inserting " << new_lock << dendl;
     held_locks.insert(pair<uint64_t, ceph_filelock>
                       (new_lock.start, new_lock));
     ret = true;
@@ -123,9 +123,9 @@ void ceph_lock_state_t::remove_lock(ceph_filelock removal_lock,
   list<multimap<uint64_t, ceph_filelock>::iterator> overlapping_locks,
     self_overlapping_locks;
   if (get_overlapping_locks(removal_lock, overlapping_locks)) {
-    dout(15) << "splitting by owner" << dendl;
+    ldout(cct,15) << "splitting by owner" << dendl;
     split_by_owner(removal_lock, overlapping_locks, self_overlapping_locks);
-  } else dout(15) << "attempt to remove lock at " << removal_lock.start
+  } else ldout(cct,15) << "attempt to remove lock at " << removal_lock.start
                  << " but no locks there!" << dendl;
   bool remove_to_end = (0 == removal_lock.length);
   uint64_t removal_start = removal_lock.start;
@@ -134,13 +134,13 @@ void ceph_lock_state_t::remove_lock(ceph_filelock removal_lock,
   __s64 old_lock_client = 0;
   ceph_filelock *old_lock;
 
-  dout(15) << "examining " << self_overlapping_locks.size()
+  ldout(cct,15) << "examining " << self_overlapping_locks.size()
           << " self-overlapping locks for removal" << dendl;
   for (list<multimap<uint64_t, ceph_filelock>::iterator>::iterator
          iter = self_overlapping_locks.begin();
        iter != self_overlapping_locks.end();
        ++iter) {
-    dout(15) << "self overlapping lock " << (*iter)->second << dendl;
+    ldout(cct,15) << "self overlapping lock " << (*iter)->second << dendl;
     old_lock = &(*iter)->second;
     bool old_lock_to_end = (0 == old_lock->length);
     old_lock_end = old_lock->start + old_lock->length - 1;
@@ -149,7 +149,7 @@ void ceph_lock_state_t::remove_lock(ceph_filelock removal_lock,
       if (old_lock->start < removal_start) {
         old_lock->length = removal_start - old_lock->start;
       } else {
-        dout(15) << "erasing " << (*iter)->second << dendl;
+        ldout(cct,15) << "erasing " << (*iter)->second << dendl;
         held_locks.erase(*iter);
         --client_held_lock_counts[old_lock_client];
       }
@@ -160,7 +160,7 @@ void ceph_lock_state_t::remove_lock(ceph_filelock removal_lock,
                         (append_lock.start, append_lock));
       ++client_held_lock_counts[(client_t)old_lock->client];
       if (old_lock->start >= removal_start) {
-        dout(15) << "erasing " << (*iter)->second << dendl;
+        ldout(cct,15) << "erasing " << (*iter)->second << dendl;
         held_locks.erase(*iter);
         --client_held_lock_counts[old_lock_client];
       } else old_lock->length = removal_start - old_lock->start;
@@ -176,7 +176,7 @@ void ceph_lock_state_t::remove_lock(ceph_filelock removal_lock,
       if (old_lock->start < removal_start) {
         old_lock->length = removal_start - old_lock->start;
       } else {
-        dout(15) << "erasing " << (*iter)->second << dendl;
+        ldout(cct,15) << "erasing " << (*iter)->second << dendl;
         held_locks.erase(*iter);
         --client_held_lock_counts[old_lock_client];
       }
@@ -207,7 +207,7 @@ void ceph_lock_state_t::adjust_locks(list<multimap<uint64_t, ceph_filelock>::ite
                   list<multimap<uint64_t, ceph_filelock>::iterator>
                   neighbor_locks)
 {
-  dout(15) << "adjust_locks" << dendl;
+  ldout(cct,15) << "adjust_locks" << dendl;
   bool new_lock_to_end = (0 == new_lock.length);
   uint64_t new_lock_start = new_lock.start;
   uint64_t new_lock_end = new_lock.start + new_lock.length - 1;
@@ -219,7 +219,7 @@ void ceph_lock_state_t::adjust_locks(list<multimap<uint64_t, ceph_filelock>::ite
        iter != old_locks.end();
        ++iter) {
     old_lock = &(*iter)->second;
-    dout(15) << "adjusting lock: " << *old_lock << dendl;
+    ldout(cct,15) << "adjusting lock: " << *old_lock << dendl;
     bool old_lock_to_end = (0 == old_lock->length);
     old_lock_start = old_lock->start;
     old_lock_end = old_lock->start + old_lock->length - 1;
@@ -228,17 +228,17 @@ void ceph_lock_state_t::adjust_locks(list<multimap<uint64_t, ceph_filelock>::ite
     old_lock_client = old_lock->client;
     if (new_lock_to_end || old_lock_to_end) {
       //special code path to deal with a length set at 0
-      dout(15) << "one lock extends forever" << dendl;
+      ldout(cct,15) << "one lock extends forever" << dendl;
       if (old_lock->type == new_lock.type) {
         //just unify them in new lock, remove old lock
-        dout(15) << "same lock type, unifying" << dendl;
+        ldout(cct,15) << "same lock type, unifying" << dendl;
         new_lock.start = (new_lock_start < old_lock_start) ? new_lock_start :
           old_lock_start;
         new_lock.length = 0;
         held_locks.erase(*iter);
         --client_held_lock_counts[old_lock_client];
       } else { //not same type, have to keep any remains of old lock around
-        dout(15) << "shrinking old lock" << dendl;
+        ldout(cct,15) << "shrinking old lock" << dendl;
         if (new_lock_to_end) {
           if (old_lock_start < new_lock_start) {
             old_lock->length = new_lock_start - old_lock_start;
@@ -262,17 +262,17 @@ void ceph_lock_state_t::adjust_locks(list<multimap<uint64_t, ceph_filelock>::ite
       }
     } else {
       if (old_lock->type == new_lock.type) { //just merge them!
-        dout(15) << "merging locks, they're the same type" << dendl;
+        ldout(cct,15) << "merging locks, they're the same type" << dendl;
         new_lock.start = (old_lock_start < new_lock_start ) ? old_lock_start :
           new_lock_start;
         int new_end = (new_lock_end > old_lock_end) ? new_lock_end :
           old_lock_end;
         new_lock.length = new_end - new_lock.start + 1;
-        dout(15) << "erasing lock " << (*iter)->second << dendl;
+        ldout(cct,15) << "erasing lock " << (*iter)->second << dendl;
         held_locks.erase(*iter);
         --client_held_lock_counts[old_lock_client];
       } else { //we'll have to update sizes and maybe make new locks
-        dout(15) << "locks aren't same type, changing sizes" << dendl;
+        ldout(cct,15) << "locks aren't same type, changing sizes" << dendl;
         if (old_lock_end > new_lock_end) { //add extra lock after new_lock
           ceph_filelock appended_lock = *old_lock;
           appended_lock.start = new_lock_end + 1;
@@ -302,7 +302,7 @@ void ceph_lock_state_t::adjust_locks(list<multimap<uint64_t, ceph_filelock>::ite
        ++iter) {
     old_lock = &(*iter)->second;
     old_lock_client = old_lock->client;
-    dout(15) << "lock to coalesce: " << *old_lock << dendl;
+    ldout(cct,15) << "lock to coalesce: " << *old_lock << dendl;
     /* because if it's a neighboring lock there can't be any self-overlapping
        locks that covered it */
     if (old_lock->type == new_lock.type) { //merge them
@@ -354,8 +354,8 @@ ceph_lock_state_t::get_lower_bound(uint64_t start,
        && (start != 0)
        && (lower_bound != lock_map.begin())) --lower_bound;
    if (lock_map.end() == lower_bound)
-     dout(15) << "get_lower_dout(15)eturning end()" << dendl;
-   else dout(15) << "get_lower_bound returning iterator pointing to "
+     ldout(cct,15) << "get_lower_dout(15)eturning end()" << dendl;
+   else ldout(cct,15) << "get_lower_bound returning iterator pointing to "
                 << lower_bound->second << dendl;
    return lower_bound;
  }
@@ -368,8 +368,8 @@ ceph_lock_state_t::get_last_before(uint64_t end,
     lock_map.upper_bound(end);
   if (last != lock_map.begin()) --last;
   if (lock_map.end() == last)
-    dout(15) << "get_last_before returning end()" << dendl;
-  else dout(15) << "get_last_before returning iterator pointing to "
+    ldout(cct,15) << "get_last_before returning end()" << dendl;
+  else ldout(cct,15) << "get_last_before returning iterator pointing to "
                << last->second << dendl;
   return last;
 }
@@ -382,7 +382,7 @@ bool ceph_lock_state_t::share_space(
               ((iter->first < start) &&
                (((iter->first + iter->second.length - 1) >= start) ||
                 (0 == iter->second.length))));
-  dout(15) << "share_space got start: " << start << ", end: " << end
+  ldout(cct,15) << "share_space got start: " << start << ", end: " << end
           << ", lock: " << iter->second << ", returning " << ret << dendl;
   return ret;
 }
@@ -393,7 +393,7 @@ bool ceph_lock_state_t::get_overlapping_locks(ceph_filelock& lock,
                            list<multimap<uint64_t,
                                ceph_filelock>::iterator> *self_neighbors)
 {
-  dout(15) << "get_overlapping_locks" << dendl;
+  ldout(cct,15) << "get_overlapping_locks" << dendl;
   // create a lock starting one earlier and ending one later
   // to check for neighbors
   ceph_filelock neighbor_check_lock = lock;
@@ -419,8 +419,7 @@ bool ceph_lock_state_t::get_overlapping_locks(ceph_filelock& lock,
     if (share_space(iter, lock)) {
       overlaps.push_front(iter);
     } else if (self_neighbors &&
-               (neighbor_check_lock.client == iter->second.client) &&
-               (neighbor_check_lock.pid == iter->second.pid) &&
+	       ceph_filelock_owner_equal(neighbor_check_lock, iter->second) &&
                share_space(iter, neighbor_check_lock)) {
       self_neighbors->push_front(iter);
     }
@@ -438,7 +437,7 @@ bool ceph_lock_state_t::get_waiting_overlaps(ceph_filelock& lock,
                                                ceph_filelock>::iterator>&
                                                overlaps)
 {
-  dout(15) << "get_waiting_overlaps" << dendl;
+  ldout(cct,15) << "get_waiting_overlaps" << dendl;
   multimap<uint64_t, ceph_filelock>::iterator iter =
     get_last_before(lock.start + lock.length - 1, waiting_locks);
   bool cont = iter != waiting_locks.end();
@@ -459,15 +458,15 @@ void ceph_lock_state_t::split_by_owner(ceph_filelock& owner,
 {
   list<multimap<uint64_t, ceph_filelock>::iterator>::iterator
     iter = locks.begin();
-  dout(15) << "owner lock: " << owner << dendl;
+  ldout(cct,15) << "owner lock: " << owner << dendl;
   while (iter != locks.end()) {
-    dout(15) << "comparing to " << (*iter)->second << dendl;
+    ldout(cct,15) << "comparing to " << (*iter)->second << dendl;
     if (ceph_filelock_owner_equal((*iter)->second, owner)) {
-      dout(15) << "success, pushing to owned_locks" << dendl;
+      ldout(cct,15) << "success, pushing to owned_locks" << dendl;
       owned_locks.push_back(*iter);
       iter = locks.erase(iter);
     } else {
-      dout(15) << "failure, something not equal in this group "
+      ldout(cct,15) << "failure, something not equal in this group "
               << (*iter)->second.client << ":" << owner.client << ","
 	      << (*iter)->second.owner << ":" << owner.owner << ","
 	      << (*iter)->second.pid << ":" << owner.pid << dendl;
diff --git a/src/mds/flock.h b/src/mds/flock.h
index 4791b85..bf3980d 100644
--- a/src/mds/flock.h
+++ b/src/mds/flock.h
@@ -37,7 +37,9 @@ inline bool operator==(ceph_filelock& l, ceph_filelock& r) {
 }
 
 class ceph_lock_state_t {
+  CephContext *cct;
 public:
+  ceph_lock_state_t(CephContext *cct_) : cct(cct_) {}
   multimap<uint64_t, ceph_filelock> held_locks;    // current locks
   multimap<uint64_t, ceph_filelock> waiting_locks; // locks waiting for other locks
   // both of the above are keyed by starting offset
diff --git a/src/mds/mdstypes.h b/src/mds/mdstypes.h
index 184cf70..73f1ba2 100644
--- a/src/mds/mdstypes.h
+++ b/src/mds/mdstypes.h
@@ -703,7 +703,8 @@ struct cap_reconnect_t {
   cap_reconnect_t() {
     memset(&capinfo, 0, sizeof(capinfo));
   }
-  cap_reconnect_t(uint64_t cap_id, inodeno_t pino, const string& p, int w, int i, inodeno_t sr) : 
+  cap_reconnect_t(uint64_t cap_id, inodeno_t pino, const string& p, int w, int i,
+		  inodeno_t sr, bufferlist& lb) :
     path(p) {
     capinfo.cap_id = cap_id;
     capinfo.wanted = w;
@@ -711,6 +712,7 @@ struct cap_reconnect_t {
     capinfo.snaprealm = sr;
     capinfo.pathbase = pino;
     capinfo.flock_len = 0;
+    flockbl.claim(lb);
   }
   void encode(bufferlist& bl) const;
   void decode(bufferlist::iterator& bl);
diff --git a/src/messages/MClientReconnect.h b/src/messages/MClientReconnect.h
index 4e2839c..1b072a3 100644
--- a/src/messages/MClientReconnect.h
+++ b/src/messages/MClientReconnect.h
@@ -40,9 +40,9 @@ public:
   }
 
   void add_cap(inodeno_t ino, uint64_t cap_id, inodeno_t pathbase, const string& path,
-	       int wanted, int issued,
-	       inodeno_t sr) {
-    caps[ino] = cap_reconnect_t(cap_id, pathbase, path, wanted, issued, sr);
+	       int wanted, int issued, inodeno_t sr, bufferlist& lb)
+  {
+    caps[ino] = cap_reconnect_t(cap_id, pathbase, path, wanted, issued, sr, lb);
   }
   void add_snaprealm(inodeno_t ino, snapid_t seq, inodeno_t parent) {
     ceph_mds_snaprealm_reconnect r;
diff --git a/src/mon/DataHealthService.cc b/src/mon/DataHealthService.cc
index 6c6ed29..a2bbb1f 100644
--- a/src/mon/DataHealthService.cc
+++ b/src/mon/DataHealthService.cc
@@ -86,10 +86,10 @@ void DataHealthService::get_health(
 
     health_status_t health_status = HEALTH_OK;
     string health_detail;
-    if (stats.latest_avail_percent <= g_conf->mon_data_avail_crit) {
+    if (stats.fs_stats.avail_percent <= g_conf->mon_data_avail_crit) {
       health_status = HEALTH_ERR;
       health_detail = "low disk space, shutdown imminent";
-    } else if (stats.latest_avail_percent <= g_conf->mon_data_avail_warn) {
+    } else if (stats.fs_stats.avail_percent <= g_conf->mon_data_avail_warn) {
       health_status = HEALTH_WARN;
       health_detail = "low disk space";
     }
@@ -110,7 +110,7 @@ void DataHealthService::get_health(
       stringstream ss;
       ss << "mon." << mon_name << " " << health_detail;
       summary.push_back(make_pair(health_status, ss.str()));
-      ss << " -- " <<  stats.latest_avail_percent << "% avail";
+      ss << " -- " <<  stats.fs_stats.avail_percent << "% avail";
       if (detail)
 	detail->push_back(make_pair(health_status, ss.str()));
     }
@@ -151,23 +151,18 @@ int DataHealthService::update_store_stats(DataStats &ours)
 
 int DataHealthService::update_stats()
 {
-  struct statfs stbuf;
-  int err = ::statfs(g_conf->mon_data.c_str(), &stbuf);
-  if (err < 0) {
-    derr << __func__ << " statfs error: " << cpp_strerror(errno) << dendl;
-    return -errno;
-  }
-
   entity_inst_t our_inst = mon->messenger->get_myinst();
   DataStats& ours = stats[our_inst];
 
-  ours.kb_total = stbuf.f_blocks * stbuf.f_bsize / 1024;
-  ours.kb_used = (stbuf.f_blocks - stbuf.f_bfree) * stbuf.f_bsize / 1024;
-  ours.kb_avail = stbuf.f_bavail * stbuf.f_bsize / 1024;
-  ours.latest_avail_percent = (((float)ours.kb_avail/ours.kb_total)*100);
-  dout(0) << __func__ << " avail " << ours.latest_avail_percent << "%"
-          << " total " << ours.kb_total << " used " << ours.kb_used << " avail " << ours.kb_avail
-          << dendl;
+  int err = get_fs_stats(ours.fs_stats, g_conf->mon_data.c_str());
+  if (err < 0) {
+    derr << __func__ << " get_fs_stats error: " << cpp_strerror(err) << dendl;
+    return err;
+  }
+  dout(0) << __func__ << " avail " << ours.fs_stats.avail_percent << "%"
+          << " total " << prettybyte_t(ours.fs_stats.byte_total)
+          << ", used " << prettybyte_t(ours.fs_stats.byte_used)
+          << ", avail " << prettybyte_t(ours.fs_stats.byte_avail) << dendl;
   ours.last_update = ceph_clock_now(g_ceph_context);
 
   return update_store_stats(ours);
@@ -213,7 +208,7 @@ void DataHealthService::service_tick()
 
   DataStats &ours = stats[mon->messenger->get_myinst()];
 
-  if (ours.latest_avail_percent <= g_conf->mon_data_avail_crit) {
+  if (ours.fs_stats.avail_percent <= g_conf->mon_data_avail_crit) {
     derr << "reached critical levels of available space on local monitor storage"
          << " -- shutdown!" << dendl;
     force_shutdown();
@@ -224,12 +219,12 @@ void DataHealthService::service_tick()
   // consumed in-between reports to assess if it's worth to log this info,
   // otherwise we may very well contribute to the consumption of the
   // already low available disk space.
-  if (ours.latest_avail_percent <= g_conf->mon_data_avail_warn) {
-    if (ours.latest_avail_percent != last_warned_percent)
+  if (ours.fs_stats.avail_percent <= g_conf->mon_data_avail_warn) {
+    if (ours.fs_stats.avail_percent != last_warned_percent)
       mon->clog.warn()
 	<< "reached concerning levels of available space on local monitor storage"
-	<< " (" << ours.latest_avail_percent << "% free)\n";
-    last_warned_percent = ours.latest_avail_percent;
+	<< " (" << ours.fs_stats.avail_percent << "% free)\n";
+    last_warned_percent = ours.fs_stats.avail_percent;
   } else {
     last_warned_percent = 0;
   }
diff --git a/src/mon/MonCommands.h b/src/mon/MonCommands.h
index bd9dd2e..461b3f2 100644
--- a/src/mon/MonCommands.h
+++ b/src/mon/MonCommands.h
@@ -425,6 +425,9 @@ COMMAND("osd crush unlink " \
 	"name=ancestor,type=CephString,req=false,goodchars=[A-Za-z0-9-_.]", \
 	"unlink <name> from crush map (everywhere, or just at <ancestor>)", \
 	"osd", "rw", "cli,rest")
+COMMAND("osd crush reweight-all",
+	"recalculate the weights for the tree to ensure they sum correctly",
+	"osd", "rw", "cli,rest")
 COMMAND("osd crush reweight " \
 	"name=name,type=CephString,goodchars=[A-Za-z0-9-_.] " \
 	"name=weight,type=CephFloat,range=0.0", \
@@ -433,6 +436,15 @@ COMMAND("osd crush reweight " \
 COMMAND("osd crush tunables " \
 	"name=profile,type=CephChoices,strings=legacy|argonaut|bobtail|firefly|optimal|default", \
 	"set crush tunables values to <profile>", "osd", "rw", "cli,rest")
+COMMAND("osd crush set-tunable "				    \
+	"name=tunable,type=CephChoices,strings=straw_calc_version " \
+	"name=value,type=CephInt",
+	"set crush tunable <tunable> to <value>",
+	"osd", "rw", "cli,rest")
+COMMAND("osd crush get-tunable "			      \
+	"name=tunable,type=CephChoices,strings=straw_calc_version",
+	"get crush tunable <tunable>",
+	"osd", "rw", "cli,rest")
 COMMAND("osd crush show-tunables", \
 	"show current crush tunables", "osd", "r", "cli,rest")
 COMMAND("osd crush rule create-simple " \
diff --git a/src/mon/Monitor.cc b/src/mon/Monitor.cc
index fd3a358..ad35e5e 100644
--- a/src/mon/Monitor.cc
+++ b/src/mon/Monitor.cc
@@ -428,6 +428,8 @@ int Monitor::preinit()
     cluster_logger = pcb.create_perf_counters();
   }
 
+  paxos->init_logger();
+
   // verify cluster_uuid
   {
     int r = check_fsid();
@@ -1406,6 +1408,8 @@ void Monitor::handle_probe(MMonProbe *m)
  */
 void Monitor::handle_probe_probe(MMonProbe *m)
 {
+  MMonProbe *r;
+
   dout(10) << "handle_probe_probe " << m->get_source_inst() << *m
 	   << " features " << m->get_connection()->get_features() << dendl;
   uint64_t missing = required_features & ~m->get_connection()->get_features();
@@ -1418,12 +1422,26 @@ void Monitor::handle_probe_probe(MMonProbe *m)
       m->required_features = required_features;
       messenger->send_message(r, m->get_connection());
     }
-    m->put();
-    return;
+    goto out;
+  }
+
+  if (!is_probing() && !is_synchronizing()) {
+    // If the probing mon is way ahead of us, we need to re-bootstrap.
+    // Normally we capture this case when we initially bootstrap, but
+    // it is possible we pass those checks (we overlap with
+    // quorum-to-be) but fail to join a quorum before it moves past
+    // us.  We need to be kicked back to bootstrap so we can
+    // synchronize, not keep calling elections.
+    if (paxos->get_version() + 1 < m->paxos_first_version) {
+      dout(1) << " peer " << m->get_source_addr() << " has first_committed "
+	      << "ahead of us, re-bootstrapping" << dendl;
+      bootstrap();
+      goto out;
+
+    }
   }
 
-  MMonProbe *r = new MMonProbe(monmap->fsid, MMonProbe::OP_REPLY,
-			       name, has_ever_joined);
+  r = new MMonProbe(monmap->fsid, MMonProbe::OP_REPLY, name, has_ever_joined);
   r->name = name;
   r->quorum = quorum;
   monmap->encode(r->monmap_bl, m->get_connection()->get_features());
@@ -1438,6 +1456,7 @@ void Monitor::handle_probe_probe(MMonProbe *m)
     extra_probe_peers.insert(m->get_source_addr());
   }
 
+ out:
   m->put();
 }
 
diff --git a/src/mon/MonitorDBStore.h b/src/mon/MonitorDBStore.h
index 88c4f93..1576db7 100644
--- a/src/mon/MonitorDBStore.h
+++ b/src/mon/MonitorDBStore.h
@@ -87,6 +87,9 @@ class MonitorDBStore
 
   struct Transaction {
     list<Op> ops;
+    uint64_t bytes, keys;
+
+    Transaction() : bytes(0), keys(0) {}
 
     enum {
       OP_PUT	= 1,
@@ -96,6 +99,8 @@ class MonitorDBStore
 
     void put(string prefix, string key, bufferlist& bl) {
       ops.push_back(Op(OP_PUT, prefix, key, bl));
+      ++keys;
+      bytes += prefix.length() + key.length() + bl.length();
     }
 
     void put(string prefix, version_t ver, bufferlist& bl) {
@@ -112,6 +117,8 @@ class MonitorDBStore
 
     void erase(string prefix, string key) {
       ops.push_back(Op(OP_ERASE, prefix, key));
+      ++keys;
+      bytes += prefix.length() + key.length();
     }
 
     void erase(string prefix, version_t ver) {
@@ -129,14 +136,20 @@ class MonitorDBStore
     }
 
     void encode(bufferlist& bl) const {
-      ENCODE_START(1, 1, bl);
+      ENCODE_START(2, 1, bl);
       ::encode(ops, bl);
+      ::encode(bytes, bl);
+      ::encode(keys, bl);
       ENCODE_FINISH(bl);
     }
 
     void decode(bufferlist::iterator& bl) {
-      DECODE_START(1, bl);
+      DECODE_START(2, bl);
       ::decode(ops, bl);
+      if (struct_v >= 2) {
+	::decode(bytes, bl);
+	::decode(keys, bl);
+      }
       DECODE_FINISH(bl);
     }
 
@@ -153,6 +166,8 @@ class MonitorDBStore
 
     void append(Transaction& other) {
       ops.splice(ops.end(), other.ops);
+      keys += other.keys;
+      bytes += other.bytes;
     }
 
     void append_from_encoded(bufferlist& bl) {
@@ -169,6 +184,12 @@ class MonitorDBStore
     bool size() {
       return ops.size();
     }
+    uint64_t get_keys() const {
+      return keys;
+    }
+    uint64_t get_bytes() const {
+      return bytes;
+    }
 
     void dump(ceph::Formatter *f, bool dump_val=false) const {
       f->open_object_section("transaction");
@@ -218,6 +239,8 @@ class MonitorDBStore
 	f->close_section();
       }
       f->close_section();
+      f->dump_unsigned("num_keys", keys);
+      f->dump_unsigned("num_bytes", bytes);
       f->close_section();
     }
   };
diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc
index 7e469b2..da06b86 100644
--- a/src/mon/OSDMonitor.cc
+++ b/src/mon/OSDMonitor.cc
@@ -1211,7 +1211,7 @@ bool OSDMonitor::preprocess_boot(MOSDBoot *m)
       osdmap.get_info(from).up_from > m->version) {
     dout(7) << "prepare_boot msg from before last up_from, ignoring" << dendl;
     send_latest(m, m->sb.current_epoch+1);
-    goto ignore;
+    return true;
   }
 
   // noup?
@@ -2465,6 +2465,31 @@ bool OSDMonitor::preprocess_command(MMonCommand *m)
     }
     ss << "listed " << osdmap.blacklist.size() << " entries";
 
+  } else if (prefix == "osd crush get-tunable") {
+    string tunable;
+    cmd_getval(g_ceph_context, cmdmap, "tunable", tunable);
+    int value;
+    cmd_getval(g_ceph_context, cmdmap, "value", value);
+    ostringstream rss;
+    if (f)
+      f->open_object_section("tunable");
+    if (tunable == "straw_calc_version") {
+      if (f)
+	f->dump_int(tunable.c_str(), osdmap.crush->get_straw_calc_version());
+      else
+	rss << osdmap.crush->get_straw_calc_version() << "\n";
+    } else {
+      r = -EINVAL;
+      goto reply;
+    }
+    if (f) {
+      f->close_section();
+      f->flush(rdata);
+    } else {
+      rdata.append(rss.str());
+    }
+    r = 0;
+
   } else if (prefix == "osd pool get") {
     string poolstr;
     cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
@@ -3279,11 +3304,38 @@ int OSDMonitor::prepare_pool_crush_ruleset(const unsigned pool_type,
   if (*crush_ruleset < 0) {
     switch (pool_type) {
     case pg_pool_t::TYPE_REPLICATED:
-      *crush_ruleset = osdmap.crush->get_osd_pool_default_crush_replicated_ruleset(g_ceph_context);
-      if (*crush_ruleset < 0) {
-        // Errors may happen e.g. if no valid ruleset is available
-        ss << "No suitable CRUSH ruleset exists";
-        return *crush_ruleset;
+      {
+	if (ruleset_name == "") {
+	  //Use default ruleset
+	  *crush_ruleset = osdmap.crush->get_osd_pool_default_crush_replicated_ruleset(g_ceph_context);
+	  if (*crush_ruleset < 0) {
+	    // Errors may happen e.g. if no valid ruleset is available
+	    ss << "No suitable CRUSH ruleset exists";
+	    return *crush_ruleset;
+	  }
+	} else {
+	  int ret;
+	  ret = osdmap.crush->get_rule_id(ruleset_name);
+	  if (ret != -ENOENT) {
+	    // found it, use it
+	    *crush_ruleset = ret;
+	  } else {
+	    CrushWrapper newcrush;
+	    _get_pending_crush(newcrush);
+
+	    ret = newcrush.get_rule_id(ruleset_name);
+	    if (ret != -ENOENT) {
+	      // found it, wait for it to be proposed
+	      dout(20) << "prepare_pool_crush_ruleset: ruleset "
+		   << ruleset_name << " is pending, try again" << dendl;
+	      return -EAGAIN;
+	    } else {
+	      // cannot find it, return error
+	      ss << "Specified ruleset " << ruleset_name << " doesn't exist";
+	      return ret;
+	    }
+	  }
+	}
       }
       break;
     case pg_pool_t::TYPE_ERASURE:
@@ -4115,6 +4167,19 @@ bool OSDMonitor::prepare_command_impl(MMonCommand *m,
       }
     } while (false);
 
+  } else if (prefix == "osd crush reweight-all") {
+    // osd crush reweight-all
+    CrushWrapper newcrush;
+    _get_pending_crush(newcrush);
+
+    newcrush.reweight(g_ceph_context);
+    pending_inc.crush.clear();
+    newcrush.encode(pending_inc.crush);
+    ss << "reweighted crush hierarchy";
+    getline(ss, rs);
+    wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs,
+						  get_last_committed() + 1));
+    return true;
   } else if (prefix == "osd crush reweight") {
     do {
       // osd crush reweight <name> <weight>
@@ -4190,6 +4255,46 @@ bool OSDMonitor::prepare_command_impl(MMonCommand *m,
     wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs,
 					      get_last_committed() + 1));
     return true;
+  } else if (prefix == "osd crush set-tunable") {
+    CrushWrapper newcrush;
+    _get_pending_crush(newcrush);
+
+    err = 0;
+    string tunable;
+    cmd_getval(g_ceph_context, cmdmap, "tunable", tunable);
+
+    int64_t value = -1;
+    if (!cmd_getval(g_ceph_context, cmdmap, "value", value)) {
+      err = -EINVAL;
+      ss << "failed to parse integer value " << cmd_vartype_stringify(cmdmap["value"]);
+      goto reply;
+    }
+
+    if (tunable == "straw_calc_version") {
+      if (value < 0 || value > 2) {
+	ss << "value must be 0 or 1; got " << value;
+	err = -EINVAL;
+	goto reply;
+      }
+      newcrush.set_straw_calc_version(value);
+    } else {
+      ss << "unrecognized tunable '" << tunable << "'";
+      err = -EINVAL;
+      goto reply;
+    }
+
+    if (!validate_crush_against_features(&newcrush, ss)) {
+      err = -EINVAL;
+      goto reply;
+    }
+
+    pending_inc.crush.clear();
+    newcrush.encode(pending_inc.crush);
+    ss << "adjusted tunable " << tunable << " to " << value;
+    getline(ss, rs);
+    wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs,
+					      get_last_committed() + 1));
+    return true;
 
   } else if (prefix == "osd crush rule create-simple") {
     string name, root, type, mode;
@@ -5004,35 +5109,41 @@ done:
     cmd_getval(g_ceph_context, cmdmap, "ruleset", ruleset_name);
     string erasure_code_profile;
     cmd_getval(g_ceph_context, cmdmap, "erasure_code_profile", erasure_code_profile);
-    if (erasure_code_profile == "")
-      erasure_code_profile = "default";
-    if (erasure_code_profile == "default") {
-      if (!osdmap.has_erasure_code_profile(erasure_code_profile)) {
-	if (pending_inc.has_erasure_code_profile(erasure_code_profile)) {
-	  dout(20) << "erasure code profile " << erasure_code_profile << " already pending" << dendl;
-	  goto wait;
-	}
 
-	map<string,string> profile_map;
-	err = osdmap.get_erasure_code_profile_default(g_ceph_context,
+    if (pool_type == pg_pool_t::TYPE_ERASURE) {
+      if (erasure_code_profile == "")
+	erasure_code_profile = "default";
+      //handle the erasure code profile
+      if (erasure_code_profile == "default") {
+	if (!osdmap.has_erasure_code_profile(erasure_code_profile)) {
+	  if (pending_inc.has_erasure_code_profile(erasure_code_profile)) {
+	    dout(20) << "erasure code profile " << erasure_code_profile << " already pending" << dendl;
+	    goto wait;
+	  }
+
+	  map<string,string> profile_map;
+	  err = osdmap.get_erasure_code_profile_default(g_ceph_context,
 						      profile_map,
 						      &ss);
-	if (err)
-	  goto reply;
-	dout(20) << "erasure code profile " << erasure_code_profile << " set" << dendl;
-	pending_inc.set_erasure_code_profile(erasure_code_profile, profile_map);
-	goto wait;
+	  if (err)
+	    goto reply;
+	  dout(20) << "erasure code profile " << erasure_code_profile << " set" << dendl;
+	  pending_inc.set_erasure_code_profile(erasure_code_profile, profile_map);
+	  goto wait;
+	}
       }
-    }
-
-    if (ruleset_name == "") {
-      if (erasure_code_profile == "default") {
-	ruleset_name = "erasure-code";
-      } else {
-	dout(1) << "implicitly use ruleset named after the pool: "
+      if (ruleset_name == "") {
+	if (erasure_code_profile == "default") {
+	  ruleset_name = "erasure-code";
+	} else {
+	  dout(1) << "implicitly use ruleset named after the pool: "
 		<< poolstr << dendl;
-	ruleset_name = poolstr;
+	  ruleset_name = poolstr;
+	}
       }
+    } else {
+      // NOTE: for a replicated pool, the command map delivers the ruleset name in the erasure_code_profile field
+      ruleset_name = erasure_code_profile;
     }
 
     err = prepare_new_pool(poolstr, 0, // auid=0 for admin created pool
diff --git a/src/mon/PGMap.cc b/src/mon/PGMap.cc
index 5ec8ee2..59b6a03 100644
--- a/src/mon/PGMap.cc
+++ b/src/mon/PGMap.cc
@@ -379,17 +379,31 @@ void PGMap::update_pg(pg_t pgid, bufferlist& bl)
 {
   bufferlist::iterator p = bl.begin();
   ceph::unordered_map<pg_t,pg_stat_t>::iterator s = pg_stat.find(pgid);
-  if (s != pg_stat.end())
+  epoch_t old_lec = 0;
+  if (s != pg_stat.end()) {
+    old_lec = s->second.get_effective_last_epoch_clean();
     stat_pg_sub(pgid, s->second);
+  }
   pg_stat_t& r = pg_stat[pgid];
   ::decode(r, p);
   stat_pg_add(pgid, r);
+
+  epoch_t lec = r.get_effective_last_epoch_clean();
+  if (min_last_epoch_clean &&
+      (lec < min_last_epoch_clean ||  // new value is below the cached min
+       (lec > min_last_epoch_clean && // old value was the cached min, so the min may have moved
+	old_lec == min_last_epoch_clean)
+       ))
+    min_last_epoch_clean = 0;
 }
 
 void PGMap::remove_pg(pg_t pgid)
 {
   ceph::unordered_map<pg_t,pg_stat_t>::iterator s = pg_stat.find(pgid);
   if (s != pg_stat.end()) {
+    if (min_last_epoch_clean &&
+	s->second.get_effective_last_epoch_clean() == min_last_epoch_clean)
+      min_last_epoch_clean = 0;
     stat_pg_sub(pgid, s->second);
     pg_stat.erase(s);
   }
@@ -399,14 +413,33 @@ void PGMap::update_osd(int osd, bufferlist& bl)
 {
   bufferlist::iterator p = bl.begin();
   ceph::unordered_map<int32_t,osd_stat_t>::iterator o = osd_stat.find(osd);
-  if (o != osd_stat.end())
+  epoch_t old_lec = 0;
+  if (o != osd_stat.end()) {
+    ceph::unordered_map<int32_t,epoch_t>::iterator i = osd_epochs.find(osd);
+    if (i != osd_epochs.end())
+      old_lec = i->second;
     stat_osd_sub(o->second);
+  }
   osd_stat_t& r = osd_stat[osd];
   ::decode(r, p);
   stat_osd_add(r);
 
   // adjust [near]full status
   register_nearfull_status(osd, r);
+
+  // epoch?
+  if (!p.end()) {
+    epoch_t e;
+    ::decode(e, p);
+
+    if (e < min_last_epoch_clean ||
+	(e > min_last_epoch_clean &&
+	 old_lec == min_last_epoch_clean))
+      min_last_epoch_clean = 0;
+  } else {
+    // WARNING: we are not refreshing min_last_epoch_clean!  must be old store
+    // or old mon running.
+  }
 }
 
 void PGMap::remove_osd(int osd)
diff --git a/src/mon/PGMonitor.cc b/src/mon/PGMonitor.cc
index 15f6746..364ad20 100644
--- a/src/mon/PGMonitor.cc
+++ b/src/mon/PGMonitor.cc
@@ -545,6 +545,7 @@ void PGMonitor::encode_pending(MonitorDBStore::Transaction *t)
       ::encode(p->first, dirty);
       bufferlist bl;
       ::encode(p->second, bl, features);
+      ::encode(pending_inc.get_osd_epochs().find(p->first)->second, bl);
       t->put(prefix, stringify(p->first), bl);
     }
     for (set<int32_t>::const_iterator p =
diff --git a/src/mon/Paxos.cc b/src/mon/Paxos.cc
index b38b111..2e41eb8 100644
--- a/src/mon/Paxos.cc
+++ b/src/mon/Paxos.cc
@@ -73,6 +73,44 @@ void Paxos::init()
   assert(is_consistent());
 }
 
+void Paxos::init_logger()
+{
+  PerfCountersBuilder pcb(g_ceph_context, "paxos", l_paxos_first, l_paxos_last);
+  pcb.add_u64_counter(l_paxos_start_leader, "start_leader");
+  pcb.add_u64_counter(l_paxos_start_peon, "start_peon");
+  pcb.add_u64_counter(l_paxos_restart, "restart");
+  pcb.add_u64_counter(l_paxos_refresh, "refresh");
+  pcb.add_time_avg(l_paxos_refresh_latency, "refresh_latency");
+  pcb.add_u64_counter(l_paxos_begin, "begin");
+  pcb.add_u64_avg(l_paxos_begin_keys, "begin_keys");
+  pcb.add_u64_avg(l_paxos_begin_bytes, "begin_bytes");
+  pcb.add_time_avg(l_paxos_begin_latency, "begin_latency");
+  pcb.add_u64_counter(l_paxos_commit, "commit");
+  pcb.add_u64_avg(l_paxos_commit_keys, "commit_keys");
+  pcb.add_u64_avg(l_paxos_commit_bytes, "commit_bytes");
+  pcb.add_time_avg(l_paxos_commit_latency, "commit_latency");
+  pcb.add_u64_counter(l_paxos_collect, "collect");
+  pcb.add_u64_avg(l_paxos_collect_keys, "collect_keys");
+  pcb.add_u64_avg(l_paxos_collect_bytes, "collect_bytes");
+  pcb.add_time_avg(l_paxos_collect_latency, "collect_latency");
+  pcb.add_u64_counter(l_paxos_collect_uncommitted, "collect_uncommitted");
+  pcb.add_u64_counter(l_paxos_collect_timeout, "collect_timeout");
+  pcb.add_u64_counter(l_paxos_accept_timeout, "accept_timeout");
+  pcb.add_u64_counter(l_paxos_lease_ack_timeout, "lease_ack_timeout");
+  pcb.add_u64_counter(l_paxos_lease_timeout, "lease_timeout");
+  pcb.add_u64_counter(l_paxos_store_state, "store_state");
+  pcb.add_u64_avg(l_paxos_store_state_keys, "store_state_keys");
+  pcb.add_u64_avg(l_paxos_store_state_bytes, "store_state_bytes");
+  pcb.add_time_avg(l_paxos_store_state_latency, "store_state_latency");
+  pcb.add_u64_counter(l_paxos_share_state, "share_state");
+  pcb.add_u64_avg(l_paxos_share_state_keys, "share_state_keys");
+  pcb.add_u64_avg(l_paxos_share_state_bytes, "share_state_bytes");
+  pcb.add_u64_counter(l_paxos_new_pn, "new_pn");
+  pcb.add_time_avg(l_paxos_new_pn_latency, "new_pn_latency");
+  logger = pcb.create_perf_counters();
+  g_ceph_context->get_perfcounters_collection()->add(logger);
+}
+
 void Paxos::dump_info(Formatter *f)
 {
   f->open_object_section("paxos");
@@ -120,6 +158,8 @@ void Paxos::collect(version_t oldpn)
 	     << " pn " << uncommitted_pn
 	     << " (" << uncommitted_value.length() << " bytes) from myself" 
 	     << dendl;
+
+    logger->inc(l_paxos_collect_uncommitted);
   }
 
   // pick new pn
@@ -193,7 +233,15 @@ void Paxos::handle_collect(MMonPaxos *collect)
     f.flush(*_dout);
     *_dout << dendl;
 
+    logger->inc(l_paxos_collect);
+    logger->inc(l_paxos_collect_keys, t.get_keys());
+    logger->inc(l_paxos_collect_bytes, t.get_bytes());
+    utime_t start = ceph_clock_now(NULL);
+
     get_store()->apply_transaction(t);
+
+    utime_t end = ceph_clock_now(NULL);
+    logger->tinc(l_paxos_collect_latency, end - start);
   } else {
     // don't accept!
     dout(10) << "NOT accepting pn " << collect->pn << " from " << collect->pn_from
@@ -229,6 +277,8 @@ void Paxos::handle_collect(MMonPaxos *collect)
 	       << " and crossing our fingers" << dendl;
       last->uncommitted_pn = previous_pn;
     }
+
+    logger->inc(l_paxos_collect_uncommitted);
   }
 
   // send reply
@@ -258,14 +308,19 @@ void Paxos::share_state(MMonPaxos *m, version_t peer_first_committed,
   version_t v = peer_last_committed + 1;
 
   // include incrementals
+  uint64_t bytes = 0;
   for ( ; v <= last_committed; v++) {
     if (get_store()->exists(get_name(), v)) {
       get_store()->get(get_name(), v, m->values[v]);
       assert(m->values[v].length());
       dout(10) << " sharing " << v << " ("
 	       << m->values[v].length() << " bytes)" << dendl;
+      bytes += m->values[v].length() + 16;  // paxos_ + 10 digits = 16
     }
   }
+  logger->inc(l_paxos_share_state);
+  logger->inc(l_paxos_share_state_keys, m->values.size());
+  logger->inc(l_paxos_share_state_bytes, bytes);
 
   m->last_committed = last_committed;
 }
@@ -318,6 +373,7 @@ bool Paxos::store_state(MMonPaxos *m)
     dout(10) << "store_state [" << start->first << ".." 
 	     << last_committed << "]" << dendl;
     t.put(get_name(), "last_committed", last_committed);
+
     // we should apply the state here -- decode every single bufferlist in the
     // map and append the transactions to 't'.
     map<version_t,bufferlist>::iterator it;
@@ -345,8 +401,16 @@ bool Paxos::store_state(MMonPaxos *m)
     f.flush(*_dout);
     *_dout << dendl;
 
+    logger->inc(l_paxos_store_state);
+    logger->inc(l_paxos_store_state_bytes, t.get_bytes());
+    logger->inc(l_paxos_store_state_keys, t.get_keys());
+    utime_t start = ceph_clock_now(NULL);
+
     get_store()->apply_transaction(t);
 
+    utime_t end = ceph_clock_now(NULL);
+    logger->tinc(l_paxos_store_state_latency, end - start);
+
     // refresh first_committed; this txn may have trimmed.
     first_committed = get_store()->get(get_name(), "first_committed");
 
@@ -385,6 +449,7 @@ void Paxos::_sanity_check_store()
 void Paxos::handle_last(MMonPaxos *last)
 {
   bool need_refresh = false;
+  int from = last->get_source().num();
 
   dout(10) << "handle_last " << *last << dendl;
 
@@ -396,12 +461,13 @@ void Paxos::handle_last(MMonPaxos *last)
 
   // note peer's first_ and last_committed, in case we learn a new
   // commit and need to push it to them.
-  peer_first_committed[last->get_source().num()] = last->first_committed;
-  peer_last_committed[last->get_source().num()] = last->last_committed;
+  peer_first_committed[from] = last->first_committed;
+  peer_last_committed[from] = last->last_committed;
 
-  if (last->first_committed > last_committed+1) {
+  if (last->first_committed > last_committed + 1) {
     dout(5) << __func__
-            << " peon's lowest version is too high for our last committed"
+            << " mon." << from
+	    << " lowest version is too high for our last committed"
             << " (theirs: " << last->first_committed
             << "; ours: " << last_committed << ") -- bootstrap!" << dendl;
     last->put();
@@ -416,6 +482,31 @@ void Paxos::handle_last(MMonPaxos *last)
 
   assert(g_conf->paxos_kill_at != 2);
 
+  // is everyone contiguous and up to date?
+  for (map<int,version_t>::iterator p = peer_last_committed.begin();
+       p != peer_last_committed.end();
+       ++p) {
+    if (p->second + 1 < first_committed && first_committed > 1) {
+      dout(5) << __func__
+	      << " peon " << p->first
+	      << " last_committed (" << p->second
+	      << ") is too low for our first_committed (" << first_committed
+	      << ") -- bootstrap!" << dendl;
+      last->put();
+      mon->bootstrap();
+      return;
+    }
+    if (p->second < last_committed) {
+      // share committed values
+      dout(10) << " sending commit to mon." << p->first << dendl;
+      MMonPaxos *commit = new MMonPaxos(mon->get_epoch(),
+					MMonPaxos::OP_COMMIT,
+					ceph_clock_now(g_ceph_context));
+      share_state(commit, peer_first_committed[p->first], p->second);
+      mon->messenger->send_message(commit, mon->monmap->get_inst(p->first));
+    }
+  }
+
   // do they accept your pn?
   if (last->pn > accepted_pn) {
     // no, try again.
@@ -457,21 +548,6 @@ void Paxos::handle_last(MMonPaxos *last)
       // cancel timeout event
       mon->timer.cancel_event(collect_timeout_event);
       collect_timeout_event = 0;
-
-      // share committed values?
-      for (map<int,version_t>::iterator p = peer_last_committed.begin();
-	   p != peer_last_committed.end();
-	   ++p) {
-	if (p->second < last_committed) {
-	  // share committed values
-	  dout(10) << " sending commit to mon." << p->first << dendl;
-	  MMonPaxos *commit = new MMonPaxos(mon->get_epoch(),
-					    MMonPaxos::OP_COMMIT,
-					    ceph_clock_now(g_ceph_context));
-	  share_state(commit, peer_first_committed[p->first], p->second);
-	  mon->messenger->send_message(commit, mon->monmap->get_inst(p->first));
-	}
-      }
       peer_first_committed.clear();
       peer_last_committed.clear();
 
@@ -513,6 +589,7 @@ void Paxos::collect_timeout()
 {
   dout(1) << "collect timeout, calling fresh election" << dendl;
   collect_timeout_event = 0;
+  logger->inc(l_paxos_collect_timeout);
   assert(mon->is_leader());
   mon->bootstrap();
 }
@@ -534,7 +611,7 @@ void Paxos::begin(bufferlist& v)
   
   // and no value, yet.
   assert(new_value.length() == 0);
-  
+
   // accept it ourselves
   accepted.clear();
   accepted.insert(mon->rank);
@@ -573,8 +650,16 @@ void Paxos::begin(bufferlist& v)
   f.flush(*_dout);
   *_dout << dendl;
 
+  logger->inc(l_paxos_begin);
+  logger->inc(l_paxos_begin_keys, t.get_keys());
+  logger->inc(l_paxos_begin_bytes, t.get_bytes());
+  utime_t start = ceph_clock_now(NULL);
+
   get_store()->apply_transaction(t);
 
+  utime_t end = ceph_clock_now(NULL);
+  logger->tinc(l_paxos_begin_latency, end - start);
+
   assert(g_conf->paxos_kill_at != 3);
 
   if (mon->get_quorum().size() == 1) {
@@ -629,6 +714,8 @@ void Paxos::handle_begin(MMonPaxos *begin)
   
   assert(g_conf->paxos_kill_at != 4);
 
+  logger->inc(l_paxos_begin);
+
   // set state.
   state = STATE_UPDATING;
   lease_expire = utime_t();  // cancel lease
@@ -651,8 +738,14 @@ void Paxos::handle_begin(MMonPaxos *begin)
   f.flush(*_dout);
   *_dout << dendl;
 
+  logger->inc(l_paxos_begin_bytes, t.get_bytes());
+  utime_t start = ceph_clock_now(NULL);
+
   get_store()->apply_transaction(t);
 
+  utime_t end = ceph_clock_now(NULL);
+  logger->tinc(l_paxos_begin_latency, end - start);
+
   assert(g_conf->paxos_kill_at != 5);
 
   // reply
@@ -733,6 +826,7 @@ void Paxos::accept_timeout()
   accept_timeout_event = 0;
   assert(mon->is_leader());
   assert(is_updating() || is_updating_previous());
+  logger->inc(l_paxos_accept_timeout);
   mon->bootstrap();
 }
 
@@ -764,8 +858,16 @@ void Paxos::commit()
   f.flush(*_dout);
   *_dout << dendl;
 
+  logger->inc(l_paxos_commit);
+  logger->inc(l_paxos_commit_keys, t.get_keys());
+  logger->inc(l_paxos_commit_bytes, t.get_bytes());
+  utime_t start = ceph_clock_now(NULL);
+
   get_store()->apply_transaction(t);
 
+  utime_t end = ceph_clock_now(NULL);
+  logger->tinc(l_paxos_commit_latency, end - start);
+
   assert(g_conf->paxos_kill_at != 8);
 
   // refresh first_committed; this txn may have trimmed.
@@ -802,6 +904,8 @@ void Paxos::handle_commit(MMonPaxos *commit)
 {
   dout(10) << "handle_commit on " << commit->last_committed << dendl;
 
+  logger->inc(l_paxos_commit);
+
   if (!mon->is_peon()) {
     dout(10) << "not a peon, dropping" << dendl;
     assert(0);
@@ -883,9 +987,15 @@ bool Paxos::do_refresh()
 {
   bool need_bootstrap = false;
 
+  utime_t start = ceph_clock_now(NULL);
+
   // make sure we have the latest state loaded up
   mon->refresh_from_paxos(&need_bootstrap);
 
+  utime_t end = ceph_clock_now(NULL);
+  logger->inc(l_paxos_refresh);
+  logger->tinc(l_paxos_refresh_latency, end - start);
+
   if (need_bootstrap) {
     dout(10) << " doing requested bootstrap" << dendl;
     mon->bootstrap();
@@ -1019,7 +1129,7 @@ void Paxos::lease_ack_timeout()
   dout(1) << "lease_ack_timeout -- calling new election" << dendl;
   assert(mon->is_leader());
   assert(is_active());
-
+  logger->inc(l_paxos_lease_ack_timeout);
   lease_ack_timeout_event = 0;
   mon->bootstrap();
 }
@@ -1037,7 +1147,7 @@ void Paxos::lease_timeout()
 {
   dout(1) << "lease_timeout -- calling new election" << dendl;
   assert(mon->is_peon());
-
+  logger->inc(l_paxos_lease_timeout);
   lease_timeout_event = 0;
   mon->bootstrap();
 }
@@ -1112,8 +1222,14 @@ version_t Paxos::get_new_proposal_number(version_t gt)
   f.flush(*_dout);
   *_dout << dendl;
 
+  logger->inc(l_paxos_new_pn);
+  utime_t start = ceph_clock_now(NULL);
+
   get_store()->apply_transaction(t);
 
+  utime_t end = ceph_clock_now(NULL);
+  logger->tinc(l_paxos_new_pn_latency, end - start);
+
   dout(10) << "get_new_proposal_number = " << last_pn << dendl;
   return last_pn;
 }
@@ -1150,6 +1266,9 @@ void Paxos::shutdown() {
   finish_contexts(g_ceph_context, waiting_for_readable, -ECANCELED);
   finish_contexts(g_ceph_context, waiting_for_active, -ECANCELED);
   finish_contexts(g_ceph_context, proposals, -ECANCELED);
+  if (logger)
+    g_ceph_context->get_perfcounters_collection()->remove(logger);
+  delete logger;
 }
 
 void Paxos::leader_init()
@@ -1159,6 +1278,8 @@ void Paxos::leader_init()
 
   finish_contexts(g_ceph_context, proposals, -EAGAIN);
 
+  logger->inc(l_paxos_start_leader);
+
   if (mon->get_quorum().size() == 1) {
     state = STATE_ACTIVE;
     return;
@@ -1186,6 +1307,8 @@ void Paxos::peon_init()
   finish_contexts(g_ceph_context, waiting_for_writeable, -EAGAIN);
   finish_contexts(g_ceph_context, waiting_for_commit, -EAGAIN);
   finish_contexts(g_ceph_context, proposals, -EAGAIN);
+
+  logger->inc(l_paxos_start_peon);
 }
 
 void Paxos::restart()
@@ -1199,6 +1322,8 @@ void Paxos::restart()
   finish_contexts(g_ceph_context, proposals, -EAGAIN);
   finish_contexts(g_ceph_context, waiting_for_commit, -EAGAIN);
   finish_contexts(g_ceph_context, waiting_for_active, -EAGAIN);
+
+  logger->inc(l_paxos_restart);
 }
 
 
diff --git a/src/mon/Paxos.h b/src/mon/Paxos.h
index b9e43a1..b1ecedc 100644
--- a/src/mon/Paxos.h
+++ b/src/mon/Paxos.h
@@ -118,6 +118,7 @@ e 12v
 #include "include/Context.h"
 
 #include "common/Timer.h"
+#include "common/perf_counters.h"
 #include <errno.h>
 
 #include "MonitorDBStore.h"
@@ -126,6 +127,43 @@ class Monitor;
 class MMonPaxos;
 class Paxos;
 
+enum {
+  l_paxos_first = 45800,
+  l_paxos_start_leader,
+  l_paxos_start_peon,
+  l_paxos_restart,
+  l_paxos_refresh,
+  l_paxos_refresh_latency,
+  l_paxos_begin,
+  l_paxos_begin_keys,
+  l_paxos_begin_bytes,
+  l_paxos_begin_latency,
+  l_paxos_commit,
+  l_paxos_commit_keys,
+  l_paxos_commit_bytes,
+  l_paxos_commit_latency,
+  l_paxos_collect,
+  l_paxos_collect_keys,
+  l_paxos_collect_bytes,
+  l_paxos_collect_latency,
+  l_paxos_collect_uncommitted,
+  l_paxos_collect_timeout,
+  l_paxos_accept_timeout,
+  l_paxos_lease_ack_timeout,
+  l_paxos_lease_timeout,
+  l_paxos_store_state,
+  l_paxos_store_state_keys,
+  l_paxos_store_state_bytes,
+  l_paxos_store_state_latency,
+  l_paxos_share_state,
+  l_paxos_share_state_keys,
+  l_paxos_share_state_bytes,
+  l_paxos_new_pn,
+  l_paxos_new_pn_latency,
+  l_paxos_last,
+};
+
+
 // i am one state machine.
 /**
  * This libary is based on the Paxos algorithm, but varies in a few key ways:
@@ -147,6 +185,11 @@ class Paxos {
    */
   Monitor *mon;
 
+  /// perf counters for internal instrumentation
+  PerfCounters *logger;
+
+  void init_logger();
+
   // my state machine info
   const string paxos_name;
 
@@ -1004,6 +1047,7 @@ public:
    */
   Paxos(Monitor *m, const string &name) 
 		 : mon(m),
+		   logger(NULL),
 		   paxos_name(name),
 		   state(STATE_RECOVERING),
 		   first_committed(0),
diff --git a/src/mon/mon_types.h b/src/mon/mon_types.h
index 0ae1aaf..cc68ffb 100644
--- a/src/mon/mon_types.h
+++ b/src/mon/mon_types.h
@@ -16,6 +16,7 @@
 #define CEPH_MON_TYPES_H
 
 #include "include/utime.h"
+#include "include/util.h"
 #include "common/Formatter.h"
 
 #define PAXOS_PGMAP      0  // before osd, for pg kick to behave
@@ -89,44 +90,50 @@ WRITE_CLASS_ENCODER(LevelDBStoreStats);
 // data stats
 
 struct DataStats {
+  ceph_data_stats_t fs_stats;
   // data dir
-  uint64_t kb_total;
-  uint64_t kb_used;
-  uint64_t kb_avail;
-  int latest_avail_percent;
   utime_t last_update;
-
   LevelDBStoreStats store_stats;
 
   void dump(Formatter *f) const {
     assert(f != NULL);
-    f->dump_int("kb_total", kb_total);
-    f->dump_int("kb_used", kb_used);
-    f->dump_int("kb_avail", kb_avail);
-    f->dump_int("avail_percent", latest_avail_percent);
+    f->dump_int("kb_total", (fs_stats.byte_total/1024));
+    f->dump_int("kb_used", (fs_stats.byte_used/1024));
+    f->dump_int("kb_avail", (fs_stats.byte_avail/1024));
+    f->dump_int("avail_percent", fs_stats.avail_percent);
     f->dump_stream("last_updated") << last_update;
-
     f->open_object_section("store_stats");
     store_stats.dump(f);
     f->close_section();
   }
 
   void encode(bufferlist &bl) const {
-    ENCODE_START(2, 1, bl);
-    ::encode(kb_total, bl);
-    ::encode(kb_used, bl);
-    ::encode(kb_avail, bl);
-    ::encode(latest_avail_percent, bl);
+    ENCODE_START(3, 1, bl);
+    ::encode(fs_stats.byte_total, bl);
+    ::encode(fs_stats.byte_used, bl);
+    ::encode(fs_stats.byte_avail, bl);
+    ::encode(fs_stats.avail_percent, bl);
     ::encode(last_update, bl);
     ::encode(store_stats, bl);
     ENCODE_FINISH(bl);
   }
   void decode(bufferlist::iterator &p) {
     DECODE_START(1, p);
-    ::decode(kb_total, p);
-    ::decode(kb_used, p);
-    ::decode(kb_avail, p);
-    ::decode(latest_avail_percent, p);
+    // struct_v <= 2 stored these fields in KB; later versions store them in bytes
+    if (struct_v > 2) {
+      ::decode(fs_stats.byte_total, p);
+      ::decode(fs_stats.byte_used, p);
+      ::decode(fs_stats.byte_avail, p);
+    } else {
+      uint64_t t;
+      ::decode(t, p);
+      fs_stats.byte_total = t*1024;
+      ::decode(t, p);
+      fs_stats.byte_used = t*1024;
+      ::decode(t, p);
+      fs_stats.byte_avail = t*1024;
+    }
+    ::decode(fs_stats.avail_percent, p);
     ::decode(last_update, p);
     if (struct_v > 1)
       ::decode(store_stats, p);
diff --git a/src/os/FileJournal.cc b/src/os/FileJournal.cc
index 7eb7927..f3f244a 100644
--- a/src/os/FileJournal.cc
+++ b/src/os/FileJournal.cc
@@ -103,12 +103,14 @@ int FileJournal::_open(bool forwrite, bool create)
     goto out_fd;
 
 #ifdef HAVE_LIBAIO
-  aio_ctx = 0;
-  ret = io_setup(128, &aio_ctx);
-  if (ret < 0) {
-    ret = errno;
-    derr << "FileJournal::_open: unable to setup io_context " << cpp_strerror(ret) << dendl;
-    goto out_fd;
+  if (aio) {
+    aio_ctx = 0;
+    ret = io_setup(128, &aio_ctx);
+    if (ret < 0) {
+      ret = errno;
+      derr << "FileJournal::_open: unable to setup io_context " << cpp_strerror(ret) << dendl;
+      goto out_fd;
+    }
   }
 #endif
 
@@ -544,6 +546,7 @@ void FileJournal::close()
 
   // close
   assert(writeq_empty());
+  assert(!must_write_header);
   assert(fd >= 0);
   VOID_TEMP_FAILURE_RETRY(::close(fd));
   fd = -1;
@@ -564,9 +567,9 @@ int FileJournal::dump(ostream& out)
   JSONFormatter f(true);
 
   f.open_array_section("journal");
+  uint64_t seq = 0;
   while (1) {
     bufferlist bl;
-    uint64_t seq = 0;
     uint64_t pos = read_pos;
     if (!read_entry(bl, seq)) {
       dout(3) << "journal_replay: end of journal, done." << dendl;
@@ -604,7 +607,8 @@ void FileJournal::start_writer()
   write_stop = false;
   write_thread.create();
 #ifdef HAVE_LIBAIO
-  write_finish_thread.create();
+  if (aio)
+    write_finish_thread.create();
 #endif
 }
 
@@ -613,19 +617,25 @@ void FileJournal::stop_writer()
   {
     Mutex::Locker l(write_lock);
 #ifdef HAVE_LIBAIO
-    Mutex::Locker q(aio_lock);
+    if (aio)
+      aio_lock.Lock();
 #endif
     Mutex::Locker p(writeq_lock);
     write_stop = true;
     writeq_cond.Signal();
 #ifdef HAVE_LIBAIO
-    aio_cond.Signal();
-    write_finish_cond.Signal();
+    if (aio) {
+      aio_cond.Signal();
+      write_finish_cond.Signal();
+      aio_lock.Unlock();
+    }
 #endif
   } 
   write_thread.join();
 #ifdef HAVE_LIBAIO
-  write_finish_thread.join();
+  if (aio) {
+    write_finish_thread.join();
+  }
 #endif
 }
 
@@ -649,6 +659,13 @@ int FileJournal::read_header()
   buffer::ptr bp = buffer::create_page_aligned(block_size);
   bp.zero();
   int r = ::pread(fd, bp.c_str(), bp.length(), 0);
+
+  if (r < 0) {
+    int err = errno;
+    dout(0) << "read_header got " << cpp_strerror(err) << dendl;
+    return -err;
+  }
+
   bl.push_back(bp);
 
   try {
@@ -660,11 +677,6 @@ int FileJournal::read_header()
     return -EINVAL;
   }
 
-  if (r < 0) {
-    int err = errno;
-    dout(0) << "read_header got " << cpp_strerror(err) << dendl;
-    return -err;
-  }
   
   /*
    * Unfortunately we weren't initializing the flags field for new
@@ -794,7 +806,8 @@ int FileJournal::prepare_multi_write(bufferlist& bl, uint64_t& orig_ops, uint64_
   }
 
   dout(20) << "prepare_multi_write queue_pos now " << queue_pos << dendl;
-  //assert(write_pos + bl.length() == queue_pos);
+  assert((write_pos + bl.length() == queue_pos) ||
+         (write_pos + bl.length() - header.max_size + get_top() == queue_pos));
   return 0;
 }
 
@@ -993,22 +1006,32 @@ void FileJournal::do_write(bufferlist& bl)
     dout(10) << "do_write wrapping, first bit at " << pos << " len " << first.length()
 	     << " second bit len " << second.length() << " (orig len " << bl.length() << ")" << dendl;
 
-    if (write_bl(pos, first)) {
-      derr << "FileJournal::do_write: write_bl(pos=" << pos
-	   << ") failed" << dendl;
-      ceph_abort();
-    }
-    assert(pos == get_top());
+    // Save pos: the first piece is written second, at this offset
+    off64_t first_pos = pos;
+    off64_t orig_pos;
+    pos = get_top();
+    // header too?
     if (hbp.length()) {
       // be sneaky: include the header in the second fragment
       second.push_front(hbp);
       pos = 0;          // we included the header
     }
+    // Write the second portion first, possibly with the header, so
+    // do_read_entry() won't even get a valid entry_header_t if there
+    // is a crash between the two writes.
+    orig_pos = pos;
     if (write_bl(pos, second)) {
-      derr << "FileJournal::do_write: write_bl(pos=" << pos
+      derr << "FileJournal::do_write: write_bl(pos=" << orig_pos
+	   << ") failed" << dendl;
+      ceph_abort();
+    }
+    orig_pos = first_pos;
+    if (write_bl(first_pos, first)) {
+      derr << "FileJournal::do_write: write_bl(pos=" << orig_pos
 	   << ") failed" << dendl;
       ceph_abort();
     }
+    assert(first_pos == get_top());
   } else {
     // header too?
     if (hbp.length()) {
@@ -1102,7 +1125,7 @@ void FileJournal::write_thread_entry()
   while (1) {
     {
       Mutex::Locker locker(writeq_lock);
-      if (writeq.empty()) {
+      if (writeq.empty() && !must_write_header) {
 	if (write_stop)
 	  break;
 	dout(20) << "write_thread_entry going to sleep" << dendl;
@@ -1113,7 +1136,9 @@ void FileJournal::write_thread_entry()
     }
     
 #ifdef HAVE_LIBAIO
-    if (aio) {
+    // Ideally write_finish_thread_entry would not return until the last aios
+    // complete once write_stop is set, but it can't. So don't use aio on shutdown.
+    if (aio && !write_stop) {
       Mutex::Locker locker(aio_lock);
       // should we back off to limit aios in flight?  try to do this
       // adaptively so that we submit larger aios once we have lots of
@@ -1164,7 +1189,7 @@ void FileJournal::write_thread_entry()
     }
 
 #ifdef HAVE_LIBAIO
-    if (aio)
+    if (aio && !write_stop)
       do_aio_write(bl);
     else
       do_write(bl);
@@ -1353,7 +1378,7 @@ void FileJournal::write_finish_thread_entry()
 	aio_info *ai = (aio_info *)event[i].obj;
 	if (event[i].res != ai->len) {
 	  derr << "aio to " << ai->off << "~" << ai->len
-	       << " got " << cpp_strerror(event[i].res) << dendl;
+	       << " wrote " << event[i].res << dendl;
 	  assert(0 == "unexpected aio error");
 	}
 	dout(10) << "write_finish_thread_entry aio " << ai->off
@@ -1376,7 +1401,7 @@ void FileJournal::check_aio_completion()
   assert(aio_lock.is_locked());
   dout(20) << "check_aio_completion" << dendl;
 
-  bool completed_something = false;
+  bool completed_something = false, signal = false;
   uint64_t new_journaled_seq = 0;
 
   list<aio_info>::iterator p = aio_queue.begin();
@@ -1390,6 +1415,7 @@ void FileJournal::check_aio_completion()
     aio_num--;
     aio_bytes -= p->len;
     aio_queue.erase(p++);
+    signal = true;
   }
 
   if (completed_something) {
@@ -1409,7 +1435,8 @@ void FileJournal::check_aio_completion()
 	queue_completions_thru(journaled_seq);
       }
     }
-
+  }
+  if (signal) {
     // maybe write queue was waiting for aio count to drop?
     aio_cond.Signal();
   }
diff --git a/src/osd/ECBackend.cc b/src/osd/ECBackend.cc
index aefbb5e..39e3429 100644
--- a/src/osd/ECBackend.cc
+++ b/src/osd/ECBackend.cc
@@ -1245,10 +1245,18 @@ void ECBackend::submit_transaction(
   for (set<hobject_t>::iterator i = need_hinfos.begin();
        i != need_hinfos.end();
        ++i) {
+    ECUtil::HashInfoRef ref = get_hash_info(*i);
+    if (!ref) {
+      derr << __func__ << ": get_hash_info(" << *i << ")"
+	   << " returned a null pointer and there is no "
+	   << " way to recover from such an error in this "
+	   << " context" << dendl;
+      assert(0);
+    }
     op->unstable_hash_infos.insert(
       make_pair(
 	*i,
-	get_hash_info(*i)));
+	ref));
   }
 
   for (vector<pg_log_entry_t>::iterator i = op->log_entries.begin();
@@ -1458,7 +1466,7 @@ ECUtil::HashInfoRef ECBackend::get_hash_info(
 	::decode(hinfo, bp);
 	assert(hinfo.get_total_chunk_size() == (unsigned)st.st_size);
       } else {
-	assert(0 == "missing hash attr");
+	return ECUtil::HashInfoRef();
       }
     }
     ref = unstable_hashinfo_registry.lookup_or_create(hoid, hinfo);
@@ -1754,31 +1762,37 @@ void ECBackend::be_deep_scrub(
       break;
   }
 
-  ECUtil::HashInfoRef hinfo = get_hash_info(poid);
   if (r == -EIO) {
     dout(0) << "_scan_list  " << poid << " got "
 	    << r << " on read, read_error" << dendl;
     o.read_error = true;
   }
 
-  if (hinfo->get_chunk_hash(get_parent()->whoami_shard().shard) != h.digest()) {
-    dout(0) << "_scan_list  " << poid << " got incorrect hash on read" << dendl;
+  ECUtil::HashInfoRef hinfo = get_hash_info(poid);
+  if (!hinfo) {
+    dout(0) << "_scan_list  " << poid << " could not retrieve hash info" << dendl;
     o.read_error = true;
-  }
+    o.digest_present = false;
+  } else {
+    if (hinfo->get_chunk_hash(get_parent()->whoami_shard().shard) != h.digest()) {
+      dout(0) << "_scan_list  " << poid << " got incorrect hash on read" << dendl;
+      o.read_error = true;
+    }
 
-  if (hinfo->get_total_chunk_size() != pos) {
-    dout(0) << "_scan_list  " << poid << " got incorrect size on read" << dendl;
-    o.read_error = true;
-  }
+    if (hinfo->get_total_chunk_size() != pos) {
+      dout(0) << "_scan_list  " << poid << " got incorrect size on read" << dendl;
+      o.read_error = true;
+    }
 
-  /* We checked above that we match our own stored hash.  We cannot
-   * send a hash of the actual object, so instead we simply send
-   * our locally stored hash of shard 0 on the assumption that if
-   * we match our chunk hash and our recollection of the hash for
-   * chunk 0 matches that of our peers, there is likely no corruption.
-   */
-  o.digest = hinfo->get_chunk_hash(0);
-  o.digest_present = true;
+    /* We checked above that we match our own stored hash.  We cannot
+     * send a hash of the actual object, so instead we simply send
+     * our locally stored hash of shard 0 on the assumption that if
+     * we match our chunk hash and our recollection of the hash for
+     * chunk 0 matches that of our peers, there is likely no corruption.
+     */
+    o.digest = hinfo->get_chunk_hash(0);
+    o.digest_present = true;
+  }
 
   o.omap_digest = 0;
   o.omap_digest_present = true;
diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc
index dc67fdd..77ed17a 100644
--- a/src/osd/OSD.cc
+++ b/src/osd/OSD.cc
@@ -3762,6 +3762,53 @@ void OSD::_send_boot()
   monc->send_mon_message(mboot);
 }
 
+// If buf starts with the label str, store the text after it in (*pm)[key].
+// Returns true when the label matched (a trailing newline is stripped from buf).
+bool OSD::_lsb_release_set (char *buf, const char *str, map<string,string> *pm, const char *key)
+{
+  const size_t label_len = strlen(str);
+  if (strncmp(buf, str, label_len) != 0)
+    return false;
+  size_t len = strlen(buf);
+  if (len > 0 && buf[len-1] == '\n')
+    buf[--len] = '\0';
+  // skip the one separator char after the label, but never step past the NUL
+  const char *value = buf + label_len;
+  (*pm)[key] = *value ? value + 1 : value;
+  return true;
+}
+
+// Run `lsb_release -idrc` and record the distro fields it prints in *pm.
+void OSD::_lsb_release_parse (map<string,string> *pm)
+{
+  char buf[512];
+  FILE *fp = popen("lsb_release -idrc", "r");
+  if (!fp) {
+    int ret = -errno;
+    derr << "lsb_release_parse - failed to call lsb_release binary with error: " << cpp_strerror(ret) << dendl;
+    return;
+  }
+
+  while (fgets(buf, sizeof(buf) - 1, fp) != NULL) {
+    if (_lsb_release_set(buf, "Distributor ID:", pm, "distro") ||
+        _lsb_release_set(buf, "Description:", pm, "distro_description") ||
+        _lsb_release_set(buf, "Release:", pm, "distro_version") ||
+        _lsb_release_set(buf, "Codename:", pm, "distro_codename"))
+      continue;
+    derr << "unhandled output: " << buf << dendl;
+  }
+
+  // pclose() returns -1 (with errno set) only when the close itself fails;
+  // any other non-zero value is the command's wait status, not an errno.
+  int status = pclose(fp);
+  if (status == -1) {
+    int ret = -errno;
+    derr << "lsb_release_parse - pclose failed: " << cpp_strerror(ret) << dendl;
+  } else if (status != 0) {
+    derr << "lsb_release_parse - lsb_release exited with wait status " << status << dendl;
+  }
+}
+
 void OSD::_collect_metadata(map<string,string> *pm)
 {
   (*pm)["ceph_version"] = pretty_version_to_str();
@@ -3831,34 +3878,7 @@ void OSD::_collect_metadata(map<string,string> *pm)
   }
 
   // distro info
-  f = fopen("/etc/lsb-release", "r");
-  if (f) {
-    char buf[100];
-    while (!feof(f)) {
-      char *line = fgets(buf, sizeof(buf), f);
-      if (!line)
-	break;
-      char *eq = strchr(buf, '=');
-      if (!eq)
-	break;
-      *eq = '\0';
-      ++eq;
-      while (*eq == '\"')
-	++eq;
-      while (*eq && (eq[strlen(eq)-1] == '\n' ||
-		     eq[strlen(eq)-1] == '\"'))
-	eq[strlen(eq)-1] = '\0';
-      if (strcmp(buf, "DISTRIB_ID") == 0)
-	(*pm)["distro"] = eq;
-      else if (strcmp(buf, "DISTRIB_RELEASE") == 0)
-	(*pm)["distro_version"] = eq;
-      else if (strcmp(buf, "DISTRIB_CODENAME") == 0)
-	(*pm)["distro_codename"] = eq;
-      else if (strcmp(buf, "DISTRIB_DESCRIPTION") == 0)
-	(*pm)["distro_description"] = eq;
-    }
-    fclose(f);
-  }
+  _lsb_release_parse(pm); 
 
   dout(10) << __func__ << " " << *pm << dendl;
 }
@@ -5784,8 +5804,13 @@ bool OSD::advance_pg(
        next_epoch <= osd_epoch && next_epoch <= max;
        ++next_epoch) {
     OSDMapRef nextmap = service.try_get_map(next_epoch);
-    if (!nextmap)
+    if (!nextmap) {
+      dout(20) << __func__ << " missing map " << next_epoch << dendl;
+      // make sure max is bumped up so that we can get past any
+      // gap in maps
+      max = MAX(max, next_epoch + g_conf->osd_map_max_advance);
       continue;
+    }
 
     vector<int> newup, newacting;
     int up_primary, acting_primary;
@@ -5816,7 +5841,7 @@ bool OSD::advance_pg(
   service.pg_update_epoch(pg->info.pgid, lastmap->get_epoch());
   pg->handle_activate_map(rctx);
   if (next_epoch <= osd_epoch) {
-    dout(10) << __func__ << " advanced by max " << g_conf->osd_map_max_advance
+    dout(10) << __func__ << " advanced to max " << max
 	     << " past min epoch " << min_epoch
 	     << " ... will requeue " << *pg << dendl;
     return false;
@@ -5874,10 +5899,7 @@ void OSD::advance_map(ObjectStore::Transaction& t, C_Contexts *tfin)
   while (p != waiting_for_pg.end()) {
     spg_t pgid = p->first;
 
-    vector<int> acting;
-    int nrep = osdmap->pg_to_acting_osds(pgid.pgid, acting);
-    int role = osdmap->calc_pg_role(whoami, acting, nrep);
-    if (role >= 0) {
+    if (osdmap->osd_is_valid_op_target(pgid.pgid, whoami)) {
       ++p;  // still me
     } else {
       dout(10) << " discarding waiting ops for " << pgid << dendl;
@@ -6732,7 +6754,8 @@ void OSD::handle_pg_notify(OpRequestRef op)
       PG::CephPeeringEvtRef(
 	new PG::CephPeeringEvt(
 	  it->first.epoch_sent, it->first.query_epoch,
-	  PG::MNotifyRec(pg_shard_t(from, it->first.from), it->first)))
+	  PG::MNotifyRec(pg_shard_t(from, it->first.from), it->first,
+          op->get_req()->get_connection()->get_features())))
       );
   }
 }
@@ -7564,7 +7587,7 @@ void OSD::handle_op(OpRequestRef op)
   if (!pg) {
     dout(7) << "hit non-existent pg " << pgid << dendl;
 
-    if (osdmap->get_pg_acting_role(pgid.pgid, whoami) >= 0) {
+    if (osdmap->osd_is_valid_op_target(pgid.pgid, whoami)) {
       dout(7) << "we are valid target for op, waiting" << dendl;
       waiting_for_pg[pgid].push_back(op);
       op->mark_delayed("waiting for pg to exist locally");
@@ -7578,7 +7601,7 @@ void OSD::handle_op(OpRequestRef op)
     }
     OSDMapRef send_map = get_map(m->get_map_epoch());
 
-    if (send_map->get_pg_acting_role(pgid.pgid, whoami) >= 0) {
+    if (send_map->osd_is_valid_op_target(pgid.pgid, whoami)) {
       dout(7) << "dropping request; client will resend when they get new map" << dendl;
     } else if (!send_map->have_pg_pool(pgid.pool())) {
       dout(7) << "dropping request; pool did not exist" << dendl;
@@ -7835,7 +7858,9 @@ void OSD::process_peering_events(
       continue;
     }
     if (!advance_pg(curmap->get_epoch(), pg, handle, &rctx, &split_pgs)) {
-      pg->queue_null(curmap->get_epoch(), curmap->get_epoch());
+      // we need to requeue the PG explicitly since we didn't actually
+      // handle an event
+      peering_wq.queue(pg);
     } else if (!pg->peering_queue.empty()) {
       PG::CephPeeringEvtRef evt = pg->peering_queue.front();
       pg->peering_queue.pop_front();
@@ -7931,7 +7956,12 @@ void OSD::set_disk_tp_priority()
 	   << dendl;
   int cls =
     ceph_ioprio_string_to_class(cct->_conf->osd_disk_thread_ioprio_class);
-  disk_tp.set_ioprio(cls, cct->_conf->osd_disk_thread_ioprio_priority);
+  if (cls < 0)
+    derr << __func__ << cpp_strerror(cls) << ": "
+	 << "osd_disk_thread_ioprio_class is " << cct->_conf->osd_disk_thread_ioprio_class
+	 << " but only the following values are allowed: idle, be or rt" << dendl;
+  else
+    disk_tp.set_ioprio(cls, cct->_conf->osd_disk_thread_ioprio_priority);
 }
 
 // --------------------------------
diff --git a/src/osd/OSD.h b/src/osd/OSD.h
index e2a3c8e..dea216d 100644
--- a/src/osd/OSD.h
+++ b/src/osd/OSD.h
@@ -1472,6 +1472,8 @@ protected:
   void _maybe_boot(epoch_t oldest, epoch_t newest);
   void _send_boot();
   void _collect_metadata(map<string,string> *pmeta);
+  bool _lsb_release_set(char *buf, const char *str, map<string,string> *pm, const char *key);
+  void _lsb_release_parse (map<string,string> *pm);
 
   void start_waiting_for_healthy();
   bool _is_healthy();
@@ -1738,7 +1740,9 @@ protected:
       pg->put("SnapTrimWQ");
     }
     void _clear() {
-      osd->snap_trim_queue.clear();
+      while (PG *pg = _dequeue()) {
+	pg->put("SnapTrimWQ");
+      }
     }
   } snap_trim_wq;
 
diff --git a/src/osd/OSDMap.h b/src/osd/OSDMap.h
index a347583..b3b7ab6 100644
--- a/src/osd/OSDMap.h
+++ b/src/osd/OSDMap.h
@@ -777,6 +777,18 @@ public:
     return calc_pg_role(osd, group, nrep);
   }
 
+  /// True when osd is the primary reported by pg_to_acting_osds() for pg, or --
+  /// for a pg that is not erasure coded -- calc_pg_role() gives a role >= 0.
+  bool osd_is_valid_op_target(pg_t pg, int osd) const {
+    vector<int> acting;
+    int acting_primary;
+    const int nrep = pg_to_acting_osds(pg, &acting, &acting_primary);
+    if (osd == acting_primary)
+      return true;
+    // for an erasure-coded pg only the primary qualifies
+    return !pg_is_ec(pg) && calc_pg_role(osd, acting, nrep) >= 0;
+  }
+
 
   /*
    * handy helpers to build simple maps...
diff --git a/src/osd/PG.cc b/src/osd/PG.cc
index 9356df4..ebc2020 100644
--- a/src/osd/PG.cc
+++ b/src/osd/PG.cc
@@ -194,7 +194,8 @@ PG::PG(OSDService *o, OSDMapRef curmap,
   finish_sync_event(NULL),
   scrub_after_recovery(false),
   active_pushes(0),
-  recovery_state(this)
+  recovery_state(this),
+  peer_features((uint64_t)-1)
 {
 #ifdef PG_DEBUG_REFS
   osd->add_pgid(p, this);
@@ -1552,6 +1553,9 @@ void PG::activate(ObjectStore::Transaction& t,
 	pi.hit_set = info.hit_set;
 	pi.stats.stats.clear();
 
+	// initialize peer with our purged_snaps.
+	pi.purged_snaps = info.purged_snaps;
+
 	m = new MOSDPGLog(
 	  i->shard, pg_whoami.shard,
 	  get_osdmap()->get_epoch(), pi);
@@ -4685,7 +4689,10 @@ bool PG::old_peering_msg(epoch_t reply_epoch, epoch_t query_epoch)
 void PG::set_last_peering_reset()
 {
   dout(20) << "set_last_peering_reset " << get_osdmap()->get_epoch() << dendl;
-  last_peering_reset = get_osdmap()->get_epoch();
+  if (last_peering_reset != get_osdmap()->get_epoch()) {
+    last_peering_reset = get_osdmap()->get_epoch();
+    reset_interval_flush();
+  }
 }
 
 struct FlushState {
@@ -4739,7 +4746,6 @@ void PG::start_peering_interval(
   const OSDMapRef osdmap = get_osdmap();
 
   set_last_peering_reset();
-  reset_interval_flush();
 
   vector<int> oldacting, oldup;
   int oldrole = get_role();
@@ -5277,37 +5283,6 @@ void PG::queue_peering_event(CephPeeringEvtRef evt)
   osd->queue_for_peering(this);
 }
 
-void PG::queue_notify(epoch_t msg_epoch,
-		      epoch_t query_epoch,
-		      pg_shard_t from, pg_notify_t& i)
-{
-  dout(10) << "notify " << i << " from replica " << from << dendl;
-  queue_peering_event(
-    CephPeeringEvtRef(new CephPeeringEvt(msg_epoch, query_epoch,
-					 MNotifyRec(from, i))));
-}
-
-void PG::queue_info(epoch_t msg_epoch,
-		     epoch_t query_epoch,
-		     pg_shard_t from, pg_info_t& i)
-{
-  dout(10) << "info " << i << " from replica " << from << dendl;
-  queue_peering_event(
-    CephPeeringEvtRef(new CephPeeringEvt(msg_epoch, query_epoch,
-					 MInfoRec(from, i, msg_epoch))));
-}
-
-void PG::queue_log(epoch_t msg_epoch,
-		   epoch_t query_epoch,
-		   pg_shard_t from,
-		   MOSDPGLog *msg)
-{
-  dout(10) << "log " << *msg << " from replica " << from << dendl;
-  queue_peering_event(
-    CephPeeringEvtRef(new CephPeeringEvt(msg_epoch, query_epoch,
-					 MLogRec(from, msg))));
-}
-
 void PG::queue_null(epoch_t msg_epoch,
 		    epoch_t query_epoch)
 {
@@ -5810,8 +5785,29 @@ PG::RecoveryState::Backfilling::react(const RemoteReservationRejected &)
   pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
   pg->state_set(PG_STATE_BACKFILL_TOOFULL);
 
+  for (set<pg_shard_t>::iterator it = pg->backfill_targets.begin();
+       it != pg->backfill_targets.end();
+       ++it) {
+    assert(*it != pg->pg_whoami);
+    ConnectionRef con = pg->osd->get_con_osd_cluster(
+      it->osd, pg->get_osdmap()->get_epoch());
+    if (con) {
+      if (con->has_feature(CEPH_FEATURE_BACKFILL_RESERVATION)) {
+        pg->osd->send_message_osd_cluster(
+          new MBackfillReserve(
+	    MBackfillReserve::REJECT,
+	    spg_t(pg->info.pgid.pgid, it->shard),
+	    pg->get_osdmap()->get_epoch()),
+	  con.get());
+      }
+    }
+  }
+
   pg->osd->recovery_wq.dequeue(pg);
 
+  pg->waiting_on_backfill.clear();
+  pg->finish_recovery_op(hobject_t::get_max());
+
   pg->schedule_backfill_full_retry();
   return transit<NotBackfilling>();
 }
@@ -6066,14 +6062,33 @@ boost::statechart::result
 PG::RecoveryState::RepWaitBackfillReserved::react(const RemoteBackfillReserved &evt)
 {
   PG *pg = context< RecoveryMachine >().pg;
-  pg->osd->send_message_osd_cluster(
-    pg->primary.osd,
-    new MBackfillReserve(
-      MBackfillReserve::GRANT,
-      spg_t(pg->info.pgid.pgid, pg->primary.shard),
-      pg->get_osdmap()->get_epoch()),
-    pg->get_osdmap()->get_epoch());
-  return transit<RepRecovering>();
+
+  double ratio, max_ratio;
+  if (g_conf->osd_debug_reject_backfill_probability > 0 &&
+      (rand()%1000 < (g_conf->osd_debug_reject_backfill_probability*1000.0))) {
+    dout(10) << "backfill reservation rejected after reservation: "
+	     << "failure injection" << dendl;
+    pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
+    post_event(RemoteReservationRejected());
+    return discard_event();
+  } else if (pg->osd->too_full_for_backfill(&ratio, &max_ratio) &&
+	     !pg->cct->_conf->osd_debug_skip_full_check_in_backfill_reservation) {
+    dout(10) << "backfill reservation rejected after reservation: full ratio is "
+	     << ratio << ", which is greater than max allowed ratio "
+	     << max_ratio << dendl;
+    pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
+    post_event(RemoteReservationRejected());
+    return discard_event();
+  } else {
+    pg->osd->send_message_osd_cluster(
+      pg->primary.osd,
+      new MBackfillReserve(
+	MBackfillReserve::GRANT,
+	spg_t(pg->info.pgid.pgid, pg->primary.shard),
+	pg->get_osdmap()->get_epoch()),
+      pg->get_osdmap()->get_epoch());
+    return transit<RepRecovering>();
+  }
 }
 
 boost::statechart::result
@@ -6097,7 +6112,7 @@ PG::RecoveryState::RepRecovering::react(const BackfillTooFull &)
 {
   PG *pg = context< RecoveryMachine >().pg;
   pg->reject_reservation();
-  return transit<RepNotRecovering>();
+  return discard_event();
 }
 
 void PG::RecoveryState::RepRecovering::exit()
@@ -6839,6 +6854,7 @@ PG::RecoveryState::GetInfo::GetInfo(my_context ctx)
 
   pg->publish_stats_to_osd();
 
+  pg->reset_peer_features();
   get_infos();
   if (peer_info_requested.empty() && !prior_set->pg_down) {
     post_event(GotInfo());
@@ -6906,6 +6922,9 @@ boost::statechart::result PG::RecoveryState::GetInfo::react(const MNotifyRec& in
       }
       get_infos();
     }
+    dout(20) << "Adding osd: " << infoevt.from.osd << " features: "
+      << hex << infoevt.features << dec << dendl;
+    pg->apply_peer_features(infoevt.features);
 
     // are we done getting everything?
     if (peer_info_requested.empty() && !prior_set->pg_down) {
@@ -6964,6 +6983,7 @@ boost::statechart::result PG::RecoveryState::GetInfo::react(const MNotifyRec& in
 	  break;
 	}
       }
+      dout(20) << "Common features: " << hex << pg->get_min_peer_features() << dec << dendl;
       post_event(GotInfo());
     }
   }
diff --git a/src/osd/PG.h b/src/osd/PG.h
index 1aadaf0..e319477 100644
--- a/src/osd/PG.h
+++ b/src/osd/PG.h
@@ -1308,10 +1308,12 @@ public:
   struct MNotifyRec : boost::statechart::event< MNotifyRec > {
     pg_shard_t from;
     pg_notify_t notify;
-    MNotifyRec(pg_shard_t from, pg_notify_t &notify) :
-      from(from), notify(notify) {}
+    uint64_t features;
+    MNotifyRec(pg_shard_t from, pg_notify_t &notify, uint64_t f) :
+      from(from), notify(notify), features(f) {}
     void print(std::ostream *out) const {
-      *out << "MNotifyRec from " << from << " notify: " << notify;
+      *out << "MNotifyRec from " << from << " notify: " << notify
+        << " features: 0x" << hex << features << dec;
     }
   };
 
@@ -1993,11 +1995,16 @@ public:
   // Prevent copying
   PG(const PG& rhs);
   PG& operator=(const PG& rhs);
+  uint64_t peer_features;
 
  public:
   spg_t      get_pgid() const { return info.pgid; }
   int        get_nrep() const { return acting.size(); }
 
+  void reset_peer_features() { peer_features = (uint64_t)-1; }
+  uint64_t get_min_peer_features() { return peer_features; }
+  void apply_peer_features(uint64_t f) { peer_features &= f; }
+
   void init_primary_up_acting(
     const vector<int> &newup,
     const vector<int> &newacting,
@@ -2189,12 +2196,6 @@ public:
   void take_waiters();
   void queue_peering_event(CephPeeringEvtRef evt);
   void handle_peering_event(CephPeeringEvtRef evt, RecoveryCtx *rctx);
-  void queue_notify(epoch_t msg_epoch, epoch_t query_epoch,
-		    pg_shard_t from, pg_notify_t& i);
-  void queue_info(epoch_t msg_epoch, epoch_t query_epoch,
-		  pg_shard_t from, pg_info_t& i);
-  void queue_log(epoch_t msg_epoch, epoch_t query_epoch, pg_shard_t from,
-		 MOSDPGLog *msg);
   void queue_query(epoch_t msg_epoch, epoch_t query_epoch,
 		   pg_shard_t from, const pg_query_t& q);
   void queue_null(epoch_t msg_epoch, epoch_t query_epoch);
diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc
index d23e6fc..f1911c1 100644
--- a/src/osd/ReplicatedPG.cc
+++ b/src/osd/ReplicatedPG.cc
@@ -595,6 +595,7 @@ int ReplicatedPG::do_command(cmdmap_t cmdmap, ostream& ss,
   if (command == "query") {
     f->open_object_section("pg");
     f->dump_string("state", pg_state_string(get_state()));
+    f->dump_stream("snap_trimq") << snap_trimq;
     f->dump_unsigned("epoch", get_osdmap()->get_epoch());
     f->open_array_section("up");
     for (vector<int>::iterator p = up.begin(); p != up.end(); ++p)
@@ -2072,12 +2073,16 @@ void ReplicatedPG::do_scan(
       }
       peer_backfill_info[from] = bi;
 
-      assert(waiting_on_backfill.find(from) != waiting_on_backfill.end());
-      waiting_on_backfill.erase(from);
+      if (waiting_on_backfill.find(from) != waiting_on_backfill.end()) {
+	waiting_on_backfill.erase(from);
 
-      if (waiting_on_backfill.empty()) {
-        assert(peer_backfill_info.size() == backfill_targets.size());
-        finish_recovery_op(hobject_t::get_max());
+	if (waiting_on_backfill.empty()) {
+	  assert(peer_backfill_info.size() == backfill_targets.size());
+	  finish_recovery_op(hobject_t::get_max());
+	}
+      } else {
+	// we canceled backfill for a while because a peer was too full, and this
+	// is an extra response from a non-too-full peer
       }
     }
     break;
@@ -2560,10 +2565,6 @@ void ReplicatedPG::snap_trimmer()
     // replica collection trimming
     snap_trimmer_machine.process_event(SnapTrim());
   }
-  if (snap_trimmer_machine.requeue) {
-    dout(10) << "snap_trimmer requeue" << dendl;
-    queue_snap_trim();
-  }
   unlock();
   return;
 }
@@ -3353,6 +3354,11 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
 	    break;
 	}
 	result = _delete_oid(ctx, true);
+	if (result >= 0) {
+	  // mark that this is a cache eviction to avoid triggering normal
+	  // make_writeable() clone or snapdir object creation in finish_ctx()
+	  ctx->cache_evict = true;
+	}
 	osd->logger->inc(l_osd_tier_evict);
       }
       break;
@@ -3645,6 +3651,10 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
     case CEPH_OSD_OP_SETALLOCHINT:
       ++ctx->num_write;
       {
+        if (!(get_min_peer_features() & CEPH_FEATURE_OSD_SET_ALLOC_HINT)) { 
+          result = -EOPNOTSUPP;
+          break;
+        }
         if (!obs.exists) {
           ctx->mod_desc.create();
           t->touch(soid);
@@ -4836,6 +4846,7 @@ void ReplicatedPG::make_writeable(OpContext *ctx)
   
   if ((ctx->obs->exists && !ctx->obs->oi.is_whiteout()) && // head exist(ed)
       snapc.snaps.size() &&                 // there are snaps
+      !ctx->cache_evict &&
       snapc.snaps[0] > ctx->new_snapset.seq) {  // existing object is old
     // clone
     hobject_t coid = soid;
@@ -5156,7 +5167,9 @@ void ReplicatedPG::finish_ctx(OpContext *ctx, int log_op_type, bool maintain_ssc
 	  ctx->snapset_obc->obs.exists = false;
 	}
       }
-    } else if (ctx->new_snapset.clones.size()) {
+    } else if (ctx->new_snapset.clones.size() &&
+	       !ctx->cache_evict &&
+	       (!ctx->snapset_obc || !ctx->snapset_obc->obs.exists)) {
       // save snapset on _snap
       hobject_t snapoid(soid.oid, soid.get_key(), CEPH_SNAPDIR, soid.hash,
 			info.pgid.pool(), soid.get_namespace());
@@ -5167,7 +5180,8 @@ void ReplicatedPG::finish_ctx(OpContext *ctx, int log_op_type, bool maintain_ssc
 	                                eversion_t(),
 					0, osd_reqid_t(), ctx->mtime));
 
-      ctx->snapset_obc = get_object_context(snapoid, true);
+      if (!ctx->snapset_obc)
+	ctx->snapset_obc = get_object_context(snapoid, true);
       bool got = ctx->snapset_obc->get_write_greedy(ctx->op);
       assert(got);
       dout(20) << " got greedy write on snapset_obc " << *ctx->snapset_obc << dendl;
@@ -6968,6 +6982,7 @@ void ReplicatedPG::check_blacklisted_obc_watchers(ObjectContextRef obc)
     if (get_osdmap()->is_blacklisted(ea)) {
       dout(10) << "watch: Found blacklisted watcher for " << ea << dendl;
       assert(j->second->get_pg() == this);
+      j->second->unregister_cb();
       handle_watch_timeout(j->second);
     }
   }
@@ -9334,6 +9349,13 @@ void ReplicatedPG::on_removal(ObjectStore::Transaction *t)
   // adjust info to backfill
   info.last_backfill = hobject_t();
   dirty_info = true;
+
+
+  // clear log
+  PGLogEntryHandler rollbacker;
+  pg_log.clear_can_rollback_to(&rollbacker);
+  rollbacker.apply(this, t);
+
   write_if_dirty(*t);
 
   on_shutdown();
@@ -10952,7 +10974,7 @@ void ReplicatedPG::hit_set_persist()
       pg_log_entry_t::MODIFY,
       oid,
       ctx->at_version,
-      ctx->obs->oi.version,
+      eversion_t(),
       0,
       osd_reqid_t(),
       ctx->mtime)
@@ -11974,13 +11996,11 @@ ReplicatedPG::NotTrimming::NotTrimming(my_context ctx)
   : my_base(ctx), 
     NamedState(context< SnapTrimmer >().pg->cct, "NotTrimming")
 {
-  context< SnapTrimmer >().requeue = false;
   context< SnapTrimmer >().log_enter(state_name);
 }
 
 void ReplicatedPG::NotTrimming::exit()
 {
-  context< SnapTrimmer >().requeue = true;
   context< SnapTrimmer >().log_exit(state_name, enter_time);
 }
 
@@ -12040,32 +12060,45 @@ boost::statechart::result ReplicatedPG::TrimmingObjects::react(const SnapTrim&)
 
   dout(10) << "TrimmingObjects: trimming snap " << snap_to_trim << dendl;
 
-  // Get next
-  hobject_t old_pos = pos;
-  int r = pg->snap_mapper.get_next_object_to_trim(snap_to_trim, &pos);
-  if (r != 0 && r != -ENOENT) {
-    derr << __func__ << ": get_next returned " << cpp_strerror(r) << dendl;
-    assert(0);
-  } else if (r == -ENOENT) {
-    // Done!
-    dout(10) << "TrimmingObjects: got ENOENT" << dendl;
-    post_event(SnapTrim());
-    return transit< WaitingOnReplicas >();
+  for (set<RepGather *>::iterator i = repops.begin();
+       i != repops.end(); 
+       ) {
+    if ((*i)->all_applied && (*i)->all_committed) {
+      (*i)->put();
+      repops.erase(i++);
+    } else {
+      ++i;
+    }
   }
 
-  dout(10) << "TrimmingObjects react trimming " << pos << dendl;
-  RepGather *repop = pg->trim_object(pos);
-  if (!repop) {
-    dout(10) << __func__ << " could not get write lock on obj "
-	     << pos << dendl;
-    pos = old_pos;
-    return discard_event();
-  }
-  assert(repop);
-  repop->queue_snap_trimmer = true;
+  while (repops.size() < g_conf->osd_pg_max_concurrent_snap_trims) {
+    // Get next
+    hobject_t old_pos = pos;
+    int r = pg->snap_mapper.get_next_object_to_trim(snap_to_trim, &pos);
+    if (r != 0 && r != -ENOENT) {
+      derr << __func__ << ": get_next returned " << cpp_strerror(r) << dendl;
+      assert(0);
+    } else if (r == -ENOENT) {
+      // Done!
+      dout(10) << "TrimmingObjects: got ENOENT" << dendl;
+      post_event(SnapTrim());
+      return transit< WaitingOnReplicas >();
+    }
+
+    dout(10) << "TrimmingObjects react trimming " << pos << dendl;
+    RepGather *repop = pg->trim_object(pos);
+    if (!repop) {
+      dout(10) << __func__ << " could not get write lock on obj "
+	       << pos << dendl;
+      pos = old_pos;
+      return discard_event();
+    }
+    assert(repop);
+    repop->queue_snap_trimmer = true;
 
-  repops.insert(repop->get());
-  pg->simple_repop_submit(repop);
+    repops.insert(repop->get());
+    pg->simple_repop_submit(repop);
+  }
   return discard_event();
 }
 /* WaitingOnReplicasObjects */
@@ -12074,7 +12107,6 @@ ReplicatedPG::WaitingOnReplicas::WaitingOnReplicas(my_context ctx)
     NamedState(context< SnapTrimmer >().pg->cct, "Trimming/WaitingOnReplicas")
 {
   context< SnapTrimmer >().log_enter(state_name);
-  context< SnapTrimmer >().requeue = false;
 }
 
 void ReplicatedPG::WaitingOnReplicas::exit()
@@ -12099,7 +12131,7 @@ boost::statechart::result ReplicatedPG::WaitingOnReplicas::react(const SnapTrim&
   for (set<RepGather *>::iterator i = repops.begin();
        i != repops.end();
        repops.erase(i++)) {
-    if (!(*i)->all_applied) {
+    if (!(*i)->all_applied || !(*i)->all_committed) {
       return discard_event();
     } else {
       (*i)->put();
@@ -12124,7 +12156,7 @@ boost::statechart::result ReplicatedPG::WaitingOnReplicas::react(const SnapTrim&
   context<SnapTrimmer>().need_share_pg_info = true;
 
   // Back to the start
-  post_event(SnapTrim());
+  pg->queue_snap_trim();
   return transit< NotTrimming >();
 }
 
diff --git a/src/osd/ReplicatedPG.h b/src/osd/ReplicatedPG.h
index 9ef131c..dc8ee62 100644
--- a/src/osd/ReplicatedPG.h
+++ b/src/osd/ReplicatedPG.h
@@ -438,6 +438,7 @@ public:
     bool modify;          // (force) modification (even if op_t is empty)
     bool user_modify;     // user-visible modification
     bool undirty;         // user explicitly un-dirtying this object
+    bool cache_evict;     ///< true if this is a cache eviction
 
     // side effects
     list<watch_info_t> watch_connects;
@@ -539,7 +540,7 @@ public:
 	      ReplicatedPG *_pg) :
       op(_op), reqid(_reqid), ops(_ops), obs(_obs), snapset(0),
       new_obs(_obs->oi, _obs->exists),
-      modify(false), user_modify(false), undirty(false),
+      modify(false), user_modify(false), undirty(false), cache_evict(false),
       bytes_written(0), bytes_read(0), user_at_version(0),
       current_osd_subop_num(0),
       op_t(NULL),
@@ -1331,8 +1332,7 @@ private:
     set<RepGather *> repops;
     snapid_t snap_to_trim;
     bool need_share_pg_info;
-    bool requeue;
-    SnapTrimmer(ReplicatedPG *pg) : pg(pg), need_share_pg_info(false), requeue(false) {}
+    SnapTrimmer(ReplicatedPG *pg) : pg(pg), need_share_pg_info(false) {}
     ~SnapTrimmer();
     void log_enter(const char *state_name);
     void log_exit(const char *state_name, utime_t duration);
diff --git a/src/osd/Watch.h b/src/osd/Watch.h
index e2cbfc1..91a4574 100644
--- a/src/osd/Watch.h
+++ b/src/osd/Watch.h
@@ -98,6 +98,7 @@ class Notify {
   /// removes the timeout callback, called on completion or cancellation
   void unregister_cb();
 public:
+
   string gen_dbg_prefix() {
     stringstream ss;
     ss << "Notify(" << make_pair(cookie, notify_id) << " "
@@ -172,15 +173,15 @@ class Watch {
   /// Registers the timeout callback with watch_timer
   void register_cb();
 
-  /// Unregisters the timeout callback
-  void unregister_cb();
-
   /// send a Notify message when connected for notif
   void send_notify(NotifyRef notif);
 
   /// Cleans up state on discard or remove (including Connection state, obc)
   void discard_state();
 public:
+  /// Unregisters the timeout callback
+  void unregister_cb();
+
   /// NOTE: must be called with pg lock held
   ~Watch();
 
diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc
index 16bdbaf..d08e9b7 100644
--- a/src/osd/osd_types.cc
+++ b/src/osd/osd_types.cc
@@ -2179,9 +2179,9 @@ void pg_interval_t::dump(Formatter *f) const
   f->open_array_section("acting");
   for (vector<int>::const_iterator p = acting.begin(); p != acting.end(); ++p)
     f->dump_int("osd", *p);
+  f->close_section();
   f->dump_int("primary", primary);
   f->dump_int("up_primary", up_primary);
-  f->close_section();
 }
 
 void pg_interval_t::generate_test_instances(list<pg_interval_t*>& o)
@@ -2235,9 +2235,15 @@ bool pg_interval_t::check_new_interval(
     i.primary = old_acting_primary;
     i.up_primary = old_up_primary;
 
-    if (!i.acting.empty() && i.primary != -1 &&
-	i.acting.size() >=
-	lastmap->get_pools().find(pool_id)->second.min_size) {
+    unsigned num_acting = 0;
+    for (vector<int>::const_iterator p = i.acting.begin(); p != i.acting.end();
+	 ++p)
+      if (*p != CRUSH_ITEM_NONE)
+	++num_acting;
+
+    if (num_acting &&
+	i.primary != -1 &&
+	num_acting >= lastmap->get_pools().find(pgid.pool())->second.min_size) {
       if (out)
 	*out << "generate_past_intervals " << i
 	     << ": not rw,"
diff --git a/src/osdc/ObjectCacher.cc b/src/osdc/ObjectCacher.cc
index e1499b4..95abee1 100644
--- a/src/osdc/ObjectCacher.cc
+++ b/src/osdc/ObjectCacher.cc
@@ -796,6 +796,8 @@ void ObjectCacher::bh_read_finish(int64_t poolid, sobject_t oid, ceph_tid_t tid,
   ldout(cct, 20) << "finishing waiters " << ls << dendl;
 
   finish_contexts(cct, ls, err);
+  retry_waiting_reads();
+
   --reads_outstanding;
   read_cond.Signal();
 }
@@ -1105,18 +1107,35 @@ int ObjectCacher::_readx(OSDRead *rd, ObjectSet *oset, Context *onfinish,
       // TODO: make read path not call _readx for every completion
       hits.insert(errors.begin(), errors.end());
     }
-    
+
     if (!missing.empty() || !rx.empty()) {
       // read missing
       for (map<loff_t, BufferHead*>::iterator bh_it = missing.begin();
            bh_it != missing.end();
            ++bh_it) {
-        bh_read(bh_it->second);
-        if (success && onfinish) {
-          ldout(cct, 10) << "readx missed, waiting on " << *bh_it->second 
-                   << " off " << bh_it->first << dendl;
-	  bh_it->second->waitfor_read[bh_it->first].push_back( new C_RetryRead(this, rd, oset, onfinish) );
-        }
+	uint64_t rx_bytes = static_cast<uint64_t>(
+	  stat_rx + bh_it->second->length());
+	if (!waitfor_read.empty() || rx_bytes > max_size) {
+	  // cache is full with concurrent reads -- wait for rx's to complete
+	  // to constrain memory growth (especially during copy-ups)
+	  if (success) {
+	    ldout(cct, 10) << "readx missed, waiting on cache to complete "
+			   << waitfor_read.size() << " blocked reads, "
+			   << (MAX(rx_bytes, max_size) - max_size)
+			   << " read bytes" << dendl;
+	    waitfor_read.push_back(new C_RetryRead(this, rd, oset, onfinish));
+	  }
+
+	  bh_remove(o, bh_it->second);
+	  delete bh_it->second;
+	} else {
+	  bh_read(bh_it->second);
+	  if (success && onfinish) {
+	    ldout(cct, 10) << "readx missed, waiting on " << *bh_it->second
+			   << " off " << bh_it->first << dendl;
+	    bh_it->second->waitfor_read[bh_it->first].push_back( new C_RetryRead(this, rd, oset, onfinish) );
+	  }
+	}
         bytes_not_in_cache += bh_it->second->length();
 	success = false;
       }
@@ -1230,7 +1249,7 @@ int ObjectCacher::_readx(OSDRead *rd, ObjectSet *oset, Context *onfinish,
   // no misses... success!  do the read.
   assert(!hit_ls.empty());
   ldout(cct, 10) << "readx has all buffers" << dendl;
-  
+
   // ok, assemble into result buffer.
   uint64_t pos = 0;
   if (rd->bl && !error) {
@@ -1263,6 +1282,18 @@ int ObjectCacher::_readx(OSDRead *rd, ObjectSet *oset, Context *onfinish,
   return ret;
 }
 
+void ObjectCacher::retry_waiting_reads()
+{
+  list<Context *> ls;
+  ls.swap(waitfor_read);
+
+  while (!ls.empty() && waitfor_read.empty()) {
+    Context *ctx = ls.front();
+    ls.pop_front();
+    ctx->complete(0);
+  }
+  waitfor_read.splice(waitfor_read.end(), ls);
+}
 
 int ObjectCacher::writex(OSDWrite *wr, ObjectSet *oset, Mutex& wait_on_lock,
 			 Context *onfreespace)
diff --git a/src/osdc/ObjectCacher.h b/src/osdc/ObjectCacher.h
index d2aebe9..b48f8ac 100644
--- a/src/osdc/ObjectCacher.h
+++ b/src/osdc/ObjectCacher.h
@@ -341,6 +341,8 @@ class ObjectCacher {
 
   vector<ceph::unordered_map<sobject_t, Object*> > objects; // indexed by pool_id
 
+  list<Context*> waitfor_read;
+
   ceph_tid_t last_read_tid;
 
   set<BufferHead*>    dirty_bh;
@@ -457,6 +459,7 @@ class ObjectCacher {
 
   int _readx(OSDRead *rd, ObjectSet *oset, Context *onfinish,
 	     bool external_call);
+  void retry_waiting_reads();
 
  public:
   void bh_read_finish(int64_t poolid, sobject_t oid, ceph_tid_t tid,
diff --git a/src/osdc/Objecter.cc b/src/osdc/Objecter.cc
index d82b3e1..57954a5 100644
--- a/src/osdc/Objecter.cc
+++ b/src/osdc/Objecter.cc
@@ -1615,7 +1615,10 @@ void Objecter::finish_op(Op *op)
 
   ops.erase(op->tid);
   logger->set(l_osdc_op_active, ops.size());
-  assert(check_latest_map_ops.find(op->tid) == check_latest_map_ops.end());
+
+  // our reply may have raced with pool deletion resulting in a map
+  // check in flight.
+  op_cancel_map_check(op);
 
   if (op->ontimeout)
     timer.cancel_event(op->ontimeout);
@@ -1640,7 +1643,9 @@ void Objecter::send_op(Op *op)
     ldout(cct, 20) << " revoking rx buffer for " << op->tid << " on " << op->con << dendl;
     op->con->revoke_rx_buffer(op->tid);
   }
-  if (op->outbl && op->outbl->length()) {
+  if (op->outbl &&
+      op->ontimeout == NULL &&  // only post rx_buffer if no timeout; see #9582
+      op->outbl->length()) {
     ldout(cct, 20) << " posting rx buffer for " << op->tid << " on " << op->session->con << dendl;
     op->con = op->session->con;
     op->con->post_rx_buffer(op->tid, *op->outbl);
diff --git a/src/pybind/rados.py b/src/pybind/rados.py
index 0fbd10e..ec68919 100644
--- a/src/pybind/rados.py
+++ b/src/pybind/rados.py
@@ -246,7 +246,8 @@ Rados object in state %s." % (self.state))
 
     def shutdown(self):
         """
-        Disconnects from the cluster.
+        Disconnects from the cluster.  Call this explicitly when a
+        Rados.connect()ed object is no longer used.
         """
         if (self.__dict__.has_key("state") and self.state != "shutdown"):
             run_in_thread(self.librados.rados_shutdown, (self.cluster,))
@@ -260,9 +261,6 @@ Rados object in state %s." % (self.state))
         self.shutdown()
         return False
 
-    def __del__(self):
-        self.shutdown()
-
     def version(self):
         """
         Get the version number of the ``librados`` C library.
@@ -410,7 +408,7 @@ Rados object in state %s." % (self.state))
 
     def connect(self, timeout=0):
         """
-        Connect to the cluster.
+        Connect to the cluster.  Use shutdown() to release resources.
         """
         self.require_state("configuring")
         ret = run_in_thread(self.librados.rados_connect, (self.cluster,),
diff --git a/src/rgw/Makefile.am b/src/rgw/Makefile.am
index 3d6886d..78c022b 100644
--- a/src/rgw/Makefile.am
+++ b/src/rgw/Makefile.am
@@ -53,6 +53,18 @@ LIBRGW_DEPS += \
 	-lfcgi \
 	-ldl
 
+CIVETWEB_INCLUDE = --include civetweb/include/civetweb_conf.h
+
+libcivetweb_la_SOURCES =  \
+	rgw/rgw_civetweb.cc \
+	rgw/rgw_civetweb_log.cc \
+	civetweb/src/civetweb.c
+
+libcivetweb_la_CXXFLAGS = ${CIVETWEB_INCLUDE} -Woverloaded-virtual ${AM_CXXFLAGS}
+libcivetweb_la_CFLAGS = -Icivetweb/include ${CIVETWEB_INCLUDE}
+
+noinst_LTLIBRARIES += libcivetweb.la
+
 radosgw_SOURCES = \
 	rgw/rgw_resolve.cc \
 	rgw/rgw_rest.cc \
@@ -71,11 +83,9 @@ radosgw_SOURCES = \
 	rgw/rgw_swift.cc \
 	rgw/rgw_swift_auth.cc \
 	rgw/rgw_loadgen.cc \
-	rgw/rgw_civetweb.cc \
-	civetweb/src/civetweb.c \
 	rgw/rgw_main.cc
-radosgw_CFLAGS = -Icivetweb/include
-radosgw_LDADD = $(LIBRGW) $(LIBRGW_DEPS) $(RESOLV_LIBS) $(CEPH_GLOBAL)
+radosgw_CFLAGS = -I$(srcdir)/civetweb/include
+radosgw_LDADD = $(LIBRGW) $(LIBCIVETWEB) $(LIBRGW_DEPS) $(RESOLV_LIBS) $(CEPH_GLOBAL)
 bin_PROGRAMS += radosgw
 
 radosgw_admin_SOURCES = rgw/rgw_admin.cc
@@ -158,7 +168,9 @@ noinst_HEADERS += \
 	rgw/rgw_bucket.h \
 	rgw/rgw_keystone.h \
 	rgw/rgw_civetweb.h \
+	rgw/rgw_civetweb_log.h \
 	civetweb/civetweb.h \
 	civetweb/include/civetweb.h \
+	civetweb/include/civetweb_conf.h \
 	civetweb/src/md5.h
 
diff --git a/src/rgw/rgw_admin.cc b/src/rgw/rgw_admin.cc
index e31a28a..f6c5619 100644
--- a/src/rgw/rgw_admin.cc
+++ b/src/rgw/rgw_admin.cc
@@ -679,7 +679,11 @@ void set_quota_info(RGWQuotaInfo& quota, int opt_cmd, int64_t max_size, int64_t
         quota.max_objects = max_objects;
       }
       if (have_max_size) {
-        quota.max_size_kb = rgw_rounded_kb(max_size);
+        if (max_size < 0) {
+          quota.max_size_kb = -1;
+        } else {
+          quota.max_size_kb = rgw_rounded_kb(max_size);
+        }
       }
       break;
     case OPT_QUOTA_DISABLE:
@@ -1364,7 +1368,13 @@ int main(int argc, char **argv)
       cerr << "could not create user: " << err_msg << std::endl;
       return -ret;
     }
-
+    if (!subuser.empty()) {
+      ret = user.subusers.add(user_op, &err_msg);
+      if (ret < 0) {
+        cerr << "could not create subuser: " << err_msg << std::endl;
+        return -ret;
+      }
+    }
     break;
   case OPT_USER_RM:
     ret = user.remove(user_op, &err_msg);
diff --git a/src/rgw/rgw_civetweb.cc b/src/rgw/rgw_civetweb.cc
index a31177f..b44a40c 100644
--- a/src/rgw/rgw_civetweb.cc
+++ b/src/rgw/rgw_civetweb.cc
@@ -42,7 +42,7 @@ int RGWMongoose::complete_request()
 
       if (0 && data.length() == 0) {
         has_content_length = true;
-        print("Transfer-Enconding: %s\n", "chunked");
+        print("Transfer-Enconding: %s\r\n", "chunked");
         data.append("0\r\n\r\n", sizeof("0\r\n\r\n")-1);
       } else {
         int r = send_content_length(data.length());
@@ -128,7 +128,7 @@ int RGWMongoose::send_status(const char *status, const char *status_name)
   if (!status_name)
     status_name = "";
 
-  snprintf(buf, sizeof(buf), "HTTP/1.1 %s %s\n", status, status_name);
+  snprintf(buf, sizeof(buf), "HTTP/1.1 %s %s\r\n", status, status_name);
 
   bufferlist bl;
   bl.append(buf);
@@ -168,5 +168,5 @@ int RGWMongoose::send_content_length(uint64_t len)
   has_content_length = true;
   char buf[21];
   snprintf(buf, sizeof(buf), "%"PRIu64, len);
-  return print("Content-Length: %s\n", buf);
+  return print("Content-Length: %s\r\n", buf);
 }
diff --git a/src/rgw/rgw_civetweb_log.cc b/src/rgw/rgw_civetweb_log.cc
new file mode 100644
index 0000000..720bab5
--- /dev/null
+++ b/src/rgw/rgw_civetweb_log.cc
@@ -0,0 +1,14 @@
+#include "common/config.h"
+#include "rgw_common.h"
+
+#include "civetweb/civetweb.h"
+
+#define dout_subsys ceph_subsys_civetweb
+
+
+int rgw_civetweb_log_callback(const struct mg_connection *conn, const char *buf) {
+  dout(10) << "civetweb: " << (void *)conn << ": " << buf << dendl;
+  return 0;
+}
+
+
diff --git a/src/rgw/rgw_civetweb_log.h b/src/rgw/rgw_civetweb_log.h
new file mode 100644
index 0000000..6c6b2c0
--- /dev/null
+++ b/src/rgw/rgw_civetweb_log.h
@@ -0,0 +1,6 @@
+#ifndef CEPH_RGW_CIVETWEB_LOG_H
+#define CEPH_RGW_CIVETWEB_LOG_H
+
+int rgw_civetweb_log_callback(const struct mg_connection *conn, const char *buf);
+
+#endif
diff --git a/src/rgw/rgw_common.cc b/src/rgw/rgw_common.cc
index 5a1043f..36b8ed3 100644
--- a/src/rgw/rgw_common.cc
+++ b/src/rgw/rgw_common.cc
@@ -506,7 +506,7 @@ int XMLArgs::parse()
     }
     string substr, nameval;
     substr = str.substr(pos, fpos - pos);
-    url_decode(substr, nameval);
+    url_decode(substr, nameval, true);
     NameVal nv(nameval);
     int ret = nv.parse();
     if (ret >= 0) {
@@ -690,14 +690,13 @@ static char hex_to_num(char c)
   return hex_table.to_num(c);
 }
 
-bool url_decode(string& src_str, string& dest_str)
+bool url_decode(string& src_str, string& dest_str, bool in_query)
 {
   const char *src = src_str.c_str();
   char dest[src_str.size() + 1];
   int pos = 0;
   char c;
 
-  bool in_query = false;
   while (*src) {
     if (*src != '%') {
       if (!in_query || *src != '+') {
diff --git a/src/rgw/rgw_common.h b/src/rgw/rgw_common.h
index 4d7a118..432e82a 100644
--- a/src/rgw/rgw_common.h
+++ b/src/rgw/rgw_common.h
@@ -1343,7 +1343,7 @@ extern bool verify_object_permission(struct req_state *s, RGWAccessControlPolicy
 extern bool verify_object_permission(struct req_state *s, int perm);
 /** Convert an input URL into a sane object name
  * by converting %-escaped strings into characters, etc*/
-extern bool url_decode(string& src_str, string& dest_str);
+extern bool url_decode(string& src_str, string& dest_str, bool in_query = false);
 extern void url_encode(const string& src, string& dst);
 
 extern void calc_hmac_sha1(const char *key, int key_len,
diff --git a/src/rgw/rgw_fcgi.cc b/src/rgw/rgw_fcgi.cc
index 4b24dab..a9af45c 100644
--- a/src/rgw/rgw_fcgi.cc
+++ b/src/rgw/rgw_fcgi.cc
@@ -32,7 +32,7 @@ void RGWFCGX::init_env(CephContext *cct)
 
 int RGWFCGX::send_status(const char *status, const char *status_name)
 {
-  return print("Status: %s\n", status);
+  return print("Status: %s %s\r\n", status, status_name);
 }
 
 int RGWFCGX::send_100_continue()
@@ -48,7 +48,7 @@ int RGWFCGX::send_content_length(uint64_t len)
 {
   char buf[21];
   snprintf(buf, sizeof(buf), "%"PRIu64, len);
-  return print("Content-Length: %s\n", buf);
+  return print("Content-Length: %s\r\n", buf);
 }
 
 int RGWFCGX::complete_header()
diff --git a/src/rgw/rgw_http_client.cc b/src/rgw/rgw_http_client.cc
index 1c6b6d4..3adc0ae 100644
--- a/src/rgw/rgw_http_client.cc
+++ b/src/rgw/rgw_http_client.cc
@@ -42,17 +42,8 @@ static size_t send_http_data(void *ptr, size_t size, size_t nmemb, void *_info)
   return ret;
 }
 
-int RGWHTTPClient::process(const char *method, const char *url)
+static curl_slist *headers_to_slist(list<pair<string, string> >& headers)
 {
-  int ret = 0;
-  CURL *curl_handle;
-
-  char error_buf[CURL_ERROR_SIZE];
-
-  curl_handle = curl_easy_init();
-
-  dout(20) << "sending request to " << url << dendl;
-
   curl_slist *h = NULL;
 
   list<pair<string, string> >::iterator iter;
@@ -63,11 +54,37 @@ int RGWHTTPClient::process(const char *method, const char *url)
     if (strncmp(val.c_str(), "HTTP_", 5) == 0) {
       val = val.substr(5);
     }
+
+    /* we need to convert all underscores into dashes as some web servers forbid them
+     * in the http header field names
+     */
+    for (size_t i = 0; i < val.size(); i++) {
+      if (val[i] == '_') {
+        val[i] = '-';
+      }
+    }
+
     val.append(": ");
     val.append(p.second);
     h = curl_slist_append(h, val.c_str());
   }
 
+  return h;
+}
+
+int RGWHTTPClient::process(const char *method, const char *url)
+{
+  int ret = 0;
+  CURL *curl_handle;
+
+  char error_buf[CURL_ERROR_SIZE];
+
+  curl_handle = curl_easy_init();
+
+  dout(20) << "sending request to " << url << dendl;
+
+  curl_slist *h = headers_to_slist(headers);
+
   curl_easy_setopt(curl_handle, CURLOPT_CUSTOMREQUEST, method);
   curl_easy_setopt(curl_handle, CURLOPT_URL, url);
   curl_easy_setopt(curl_handle, CURLOPT_NOPROGRESS, 1L);
@@ -139,20 +156,7 @@ int RGWHTTPClient::init_async(const char *method, const char *url, void **handle
 
   dout(20) << "sending request to " << url << dendl;
 
-  curl_slist *h = NULL;
-
-  list<pair<string, string> >::iterator iter;
-  for (iter = headers.begin(); iter != headers.end(); ++iter) {
-    pair<string, string>& p = *iter;
-    string val = p.first;
-
-    if (strncmp(val.c_str(), "HTTP_", 5) == 0) {
-      val = val.substr(5);
-    }
-    val.append(": ");
-    val.append(p.second);
-    h = curl_slist_append(h, val.c_str());
-  }
+  curl_slist *h = headers_to_slist(headers);
 
   req_data->h = h;
 
diff --git a/src/rgw/rgw_json_enc.cc b/src/rgw/rgw_json_enc.cc
index a198678..c0f8311 100644
--- a/src/rgw/rgw_json_enc.cc
+++ b/src/rgw/rgw_json_enc.cc
@@ -429,7 +429,7 @@ static void decode_swift_keys(map<string, RGWAccessKey>& m, JSONObj *o)
 {
   RGWAccessKey k;
   k.decode_json(o, true);
-  m[k.subuser] = k;
+  m[k.id] = k;
 }
 
 static void decode_subusers(map<string, RGWSubUser>& m, JSONObj *o)
diff --git a/src/rgw/rgw_main.cc b/src/rgw/rgw_main.cc
index 9614b07..fc40b64 100644
--- a/src/rgw/rgw_main.cc
+++ b/src/rgw/rgw_main.cc
@@ -54,6 +54,7 @@
 #include "rgw_resolve.h"
 #include "rgw_loadgen.h"
 #include "rgw_civetweb.h"
+#include "rgw_civetweb_log.h"
 
 #include "civetweb/civetweb.h"
 
@@ -93,6 +94,8 @@ struct RGWRequest
   RGWRequest() : id(0), s(NULL), op(NULL) {
   }
 
+  virtual ~RGWRequest() {}
+
   void init_state(req_state *_s) {
     s = _s;
   }
@@ -141,6 +144,8 @@ public:
   bool get_val(const string& key, const string& def_val, string *out);
   bool get_val(const string& key, int def_val, int *out);
 
+  map<string, string>& get_config_map() { return config_map; }
+
   string get_framework() { return framework; }
 };
 
@@ -636,6 +641,10 @@ void RGWFCGXProcess::handle_request(RGWRequest *r)
 
   FCGX_Finish_r(fcgx);
 
+  if (store->ctx()->_conf->rgw_fcgi_explicit_free) {
+    FCGX_Free(fcgx, 1);
+  }
+
   delete req;
 }
 
@@ -909,6 +918,12 @@ class RGWMongooseFrontend : public RGWFrontend {
   struct mg_context *ctx;
   RGWProcessEnv env;
 
+  void set_conf_default(map<string, string>& m, const string& key, const string& def_val) {
+    if (m.find(key) == m.end()) {
+      m[key] = def_val;
+    }
+  }
+
 public:
   RGWMongooseFrontend(RGWProcessEnv& pe, RGWFrontendConfig *_conf) : conf(_conf), ctx(NULL), env(pe) {
   }
@@ -921,12 +936,28 @@ public:
     char thread_pool_buf[32];
     snprintf(thread_pool_buf, sizeof(thread_pool_buf), "%d", (int)g_conf->rgw_thread_pool_size);
     string port_str;
+    map<string, string> conf_map = conf->get_config_map();
     conf->get_val("port", "80", &port_str);
-    const char *options[] = {"listening_ports", port_str.c_str(), "enable_keep_alive", "yes", "num_threads", thread_pool_buf, NULL};
+    conf_map.erase("port");
+    conf_map["listening_ports"] = port_str;
+    set_conf_default(conf_map, "enable_keep_alive", "yes");
+    set_conf_default(conf_map, "num_threads", thread_pool_buf);
+    set_conf_default(conf_map, "decode_url", "no");
+
+    const char *options[conf_map.size() * 2 + 1];
+    int i = 0;
+    for (map<string, string>::iterator iter = conf_map.begin(); iter != conf_map.end(); ++iter) {
+      options[i] = iter->first.c_str();
+      options[i + 1] = iter->second.c_str();
+      dout(20)<< "civetweb config: " << options[i] << ": " << (options[i + 1] ? options[i + 1] : "<null>") << dendl;
+      i += 2;
+    }
+    options[i] = NULL;
 
     struct mg_callbacks cb;
     memset((void *)&cb, 0, sizeof(cb));
     cb.begin_request = civetweb_callback;
+    cb.log_message = rgw_civetweb_log_callback;
     ctx = mg_start(&cb, &env, (const char **)&options);
 
     if (!ctx) {
@@ -965,7 +996,7 @@ int main(int argc, const char **argv)
   vector<const char *> def_args;
   def_args.push_back("--debug-rgw=1/5");
   def_args.push_back("--keyring=$rgw_data/keyring");
-  def_args.push_back("--log-file=/var/log/radosgw/$cluster-$name");
+  def_args.push_back("--log-file=/var/log/radosgw/$cluster-$name.log");
 
   vector<const char*> args;
   argv_to_vec(argc, argv, args);
diff --git a/src/rgw/rgw_op.cc b/src/rgw/rgw_op.cc
index 7694748..4cc12ea 100644
--- a/src/rgw/rgw_op.cc
+++ b/src/rgw/rgw_op.cc
@@ -33,7 +33,8 @@ using ceph::crypto::MD5;
 static string mp_ns = RGW_OBJ_NS_MULTIPART;
 static string shadow_ns = RGW_OBJ_NS_SHADOW;
 
-#define MULTIPART_UPLOAD_ID_PREFIX "2/" // must contain a unique char that may not come up in gen_rand_alpha()
+#define MULTIPART_UPLOAD_ID_PREFIX_LEGACY "2/"
+#define MULTIPART_UPLOAD_ID_PREFIX "2~" // must contain a unique char that may not come up in gen_rand_alpha()
 
 class MultipartMetaFilter : public RGWAccessListFilter {
 public:
@@ -1438,7 +1439,8 @@ static bool is_v2_upload_id(const string& upload_id)
 {
   const char *uid = upload_id.c_str();
 
-  return (strncmp(uid, MULTIPART_UPLOAD_ID_PREFIX, sizeof(MULTIPART_UPLOAD_ID_PREFIX) - 1) == 0);
+  return (strncmp(uid, MULTIPART_UPLOAD_ID_PREFIX, sizeof(MULTIPART_UPLOAD_ID_PREFIX) - 1) == 0) ||
+         (strncmp(uid, MULTIPART_UPLOAD_ID_PREFIX_LEGACY, sizeof(MULTIPART_UPLOAD_ID_PREFIX_LEGACY) - 1) == 0);
 }
 
 int RGWPutObjProcessor_Multipart::do_complete(string& etag, time_t *mtime, time_t set_mtime, map<string, bufferlist>& attrs)
@@ -1524,64 +1526,18 @@ void RGWPutObj::pre_exec()
   rgw_bucket_object_pre_exec(s);
 }
 
-static int put_obj_user_manifest_iterate_cb(rgw_bucket& bucket, RGWObjEnt& ent, RGWAccessControlPolicy *bucket_policy, off_t start_ofs, off_t end_ofs,
-                                       void *param)
-{
-  RGWPutObj *op = (RGWPutObj *)param;
-  return op->user_manifest_iterate_cb(bucket, ent, bucket_policy, start_ofs, end_ofs);
-}
-
-int RGWPutObj::user_manifest_iterate_cb(rgw_bucket& bucket, RGWObjEnt& ent, RGWAccessControlPolicy *bucket_policy, off_t start_ofs, off_t end_ofs)
-{
-  rgw_obj part(bucket, ent.name);
-
-  map<string, bufferlist> attrs;
-
-  int ret = get_obj_attrs(store, s, part, attrs, NULL, NULL);
-  if (ret < 0) {
-    return ret;
-  }
-  map<string, bufferlist>::iterator iter = attrs.find(RGW_ATTR_ETAG);
-  if (iter == attrs.end()) {
-    return 0;
-  }
-  bufferlist& bl = iter->second;
-  const char *buf = bl.c_str();
-  int len = bl.length();
-  while (len > 0 && buf[len - 1] == '\0') {
-    len--;
-  }
-  if (len > 0) {
-    user_manifest_parts_hash->Update((const byte *)bl.c_str(), len);
-  }
-
-  if (s->cct->_conf->subsys.should_gather(ceph_subsys_rgw, 20)) {
-    string e(bl.c_str(), bl.length());
-    ldout(s->cct, 20) << __func__ << ": appending user manifest etag: " << e << dendl;
-  }
-
-  return 0;
-}
-
 static int put_data_and_throttle(RGWPutObjProcessor *processor, bufferlist& data, off_t ofs,
                                  MD5 *hash, bool need_to_wait)
 {
-  const unsigned char *data_ptr = (hash ? (const unsigned char *)data.c_str() : NULL);
   bool again;
-  uint64_t len = data.length();
 
   do {
     void *handle;
 
-    int ret = processor->handle_data(data, ofs, &handle, &again);
+    int ret = processor->handle_data(data, ofs, hash, &handle, &again);
     if (ret < 0)
       return ret;
 
-    if (hash) {
-      hash->Update(data_ptr, len);
-      hash = NULL; /* only calculate hash once */
-    }
-
     ret = processor->throttle_data(handle, need_to_wait);
     if (ret < 0)
       return ret;
@@ -1719,6 +1675,7 @@ void RGWPutObj::execute()
   }
 
   if (need_calc_md5) {
+    processor->complete_hash(&hash);
     hash.Final(m);
 
     buf_to_hex(m, CEPH_CRYPTO_MD5_DIGESTSIZE, calc_md5);
@@ -1737,7 +1694,6 @@ void RGWPutObj::execute()
     bufferlist manifest_bl;
     string manifest_obj_prefix;
     string manifest_bucket;
-    RGWBucketInfo bucket_info;
 
     char etag_buf[CEPH_CRYPTO_MD5_DIGESTSIZE];
     char etag_buf_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 16];
@@ -1755,16 +1711,6 @@ void RGWPutObj::execute()
     manifest_bucket = prefix_str.substr(0, pos);
     manifest_obj_prefix = prefix_str.substr(pos + 1);
 
-    ret = store->get_bucket_info(NULL, manifest_bucket, bucket_info, NULL, NULL);
-    if (ret < 0) {
-      ldout(s->cct, 0) << "could not get bucket info for bucket=" << manifest_bucket << dendl;
-    }
-    ret = iterate_user_manifest_parts(s->cct, store, 0, -1, bucket_info.bucket, manifest_obj_prefix,
-                                      NULL, NULL, put_obj_user_manifest_iterate_cb, (void *)this);
-    if (ret < 0) {
-      goto done;
-    }
-
     hash.Final((byte *)etag_buf);
     buf_to_hex((const unsigned char *)etag_buf, CEPH_CRYPTO_MD5_DIGESTSIZE, etag_buf_str);
 
@@ -1940,10 +1886,14 @@ void RGWPutMetadata::execute()
   /* no need to track object versioning, need it for bucket's data only */
   RGWObjVersionTracker *ptracker = (s->object ? NULL : &s->bucket_info.objv_tracker);
 
-  /* check if obj exists, read orig attrs */
-  ret = get_obj_attrs(store, s, obj, orig_attrs, NULL, ptracker);
-  if (ret < 0)
-    return;
+  if (s->object) {
+    /* check if obj exists, read orig attrs */
+    ret = get_obj_attrs(store, s, obj, orig_attrs, NULL, ptracker);
+    if (ret < 0)
+      return;
+  } else {
+    orig_attrs = s->bucket_attrs;
+  }
 
   /* only remove meta attrs */
   for (iter = orig_attrs.begin(); iter != orig_attrs.end(); ++iter) {
@@ -2214,6 +2164,7 @@ void RGWCopyObj::execute()
                         replace_attrs,
                         attrs, RGW_OBJ_CATEGORY_MAIN,
                         &s->req_id, /* use req_id as tag */
+                        &etag,
                         &s->err,
                         copy_obj_progress_cb, (void *)this
                         );
@@ -2277,7 +2228,6 @@ void RGWPutACLs::execute()
   RGWAccessControlPolicy_S3 new_policy(s->cct);
   stringstream ss;
   char *new_data = NULL;
-  ACLOwner owner;
   rgw_obj obj;
 
   ret = 0;
@@ -2287,8 +2237,10 @@ void RGWPutACLs::execute()
     return;
   }
 
-  owner.set_id(s->user.user_id);
-  owner.set_name(s->user.display_name);
+
+  RGWAccessControlPolicy *existing_policy = (s->object == NULL? s->bucket_acl : s->object_acl);
+
+  owner = existing_policy->get_owner();
 
   ret = get_params();
   if (ret < 0)
@@ -2536,7 +2488,7 @@ void RGWInitMultipart::execute()
   do {
     char buf[33];
     gen_rand_alphanumeric(s->cct, buf, sizeof(buf) - 1);
-    upload_id = "2/"; /* v2 upload id */
+    upload_id = MULTIPART_UPLOAD_ID_PREFIX; /* v2 upload id */
     upload_id.append(buf);
 
     string tmp_obj_name;
diff --git a/src/rgw/rgw_op.h b/src/rgw/rgw_op.h
index b141ed5..bd6f964 100644
--- a/src/rgw/rgw_op.h
+++ b/src/rgw/rgw_op.h
@@ -345,8 +345,6 @@ public:
   RGWPutObjProcessor *select_processor(bool *is_multipart);
   void dispose_processor(RGWPutObjProcessor *processor);
 
-  int user_manifest_iterate_cb(rgw_bucket& bucket, RGWObjEnt& ent, RGWAccessControlPolicy *bucket_policy, off_t start_ofs, off_t end_ofs);
-
   int verify_permission();
   void pre_exec();
   void execute();
@@ -490,6 +488,7 @@ protected:
   string source_zone;
   string client_id;
   string op_id;
+  string etag;
 
   off_t last_ofs;
 
@@ -556,6 +555,7 @@ protected:
   int ret;
   size_t len;
   char *data;
+  ACLOwner owner;
 
 public:
   RGWPutACLs() {
diff --git a/src/rgw/rgw_rados.cc b/src/rgw/rgw_rados.cc
index e22bef0..52f8a70 100644
--- a/src/rgw/rgw_rados.cc
+++ b/src/rgw/rgw_rados.cc
@@ -876,6 +876,11 @@ int RGWPutObjProcessor::complete(string& etag, time_t *mtime, time_t set_mtime,
   return 0;
 }
 
+CephContext *RGWPutObjProcessor::ctx()
+{
+  return store->ctx();
+}
+
 RGWPutObjProcessor::~RGWPutObjProcessor()
 {
   if (is_complete)
@@ -900,8 +905,10 @@ int RGWPutObjProcessor_Plain::prepare(RGWRados *store, void *obj_ctx, string *oi
   return 0;
 };
 
-int RGWPutObjProcessor_Plain::handle_data(bufferlist& bl, off_t _ofs, void **phandle, bool *again)
+int RGWPutObjProcessor_Plain::handle_data(bufferlist& bl, off_t _ofs, MD5 *hash, void **phandle, bool *again)
 {
+  assert(!hash);
+
   *again = false;
 
   if (ofs != _ofs)
@@ -1028,7 +1035,7 @@ int RGWPutObjProcessor_Atomic::write_data(bufferlist& bl, off_t ofs, void **phan
   return RGWPutObjProcessor_Aio::handle_obj_data(cur_obj, bl, ofs - cur_part_ofs, ofs, phandle, exclusive);
 }
 
-int RGWPutObjProcessor_Atomic::handle_data(bufferlist& bl, off_t ofs, void **phandle, bool *again)
+int RGWPutObjProcessor_Atomic::handle_data(bufferlist& bl, off_t ofs, MD5 *hash, void **phandle, bool *again)
 {
   *again = false;
 
@@ -1062,7 +1069,10 @@ int RGWPutObjProcessor_Atomic::handle_data(bufferlist& bl, off_t ofs, void **pha
   if (!data_ofs && !immutable_head()) {
     first_chunk.claim(bl);
     obj_len = (uint64_t)first_chunk.length();
-    int r = prepare_next_part(first_chunk.length());
+    if (hash) {
+      hash->Update((const byte *)first_chunk.c_str(), obj_len);
+    }
+    int r = prepare_next_part(obj_len);
     if (r < 0) {
       return r;
     }
@@ -1074,7 +1084,19 @@ int RGWPutObjProcessor_Atomic::handle_data(bufferlist& bl, off_t ofs, void **pha
   bool exclusive = (!write_ofs && immutable_head()); /* immutable head object, need to verify nothing exists there
                                                         we could be racing with another upload, to the same
                                                         object and cleanup can be messy */
-  return write_data(bl, write_ofs, phandle, exclusive);
+  int ret = write_data(bl, write_ofs, phandle, exclusive);
+  if (ret >= 0) { /* we might return, need to clear bl as it was already sent */
+    if (hash) {
+      hash->Update((const byte *)bl.c_str(), bl.length());
+    }
+    bl.clear();
+  }
+  return ret;
+}
+
+void RGWPutObjProcessor_Atomic::complete_hash(MD5 *hash)
+{
+  hash->Update((const byte *)pending_data_bl.c_str(), pending_data_bl.length());
 }
 
 
@@ -3019,7 +3041,7 @@ public:
 
     do {
       void *handle;
-      int ret = processor->handle_data(bl, ofs, &handle, &again);
+      int ret = processor->handle_data(bl, ofs, NULL, &handle, &again);
       if (ret < 0)
         return ret;
 
@@ -3029,6 +3051,11 @@ public:
          */
         ret = opstate->renew_state();
         if (ret < 0) {
+          ldout(processor->ctx(), 0) << "ERROR: RGWRadosPutObj::handle_data(): failed to renew op state ret=" << ret << dendl;
+          int r = processor->throttle_data(handle, false);
+          if (r < 0) {
+            ldout(processor->ctx(), 0) << "ERROR: RGWRadosPutObj::handle_data(): processor->throttle_data() returned " << r << dendl;
+          }
           /* could not renew state! might have been marked as cancelled */
           return ret;
         }
@@ -3114,6 +3141,7 @@ int RGWRados::copy_obj(void *ctx,
                map<string, bufferlist>& attrs,
                RGWObjCategory category,
                string *ptag,
+               string *petag,
                struct rgw_err *err,
                void (*progress_cb)(off_t, void *),
                void *progress_data)
@@ -3210,6 +3238,10 @@ int RGWRados::copy_obj(void *ctx,
     if (ret < 0)
       goto set_err_state;
 
+    if (petag) {
+      *petag = etag;
+    }
+
     { /* opening scope so that we can do goto, sorry */
       bufferlist& extra_data_bl = processor.get_extra_data();
       if (extra_data_bl.length()) {
@@ -3275,6 +3307,10 @@ set_err_state:
     if (ret < 0)
       return ret;
 
+    if (petag) {
+      *petag = etag;
+    }
+
     return 0;
   }
   
@@ -3286,7 +3322,7 @@ set_err_state:
     return ret;
   }
 
-  bool copy_data = !astate->has_manifest;
+  bool copy_data = !astate->has_manifest || (src_obj.bucket.data_pool != dest_obj.bucket.data_pool);
   bool copy_first = false;
   if (astate->has_manifest) {
     if (!astate->manifest.has_tail()) {
@@ -3304,7 +3340,7 @@ set_err_state:
   }
 
   if (copy_data) { /* refcounting tail wouldn't work here, just copy the data */
-    return copy_obj_data(ctx, dest_bucket_info.owner, &handle, end, dest_obj, src_obj, max_chunk_size, mtime, src_attrs, category, ptag, err);
+    return copy_obj_data(ctx, dest_bucket_info.owner, &handle, end, dest_obj, src_obj, max_chunk_size, mtime, src_attrs, category, ptag, petag, err);
   }
 
   RGWObjManifest::obj_iterator miter = astate->manifest.obj_begin();
@@ -3383,6 +3419,14 @@ set_err_state:
   if (mtime)
     obj_stat(ctx, dest_obj, NULL, mtime, NULL, NULL, NULL, NULL);
 
+  if (petag) {
+    map<string, bufferlist>::iterator iter = src_attrs.find(RGW_ATTR_ETAG);
+    if (iter != src_attrs.end()) {
+      bufferlist& etagbl = iter->second;
+      *petag = string(etagbl.c_str(), etagbl.length());
+    }
+  }
+
   return 0;
 
 done_ret:
@@ -3419,23 +3463,23 @@ int RGWRados::copy_obj_data(void *ctx,
                map<string, bufferlist>& attrs,
                RGWObjCategory category,
                string *ptag,
+               string *petag,
                struct rgw_err *err)
 {
   bufferlist first_chunk;
   RGWObjManifest manifest;
   map<uint64_t, RGWObjManifestPart> objs;
-  RGWObjManifestPart *first_part;
-  map<string, bufferlist>::iterator iter;
 
-  rgw_obj shadow_obj = dest_obj;
-  string shadow_oid;
+  string tag;
+  append_rand_alpha(cct, tag, tag, 32);
 
-  append_rand_alpha(cct, dest_obj.object, shadow_oid, 32);
-  shadow_obj.init_ns(dest_obj.bucket, shadow_oid, shadow_ns);
+  RGWPutObjProcessor_Atomic processor(owner, dest_obj.bucket, dest_obj.object,
+                                      cct->_conf->rgw_obj_stripe_size, tag);
+  int ret = processor.prepare(this, ctx, NULL);
+  if (ret < 0)
+    return ret;
 
-  int ret, r;
   off_t ofs = 0;
-  PutObjMetaExtraParams ep;
 
   do {
     bufferlist bl;
@@ -3443,55 +3487,40 @@ int RGWRados::copy_obj_data(void *ctx,
     if (ret < 0)
       return ret;
 
-    const char *data = bl.c_str();
+    uint64_t read_len = ret;
+    bool again;
 
-    if ((uint64_t)ofs < max_chunk_size) {
-      uint64_t len = min(max_chunk_size - ofs, (uint64_t)ret);
-      first_chunk.append(data, len);
-      ofs += len;
-      ret -= len;
-      data += len;
-    }
+    do {
+      void *handle;
 
-    // In the first call to put_obj_data, we pass ofs == -1 so that it will do
-    // a write_full, wiping out whatever was in the object before this
-    r = 0;
-    if (ret > 0) {
-      r = put_obj_data(ctx, shadow_obj, data, ((ofs == 0) ? -1 : ofs), ret, false);
-    }
-    if (r < 0)
-      goto done_err;
+      ret = processor.handle_data(bl, ofs, NULL, &handle, &again);
+      if (ret < 0) {
+        return ret;
+      }
+      ret = processor.throttle_data(handle, false);
+      if (ret < 0)
+        return ret;
+    } while (again);
 
-    ofs += ret;
+    ofs += read_len;
   } while (ofs <= end);
 
-  first_part = &objs[0];
-  first_part->loc = dest_obj;
-  first_part->loc_ofs = 0;
-  first_part->size = first_chunk.length();
-
-  if ((uint64_t)ofs > max_chunk_size) {
-    RGWObjManifestPart& tail = objs[max_chunk_size];
-    tail.loc = shadow_obj;
-    tail.loc_ofs = max_chunk_size;
-    tail.size = ofs - max_chunk_size;
+  string etag;
+  map<string, bufferlist>::iterator iter = attrs.find(RGW_ATTR_ETAG);
+  if (iter != attrs.end()) {
+    bufferlist& bl = iter->second;
+    etag = string(bl.c_str(), bl.length());
+    if (petag) {
+      *petag = etag;
+    }
   }
 
-  manifest.set_explicit(ofs, objs);
-
-  ep.data = &first_chunk;
-  ep.manifest = &manifest;
-  ep.ptag = ptag;
-  ep.owner = owner;
+  ret = processor.complete(etag, NULL, 0, attrs);
 
-  ret = put_obj_meta(ctx, dest_obj, end + 1, attrs, category, PUT_OBJ_CREATE, ep);
   if (mtime)
     obj_stat(ctx, dest_obj, NULL, mtime, NULL, NULL, NULL, NULL);
 
   return ret;
-done_err:
-  delete_obj(ctx, owner, shadow_obj);
-  return r;
 }
 
 /**
@@ -4132,7 +4161,33 @@ int RGWRados::set_attrs(void *ctx, rgw_obj& obj,
   if (!op.size())
     return 0;
 
+  string tag;
+  if (state) {
+    r = prepare_update_index(state, bucket, CLS_RGW_OP_ADD, obj, tag);
+    if (r < 0)
+      return r;
+  }
+
   r = ref.ioctx.operate(ref.oid, &op);
+  if (state) {
+    if (r >= 0) {
+      bufferlist acl_bl = attrs[RGW_ATTR_ACL];
+      bufferlist etag_bl = attrs[RGW_ATTR_ETAG];
+      bufferlist content_type_bl = attrs[RGW_ATTR_CONTENT_TYPE];
+      string etag(etag_bl.c_str(), etag_bl.length());
+      string content_type(content_type_bl.c_str(), content_type_bl.length());
+      uint64_t epoch = ref.ioctx.get_last_version();
+      int64_t poolid = ref.ioctx.get_id();
+      utime_t mtime = ceph_clock_now(cct);
+      r = complete_update_index(bucket, obj.object, tag, poolid, epoch, state->size,
+                                mtime, etag, content_type, &acl_bl, RGW_OBJ_CATEGORY_MAIN, NULL);
+    } else {
+      int ret = complete_update_index_cancel(bucket, obj.object, tag);
+      if (ret < 0) {
+        ldout(cct, 0) << "ERROR: comlete_update_index_cancel() returned r=" << r << dendl;
+      }
+    }
+  }
   if (r < 0)
     return r;
 
@@ -4636,7 +4691,7 @@ int RGWRados::get_obj(void *ctx, RGWObjVersionTracker *objv_tracker, void **hand
     bl.append(read_bl);
 
 done:
-  if (bl.length() > 0) {
+  if (r >= 0) {
     r = bl.length();
   }
   if (r < 0 || !len || ((off_t)(ofs + len - 1) == end)) {
diff --git a/src/rgw/rgw_rados.h b/src/rgw/rgw_rados.h
index d811b49..ff161b8 100644
--- a/src/rgw/rgw_rados.h
+++ b/src/rgw/rgw_rados.h
@@ -298,6 +298,11 @@ public:
 
   bool has_tail() {
     if (explicit_objs) {
+      if (objs.size() == 1) {
+        map<uint64_t, RGWObjManifestPart>::iterator iter = objs.begin();
+        rgw_obj& obj = iter->second.loc;
+        return head_obj.object != obj.object;
+      }
       return (objs.size() >= 2);
     }
     return (obj_size > head_size);
@@ -548,9 +553,14 @@ public:
     obj_ctx = _o;
     return 0;
   };
-  virtual int handle_data(bufferlist& bl, off_t ofs, void **phandle, bool *again) = 0;
+  virtual int handle_data(bufferlist& bl, off_t ofs, MD5 *hash, void **phandle, bool *again) = 0;
   virtual int throttle_data(void *handle, bool need_to_wait) = 0;
+  virtual void complete_hash(MD5 *hash) {
+    assert(0);
+  }
   virtual int complete(string& etag, time_t *mtime, time_t set_mtime, map<string, bufferlist>& attrs);
+
+  CephContext *ctx();
 };
 
 class RGWPutObjProcessor_Plain : public RGWPutObjProcessor
@@ -564,7 +574,7 @@ class RGWPutObjProcessor_Plain : public RGWPutObjProcessor
 
 protected:
   int prepare(RGWRados *store, void *obj_ctx, string *oid_rand);
-  int handle_data(bufferlist& bl, off_t ofs, void **phandle, bool *again);
+  int handle_data(bufferlist& bl, off_t ofs, MD5 *hash /* NULL expected */, void **phandle, bool *again);
   int do_complete(string& etag, time_t *mtime, time_t set_mtime, map<string, bufferlist>& attrs);
 
 public:
@@ -654,7 +664,8 @@ public:
   void set_extra_data_len(uint64_t len) {
     extra_data_len = len;
   }
-  virtual int handle_data(bufferlist& bl, off_t ofs, void **phandle, bool *again);
+  virtual int handle_data(bufferlist& bl, off_t ofs, MD5 *hash, void **phandle, bool *again);
+  virtual void complete_hash(MD5 *hash);
   bufferlist& get_extra_data() { return extra_data_bl; }
 };
 
@@ -1555,6 +1566,7 @@ public:
                map<std::string, bufferlist>& attrs,
                RGWObjCategory category,
                string *ptag,
+               string *petag,
                struct rgw_err *err,
                void (*progress_cb)(off_t, void *),
                void *progress_data);
@@ -1569,6 +1581,7 @@ public:
                map<string, bufferlist>& attrs,
                RGWObjCategory category,
                string *ptag,
+               string *petag,
                struct rgw_err *err);
   /**
    * Delete a bucket.
diff --git a/src/rgw/rgw_rest.cc b/src/rgw/rgw_rest.cc
index b74002d..59026fb 100644
--- a/src/rgw/rgw_rest.cc
+++ b/src/rgw/rgw_rest.cc
@@ -244,7 +244,7 @@ void dump_content_length(struct req_state *s, uint64_t len)
   if (r < 0) {
     ldout(s->cct, 0) << "ERROR: s->cio->print() returned err=" << r << dendl;
   }
-  r = s->cio->print("Accept-Ranges: %s\n", "bytes");
+  r = s->cio->print("Accept-Ranges: %s\r\n", "bytes");
   if (r < 0) {
     ldout(s->cct, 0) << "ERROR: s->cio->print() returned err=" << r << dendl;
   }
@@ -254,9 +254,9 @@ void dump_etag(struct req_state *s, const char *etag)
 {
   int r;
   if (s->prot_flags & RGW_REST_SWIFT)
-    r = s->cio->print("etag: %s\n", etag);
+    r = s->cio->print("etag: %s\r\n", etag);
   else
-    r = s->cio->print("ETag: \"%s\"\n", etag);
+    r = s->cio->print("ETag: \"%s\"\r\n", etag);
   if (r < 0) {
     ldout(s->cct, 0) << "ERROR: s->cio->print() returned err=" << r << dendl;
   }
@@ -265,7 +265,7 @@ void dump_etag(struct req_state *s, const char *etag)
 void dump_pair(struct req_state *s, const char *key, const char *value)
 {
   if ( (strlen(key) > 0) && (strlen(value) > 0))
-    s->cio->print("%s: %s\n", key, value);
+    s->cio->print("%s: %s\r\n", key, value);
 }
 
 void dump_bucket_from_state(struct req_state *s)
@@ -273,16 +273,10 @@ void dump_bucket_from_state(struct req_state *s)
   int expose_bucket = g_conf->rgw_expose_bucket;
   if (expose_bucket) {
     if (!s->bucket_name_str.empty())
-      s->cio->print("Bucket: \"%s\"\n", s->bucket_name_str.c_str());
+      s->cio->print("Bucket: \"%s\"\r\n", s->bucket_name_str.c_str());
   }
 }
 
-void dump_object_from_state(struct req_state *s)
-{
-  if (!s->object_str.empty())
-    s->cio->print("Key: \"%s\"\n", s->object_str.c_str());
-}
-
 void dump_uri_from_state(struct req_state *s)
 {
   if (strcmp(s->info.request_uri.c_str(), "/") == 0) {
@@ -296,12 +290,12 @@ void dump_uri_from_state(struct req_state *s)
       location += "/";
       if (!s->object_str.empty()) {
         location += s->object_str;
-        s->cio->print("Location: %s\n", location.c_str());
+        s->cio->print("Location: %s\r\n", location.c_str());
       }
     }
   }
   else {
-    s->cio->print("Location: \"%s\"\n", s->info.request_uri.c_str());
+    s->cio->print("Location: \"%s\"\r\n", s->info.request_uri.c_str());
   }
 }
 
@@ -310,7 +304,7 @@ void dump_redirect(struct req_state *s, const string& redirect)
   if (redirect.empty())
     return;
 
-  s->cio->print("Location: %s\n", redirect.c_str());
+  s->cio->print("Location: %s\r\n", redirect.c_str());
 }
 
 static void dump_time_header(struct req_state *s, const char *name, time_t t)
@@ -325,7 +319,7 @@ static void dump_time_header(struct req_state *s, const char *name, time_t t)
   if (strftime(timestr, sizeof(timestr), "%a, %d %b %Y %H:%M:%S %Z", tmp) == 0)
     return;
 
-  int r = s->cio->print("%s: %s\n", name, timestr);
+  int r = s->cio->print("%s: %s\r\n", name, timestr);
   if (r < 0) {
     ldout(s->cct, 0) << "ERROR: s->cio->print() returned err=" << r << dendl;
   }
@@ -341,7 +335,7 @@ void dump_epoch_header(struct req_state *s, const char *name, time_t t)
   char buf[32];
   snprintf(buf, sizeof(buf), "%lld", (long long)t);
 
-  int r = s->cio->print("%s: %s\n", name, buf);
+  int r = s->cio->print("%s: %s\r\n", name, buf);
   if (r < 0) {
     ldout(s->cct, 0) << "ERROR: s->cio->print() returned err=" << r << dendl;
   }
@@ -374,16 +368,16 @@ void dump_owner(struct req_state *s, string& id, string& name, const char *secti
 void dump_access_control(struct req_state *s, const char *origin, const char *meth,
                          const char *hdr, const char *exp_hdr, uint32_t max_age) {
   if (origin && (origin[0] != '\0')) {
-    s->cio->print("Access-Control-Allow-Origin: %s\n", origin);
+    s->cio->print("Access-Control-Allow-Origin: %s\r\n", origin);
     if (meth && (meth[0] != '\0'))
-      s->cio->print("Access-Control-Allow-Methods: %s\n", meth);
+      s->cio->print("Access-Control-Allow-Methods: %s\r\n", meth);
     if (hdr && (hdr[0] != '\0'))
-      s->cio->print("Access-Control-Allow-Headers: %s\n", hdr);
+      s->cio->print("Access-Control-Allow-Headers: %s\r\n", hdr);
     if (exp_hdr && (exp_hdr[0] != '\0')) {
-      s->cio->print("Access-Control-Expose-Headers: %s\n", exp_hdr);
+      s->cio->print("Access-Control-Expose-Headers: %s\r\n", exp_hdr);
     }
     if (max_age != CORS_MAX_AGE_INVALID) {
-      s->cio->print("Access-Control-Max-Age: %d\n", max_age);
+      s->cio->print("Access-Control-Max-Age: %d\r\n", max_age);
     }
   }
 }
@@ -483,7 +477,7 @@ void dump_range(struct req_state *s, uint64_t ofs, uint64_t end, uint64_t total)
 
   /* dumping range into temp buffer first, as libfcgi will fail to digest %lld */
   snprintf(range_buf, sizeof(range_buf), "%lld-%lld/%lld", (long long)ofs, (long long)end, (long long)total);
-  int r = s->cio->print("Content-Range: bytes %s\n", range_buf);
+  int r = s->cio->print("Content-Range: bytes %s\r\n", range_buf);
   if (r < 0) {
     ldout(s->cct, 0) << "ERROR: s->cio->print() returned err=" << r << dendl;
   }
diff --git a/src/rgw/rgw_rest.h b/src/rgw/rgw_rest.h
index 38ffd8c..d42ec8d 100644
--- a/src/rgw/rgw_rest.h
+++ b/src/rgw/rgw_rest.h
@@ -369,7 +369,6 @@ extern void dump_continue(struct req_state *s);
 extern void list_all_buckets_end(struct req_state *s);
 extern void dump_time(struct req_state *s, const char *name, time_t *t);
 extern void dump_bucket_from_state(struct req_state *s);
-extern void dump_object_from_state(struct req_state *s);
 extern void dump_uri_from_state(struct req_state *s);
 extern void dump_redirect(struct req_state *s, const string& redirect);
 extern void dump_pair(struct req_state *s, const char *key, const char *value);
diff --git a/src/rgw/rgw_rest_s3.cc b/src/rgw/rgw_rest_s3.cc
index c7961f4..6fcecf7 100644
--- a/src/rgw/rgw_rest_s3.cc
+++ b/src/rgw/rgw_rest_s3.cc
@@ -161,7 +161,7 @@ done:
   dump_errno(s);
 
   for (riter = response_attrs.begin(); riter != response_attrs.end(); ++riter) {
-    s->cio->print("%s: %s\n", riter->first.c_str(), riter->second.c_str());
+    s->cio->print("%s: %s\r\n", riter->first.c_str(), riter->second.c_str());
   }
 
   if (!content_type)
@@ -303,9 +303,9 @@ static void dump_bucket_metadata(struct req_state *s, RGWBucketEnt& bucket)
 {
   char buf[32];
   snprintf(buf, sizeof(buf), "%lld", (long long)bucket.count);
-  s->cio->print("X-RGW-Object-Count: %s\n", buf);
+  s->cio->print("X-RGW-Object-Count: %s\r\n", buf);
   snprintf(buf, sizeof(buf), "%lld", (long long)bucket.size);
-  s->cio->print("X-RGW-Bytes-Used: %s\n", buf);
+  s->cio->print("X-RGW-Bytes-Used: %s\r\n", buf);
 }
 
 void RGWStatBucket_ObjStore_S3::send_response()
@@ -321,16 +321,16 @@ void RGWStatBucket_ObjStore_S3::send_response()
   dump_start(s);
 }
 
-static int create_s3_policy(struct req_state *s, RGWRados *store, RGWAccessControlPolicy_S3& s3policy)
+static int create_s3_policy(struct req_state *s, RGWRados *store, RGWAccessControlPolicy_S3& s3policy, ACLOwner& owner)
 {
   if (s->has_acl_header) {
     if (!s->canned_acl.empty())
       return -ERR_INVALID_REQUEST;
 
-    return s3policy.create_from_headers(store, s->info.env, s->owner);
+    return s3policy.create_from_headers(store, s->info.env, owner);
   }
 
-  return s3policy.create_canned(s->owner, s->bucket_owner, s->canned_acl);
+  return s3policy.create_canned(owner, s->bucket_owner, s->canned_acl);
 }
 
 class RGWLocationConstraint : public XMLObj
@@ -386,7 +386,7 @@ int RGWCreateBucket_ObjStore_S3::get_params()
 {
   RGWAccessControlPolicy_S3 s3policy(s->cct);
 
-  int r = create_s3_policy(s, store, s3policy);
+  int r = create_s3_policy(s, store, s3policy, s->owner);
   if (r < 0)
     return r;
 
@@ -487,7 +487,7 @@ int RGWPutObj_ObjStore_S3::get_params()
   if (!s->length)
     return -ERR_LENGTH_REQUIRED;
 
-  int r = create_s3_policy(s, store, s3policy);
+  int r = create_s3_policy(s, store, s3policy, s->owner);
   if (r < 0)
     return r;
 
@@ -1198,7 +1198,7 @@ int RGWCopyObj_ObjStore_S3::init_dest_policy()
   RGWAccessControlPolicy_S3 s3policy(s->cct);
 
   /* build a policy for the target object */
-  int r = create_s3_policy(s, store, s3policy);
+  int r = create_s3_policy(s, store, s3policy, s->owner);
   if (r < 0)
     return r;
 
@@ -1264,7 +1264,7 @@ void RGWCopyObj_ObjStore_S3::send_partial_response(off_t ofs)
     set_req_state_err(s, ret);
     dump_errno(s);
 
-    end_header(s, this, "binary/octet-stream");
+    end_header(s, this, "application/xml");
     if (ret == 0) {
       s->formatter->open_object_section("CopyObjectResult");
     }
@@ -1285,13 +1285,8 @@ void RGWCopyObj_ObjStore_S3::send_response()
 
   if (ret == 0) {
     dump_time(s, "LastModified", &mtime);
-    map<string, bufferlist>::iterator iter = attrs.find(RGW_ATTR_ETAG);
-    if (iter != attrs.end()) {
-      bufferlist& bl = iter->second;
-      if (bl.length()) {
-        char *etag = bl.c_str();
-        s->formatter->dump_string("ETag", etag);
-      }
+    if (!etag.empty()) {
+      s->formatter->dump_string("ETag", etag);
     }
     s->formatter->close_section();
     rgw_flush_formatter_and_reset(s, s->formatter);
@@ -1318,7 +1313,7 @@ int RGWPutACLs_ObjStore_S3::get_policy_from_state(RGWRados *store, struct req_st
       s->canned_acl.clear();
   }
 
-  int r = create_s3_policy(s, store, s3policy);
+  int r = create_s3_policy(s, store, s3policy, owner);
   if (r < 0)
     return r;
 
@@ -1460,7 +1455,7 @@ void RGWOptionsCORS_ObjStore_S3::send_response()
 int RGWInitMultipart_ObjStore_S3::get_params()
 {
   RGWAccessControlPolicy_S3 s3policy(s->cct);
-  ret = create_s3_policy(s, store, s3policy);
+  ret = create_s3_policy(s, store, s3policy, s->owner);
   if (ret < 0)
     return ret;
 
@@ -2047,6 +2042,12 @@ int RGW_Auth_S3_Keystone_ValidateToken::validate_s3token(const string& auth_id,
   return 0;
 }
 
+static void init_anon_user(struct req_state *s)
+{
+  rgw_get_anon_user(s->user);
+  s->perm_mask = RGW_PERM_FULL_CONTROL;
+}
+
 /*
  * verify that a signed request comes from the keyholder
  * by checking the signature against our locally-computed version
@@ -2067,6 +2068,11 @@ int RGW_Auth_S3::authorize(RGWRados *store, struct req_state *s)
     return -EPERM;
   }
 
+  if (s->op == OP_OPTIONS) {
+    init_anon_user(s);
+    return 0;
+  }
+
   if (!s->http_auth || !(*s->http_auth)) {
     auth_id = s->info.args.get("AWSAccessKeyId");
     if (auth_id.size()) {
@@ -2080,8 +2086,7 @@ int RGW_Auth_S3::authorize(RGWRados *store, struct req_state *s)
       qsr = true;
     } else {
       /* anonymous access */
-      rgw_get_anon_user(s->user);
-      s->perm_mask = RGW_PERM_FULL_CONTROL;
+      init_anon_user(s);
       return 0;
     }
   } else {
diff --git a/src/rgw/rgw_rest_swift.cc b/src/rgw/rgw_rest_swift.cc
index b562079..36544db 100644
--- a/src/rgw/rgw_rest_swift.cc
+++ b/src/rgw/rgw_rest_swift.cc
@@ -223,11 +223,11 @@ static void dump_container_metadata(struct req_state *s, RGWBucketEnt& bucket)
 {
   char buf[32];
   snprintf(buf, sizeof(buf), "%lld", (long long)bucket.count);
-  s->cio->print("X-Container-Object-Count: %s\n", buf);
+  s->cio->print("X-Container-Object-Count: %s\r\n", buf);
   snprintf(buf, sizeof(buf), "%lld", (long long)bucket.size);
-  s->cio->print("X-Container-Bytes-Used: %s\n", buf);
+  s->cio->print("X-Container-Bytes-Used: %s\r\n", buf);
   snprintf(buf, sizeof(buf), "%lld", (long long)bucket.size_rounded);
-  s->cio->print("X-Container-Bytes-Used-Actual: %s\n", buf);
+  s->cio->print("X-Container-Bytes-Used-Actual: %s\r\n", buf);
 
   if (!s->object) {
     RGWAccessControlPolicy_SWIFT *swift_policy = static_cast<RGWAccessControlPolicy_SWIFT *>(s->bucket_acl);
@@ -247,13 +247,13 @@ static void dump_account_metadata(struct req_state *s, uint32_t buckets_count,
 {
   char buf[32];
   snprintf(buf, sizeof(buf), "%lld", (long long)buckets_count);
-  s->cio->print("X-Account-Container-Count: %s\n", buf);
+  s->cio->print("X-Account-Container-Count: %s\r\n", buf);
   snprintf(buf, sizeof(buf), "%lld", (long long)buckets_object_count);
-  s->cio->print("X-Account-Object-Count: %s\n", buf);
+  s->cio->print("X-Account-Object-Count: %s\r\n", buf);
   snprintf(buf, sizeof(buf), "%lld", (long long)buckets_size);
-  s->cio->print("X-Account-Bytes-Used: %s\n", buf);
+  s->cio->print("X-Account-Bytes-Used: %s\r\n", buf);
   snprintf(buf, sizeof(buf), "%lld", (long long)buckets_size_rounded);
-  s->cio->print("X-Account-Bytes-Used-Actual: %s\n", buf);
+  s->cio->print("X-Account-Bytes-Used-Actual: %s\r\n", buf);
 }
 
 void RGWStatAccount_ObjStore_SWIFT::send_response()
@@ -552,7 +552,6 @@ void RGWCopyObj_ObjStore_SWIFT::send_response()
 int RGWGetObj_ObjStore_SWIFT::send_response_data(bufferlist& bl, off_t bl_ofs, off_t bl_len)
 {
   const char *content_type = NULL;
-  int orig_ret = ret;
   map<string, string> response_attrs;
   map<string, string>::iterator riter;
 
@@ -593,15 +592,11 @@ int RGWGetObj_ObjStore_SWIFT::send_response_data(bufferlist& bl, off_t bl_ofs, o
     }
   }
 
-  if (partial_content && !ret)
-    ret = -STATUS_PARTIAL_CONTENT;
-
-  if (ret)
-    set_req_state_err(s, ret);
+  set_req_state_err(s, (partial_content && !ret) ? STATUS_PARTIAL_CONTENT : ret);
   dump_errno(s);
 
   for (riter = response_attrs.begin(); riter != response_attrs.end(); ++riter) {
-    s->cio->print("%s: %s\n", riter->first.c_str(), riter->second.c_str());
+    s->cio->print("%s: %s\r\n", riter->first.c_str(), riter->second.c_str());
   }
 
   if (!content_type)
@@ -611,7 +606,7 @@ int RGWGetObj_ObjStore_SWIFT::send_response_data(bufferlist& bl, off_t bl_ofs, o
   sent_header = true;
 
 send_data:
-  if (get_data && !orig_ret) {
+  if (get_data && !ret) {
     int r = s->cio->write(bl.c_str() + bl_ofs, bl_len);
     if (r < 0)
       return r;
@@ -776,7 +771,8 @@ RGWOp *RGWHandler_ObjStore_Obj_SWIFT::op_options()
 
 int RGWHandler_ObjStore_SWIFT::authorize()
 {
-  if (!s->os_auth_token && s->info.args.get("temp_url_sig").empty()) {
+  if ((!s->os_auth_token && s->info.args.get("temp_url_sig").empty()) ||
+      (s->op == OP_OPTIONS)) {
     /* anonymous access */
     rgw_get_anon_user(s->user);
     s->perm_mask = RGW_PERM_FULL_CONTROL;
@@ -787,8 +783,6 @@ int RGWHandler_ObjStore_SWIFT::authorize()
   if (!authorized)
     return -EPERM;
 
-  s->perm_mask = RGW_PERM_FULL_CONTROL;
-
   return 0;
 }
 
diff --git a/src/rgw/rgw_swift.cc b/src/rgw/rgw_swift.cc
index d9654a7..46c45bd 100644
--- a/src/rgw/rgw_swift.cc
+++ b/src/rgw/rgw_swift.cc
@@ -505,6 +505,8 @@ int RGWSwift::validate_keystone_token(RGWRados *store, const string& token, stru
 
     validate.append_header("X-Auth-Token", admin_token);
 
+    validate.set_send_length(0);
+
     int ret = validate.process(url.c_str());
     if (ret < 0)
       return ret;
@@ -609,13 +611,41 @@ int authenticate_temp_url(RGWRados *store, req_state *s)
 
 bool RGWSwift::verify_swift_token(RGWRados *store, req_state *s)
 {
+  if (!do_verify_swift_token(store, s)) {
+    return false;
+  }
+
+  if (!s->swift_user.empty()) {
+    string subuser;
+    ssize_t pos = s->swift_user.find(':');
+    if (pos < 0) {
+      subuser = s->swift_user;
+    } else {
+      subuser = s->swift_user.substr(pos + 1);
+    }
+    s->perm_mask = 0;
+    map<string, RGWSubUser>::iterator iter = s->user.subusers.find(subuser);
+    if (iter != s->user.subusers.end()) {
+      RGWSubUser& subuser = iter->second;
+      s->perm_mask = subuser.perm_mask;
+    }
+  } else {
+    s->perm_mask = RGW_PERM_FULL_CONTROL;
+  }
+
+  return true;
+
+}
+
+bool RGWSwift::do_verify_swift_token(RGWRados *store, req_state *s)
+{
   if (!s->os_auth_token) {
     int ret = authenticate_temp_url(store, s);
     return (ret >= 0);
   }
 
   if (strncmp(s->os_auth_token, "AUTH_rgwtk", 10) == 0) {
-    int ret = rgw_swift_verify_signed_token(s->cct, store, s->os_auth_token, s->user);
+    int ret = rgw_swift_verify_signed_token(s->cct, store, s->os_auth_token, s->user, &s->swift_user);
     if (ret < 0)
       return false;
 
diff --git a/src/rgw/rgw_swift.h b/src/rgw/rgw_swift.h
index 97347e8..300b5eb 100644
--- a/src/rgw/rgw_swift.h
+++ b/src/rgw/rgw_swift.h
@@ -53,6 +53,7 @@ class RGWSwift {
   bool supports_keystone() {
     return !cct->_conf->rgw_keystone_url.empty();
   }
+  bool do_verify_swift_token(RGWRados *store, req_state *s);
 protected:
   int check_revoked();
 public:
diff --git a/src/rgw/rgw_swift_auth.cc b/src/rgw/rgw_swift_auth.cc
index 9c800c4..1384bb0 100644
--- a/src/rgw/rgw_swift_auth.cc
+++ b/src/rgw/rgw_swift_auth.cc
@@ -56,7 +56,7 @@ static int encode_token(CephContext *cct, string& swift_user, string& key, buffe
   return ret;
 }
 
-int rgw_swift_verify_signed_token(CephContext *cct, RGWRados *store, const char *token, RGWUserInfo& info)
+int rgw_swift_verify_signed_token(CephContext *cct, RGWRados *store, const char *token, RGWUserInfo& info, string *pswift_user)
 {
   if (strncmp(token, "AUTH_rgwtk", 10) != 0)
     return -EINVAL;
@@ -123,6 +123,7 @@ int rgw_swift_verify_signed_token(CephContext *cct, RGWRados *store, const char
     dout(0) << "NOTICE: tokens mismatch tok=" << buf << dendl;
     return -EPERM;
   }
+  *pswift_user = swift_user;
 
   return 0;
 }
@@ -205,7 +206,7 @@ void RGW_SWIFT_Auth_Get::execute()
     tenant_path.append(g_conf->rgw_swift_tenant_name);
   }
 
-  s->cio->print("X-Storage-Url: %s/%s/v1%s\n", swift_url.c_str(),
+  s->cio->print("X-Storage-Url: %s/%s/v1%s\r\n", swift_url.c_str(),
 	        swift_prefix.c_str(), tenant_path.c_str());
 
   if ((ret = encode_token(s->cct, swift_key->id, swift_key->key, bl)) < 0)
@@ -215,8 +216,8 @@ void RGW_SWIFT_Auth_Get::execute()
     char buf[bl.length() * 2 + 1];
     buf_to_hex((const unsigned char *)bl.c_str(), bl.length(), buf);
 
-    s->cio->print("X-Storage-Token: AUTH_rgwtk%s\n", buf);
-    s->cio->print("X-Auth-Token: AUTH_rgwtk%s\n", buf);
+    s->cio->print("X-Storage-Token: AUTH_rgwtk%s\r\n", buf);
+    s->cio->print("X-Auth-Token: AUTH_rgwtk%s\r\n", buf);
   }
 
   ret = STATUS_NO_CONTENT;
diff --git a/src/rgw/rgw_swift_auth.h b/src/rgw/rgw_swift_auth.h
index 6d50d94..61050d8 100644
--- a/src/rgw/rgw_swift_auth.h
+++ b/src/rgw/rgw_swift_auth.h
@@ -6,7 +6,7 @@
 
 #define RGW_SWIFT_TOKEN_EXPIRATION (15 * 60)
 
-extern int rgw_swift_verify_signed_token(CephContext *cct, RGWRados *store, const char *token, RGWUserInfo& info);
+extern int rgw_swift_verify_signed_token(CephContext *cct, RGWRados *store, const char *token, RGWUserInfo& info, string *pswift_user);
 
 class RGW_SWIFT_Auth_Get : public RGWOp {
 public:
diff --git a/src/rgw/rgw_user.cc b/src/rgw/rgw_user.cc
index 23575d8..55d1d6a 100644
--- a/src/rgw/rgw_user.cc
+++ b/src/rgw/rgw_user.cc
@@ -666,15 +666,6 @@ bool RGWAccessKeyPool::check_existing_key(RGWUserAdminOpState& op_state)
 
   switch (key_type) {
   case KEY_TYPE_SWIFT:
-    kiter = swift_keys->find(kid);
-
-    existing_key = (kiter != swift_keys->end());
-    if (existing_key)
-      break;
-
-    if (swift_kid.empty())
-      return false;
-
     kiter = swift_keys->find(swift_kid);
 
     existing_key = (kiter != swift_keys->end());
@@ -845,7 +836,7 @@ int RGWAccessKeyPool::generate_key(RGWUserAdminOpState& op_state, std::string *e
     } while (!rgw_get_user_info_by_access_key(store, id, duplicate_check));
   }
 
-  if (key_type == KEY_TYPE_SWIFT && gen_access) {
+  if (key_type == KEY_TYPE_SWIFT) {
     id = op_state.build_default_swift_kid();
     if (id.empty()) {
       set_err_msg(err_msg, "empty swift access key");
diff --git a/src/test/Makefile.am b/src/test/Makefile.am
index f527b96..69f9e84 100644
--- a/src/test/Makefile.am
+++ b/src/test/Makefile.am
@@ -368,6 +368,9 @@ unittest_pglog_SOURCES = test/osd/TestPGLog.cc
 unittest_pglog_CXXFLAGS = $(UNITTEST_CXXFLAGS)
 unittest_pglog_LDADD = $(LIBOSD) $(UNITTEST_LDADD) $(CEPH_GLOBAL)
 check_PROGRAMS += unittest_pglog
+if LINUX
+unittest_pglog_LDADD += -ldl
+endif # LINUX
 
 unittest_ecbackend_SOURCES = test/osd/TestECBackend.cc
 unittest_ecbackend_CXXFLAGS = $(UNITTEST_CXXFLAGS)
@@ -379,9 +382,10 @@ unittest_hitset_CXXFLAGS = $(UNITTEST_CXXFLAGS)
 unittest_hitset_LDADD = $(LIBOSD) $(UNITTEST_LDADD) $(CEPH_GLOBAL)
 check_PROGRAMS += unittest_hitset
 
-if LINUX
-unittest_pglog_LDADD += -ldl
-endif # LINUX
+unittest_io_priority_SOURCES = test/common/test_io_priority.cc
+unittest_io_priority_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+unittest_io_priority_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
+check_PROGRAMS += unittest_io_priority
 
 unittest_gather_SOURCES = test/gather.cc
 unittest_gather_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
diff --git a/src/test/cli/crushtool/add-item-in-tree.t b/src/test/cli/crushtool/add-item-in-tree.t
new file mode 100644
index 0000000..8790977
--- /dev/null
+++ b/src/test/cli/crushtool/add-item-in-tree.t
@@ -0,0 +1,10 @@
+  $ crushtool -i "$TESTDIR/tree.template" --add-item 0 1.0 device0 --loc host host0 --loc cluster cluster0 -o one > /dev/null
+  $ crushtool -i one   --add-item 1 1.0 device1 --loc host host0 --loc cluster cluster0 -o two   > /dev/null
+  $ crushtool -i two   --add-item 2 1.0 device2 --loc host host0 --loc cluster cluster0 -o tree  > /dev/null
+  $ crushtool -i tree  --add-item 3 1.0 device3 --loc host host0 --loc cluster cluster0 -o four  > /dev/null
+  $ crushtool -i four  --add-item 4 1.0 device4 --loc host host0 --loc cluster cluster0 -o five  > /dev/null
+  $ crushtool -i five  --add-item 5 1.0 device5 --loc host host0 --loc cluster cluster0 -o six   > /dev/null
+  $ crushtool -i six   --add-item 6 1.0 device6 --loc host host0 --loc cluster cluster0 -o seven > /dev/null
+  $ crushtool -i seven --add-item 7 1.0 device7 --loc host host0 --loc cluster cluster0 -o eight > /dev/null
+  $ crushtool -d eight -o final
+  $ cmp final "$TESTDIR/tree.template.final"
diff --git a/src/test/cli/crushtool/adjust-item-weight.t b/src/test/cli/crushtool/adjust-item-weight.t
new file mode 100644
index 0000000..16d7135
--- /dev/null
+++ b/src/test/cli/crushtool/adjust-item-weight.t
@@ -0,0 +1,17 @@
+  $ crushtool -i "$TESTDIR/simple.template" --add-item 0 1.0 device0 --loc host host0 --loc cluster cluster0 -o one > /dev/null
+
+#
+# add device0 into host=fake, the weight of device0 in host=host0 is 1.0, the weight of device0 in host=fake is 2.0
+#
+
+  $ crushtool -i one --add-item 0 2.0 device0 --loc host fake --loc cluster cluster0 -o two > /dev/null
+  $ crushtool -d two -o final
+  $ cmp final "$TESTDIR/simple.template.adj.two"
+
+#
+# update the weight of device0 in host=host0, it will not affect the weight of device0 in host=fake
+#
+
+  $ crushtool -i two --update-item 0 3.0 device0 --loc host host0 --loc cluster cluster0 -o three > /dev/null
+  $ crushtool -d three -o final
+  $ cmp final "$TESTDIR/simple.template.adj.three"
diff --git a/src/test/cli/crushtool/build.t b/src/test/cli/crushtool/build.t
index ca0804d..f016737 100644
--- a/src/test/cli/crushtool/build.t
+++ b/src/test/cli/crushtool/build.t
@@ -52,7 +52,7 @@
 #
 # crush rulesets are generated using the OSDMap helpers
 #
-  $ CEPH_ARGS="--debug-crush 0" crushtool --outfn "$map" --build --num_osds 1 root straw 0
+  $ CEPH_ARGS="--debug-crush 0" crushtool --outfn "$map" --set-straw-calc-version 0 --build --num_osds 1 root straw 0
   $ crushtool -o "$map.txt" -d "$map"
   $ cat "$map.txt"
   # begin crush map
diff --git a/src/test/cli/crushtool/help.t b/src/test/cli/crushtool/help.t
index 3b48930..a9c6222 100644
--- a/src/test/cli/crushtool/help.t
+++ b/src/test/cli/crushtool/help.t
@@ -33,6 +33,7 @@
      --show utilization-all
                            include zero weight items
      --show-statistics     show chi squared statistics
+     --show-mappings       show mappings
      --show-bad-mappings   show bad mappings
      --show-choose-tries   show choose tries histogram
      --set-choose-local-tries N
diff --git a/src/test/cli/crushtool/set-choose.t b/src/test/cli/crushtool/set-choose.t
index e160ad7..b40494d 100644
--- a/src/test/cli/crushtool/set-choose.t
+++ b/src/test/cli/crushtool/set-choose.t
@@ -1,5 +1,6 @@
   $ crushtool -c "$TESTDIR/set-choose.crushmap.txt" -o set-choose.crushmap
-  $ crushtool -i set-choose.crushmap --test --show-statistics
+  $ crushtool -i set-choose.crushmap --test --show-mappings --show-statistics --set-straw-calc-version 0
+  crushtool successfully built or modified map.  Use '-o <file>' to write it out.
   rule 0 (choose), x = 0..1023, numrep = 2..3
   CRUSH rule 0 x 0 [0,3]
   CRUSH rule 0 x 1 [0,8]
@@ -12306,7 +12307,8 @@
   CRUSH rule 5 x 1022 [1,6,4]
   CRUSH rule 5 x 1023 [3,2,8]
   rule 5 (chooseleaf-set) num_rep 3 result size == 3:\t1024/1024 (esc)
-  $ crushtool -i set-choose.crushmap --test --show-statistics --weight 0 0 --weight 1 0 --weight 3 0 --weight 4 0
+  $ crushtool -i set-choose.crushmap --test --show-mappings --show-statistics --weight 0 0 --weight 1 0 --weight 3 0 --weight 4 0 --set-straw-calc-version 0
+  crushtool successfully built or modified map.  Use '-o <file>' to write it out.
   rule 0 (choose), x = 0..1023, numrep = 2..3
   CRUSH rule 0 x 0 [2,5]
   CRUSH rule 0 x 1 [2,8]
@@ -24618,7 +24620,8 @@
   CRUSH rule 5 x 1022 [2,6,5]
   CRUSH rule 5 x 1023 [5,2,8]
   rule 5 (chooseleaf-set) num_rep 3 result size == 3:\t1024/1024 (esc)
-  $ crushtool -i set-choose.crushmap --test --show-statistics --weight 0 0 --weight 3 0 --weight 4 .5 --weight 5 0 --weight 6 .1 --weight 7 0
+  $ crushtool -i set-choose.crushmap --test --show-mappings --show-statistics --weight 0 0 --weight 3 0 --weight 4 .5 --weight 5 0 --weight 6 .1 --weight 7 0 --set-straw-calc-version 0
+  crushtool successfully built or modified map.  Use '-o <file>' to write it out.
   rule 0 (choose), x = 0..1023, numrep = 2..3
   CRUSH rule 0 x 0 [2,4]
   CRUSH rule 0 x 1 [2,8]
diff --git a/src/test/cli/crushtool/simple.template.adj.one b/src/test/cli/crushtool/simple.template.adj.one
new file mode 100644
index 0000000..aa16bbd
--- /dev/null
+++ b/src/test/cli/crushtool/simple.template.adj.one
@@ -0,0 +1,56 @@
+# begin crush map
+
+# devices
+device 0 device0
+
+# types
+type 0 device
+type 1 host
+type 2 cluster
+
+# buckets
+host host0 {
+	id -2		# do not change unnecessarily
+	# weight 1.000
+	alg straw
+	hash 0	# rjenkins1
+	item device0 weight 1.000
+}
+cluster cluster0 {
+	id -1		# do not change unnecessarily
+	# weight 1.000
+	alg straw
+	hash 0	# rjenkins1
+	item host0 weight 1.000
+}
+
+# rules
+rule data {
+	ruleset 0
+	type replicated
+	min_size 1
+	max_size 10
+	step take cluster0
+	step chooseleaf firstn 0 type host
+	step emit
+}
+rule metadata {
+	ruleset 1
+	type replicated
+	min_size 1
+	max_size 10
+	step take cluster0
+	step chooseleaf firstn 0 type host
+	step emit
+}
+rule rbd {
+	ruleset 2
+	type replicated
+	min_size 1
+	max_size 10
+	step take cluster0
+	step chooseleaf firstn 0 type host
+	step emit
+}
+
+# end crush map
diff --git a/src/test/cli/crushtool/simple.template.adj.three b/src/test/cli/crushtool/simple.template.adj.three
new file mode 100644
index 0000000..fca0fe1
--- /dev/null
+++ b/src/test/cli/crushtool/simple.template.adj.three
@@ -0,0 +1,64 @@
+# begin crush map
+
+# devices
+device 0 device0
+
+# types
+type 0 device
+type 1 host
+type 2 cluster
+
+# buckets
+host host0 {
+	id -2		# do not change unnecessarily
+	# weight 3.000
+	alg straw
+	hash 0	# rjenkins1
+	item device0 weight 3.000
+}
+host fake {
+	id -3		# do not change unnecessarily
+	# weight 2.000
+	alg straw
+	hash 0	# rjenkins1
+	item device0 weight 2.000
+}
+cluster cluster0 {
+	id -1		# do not change unnecessarily
+	# weight 5.000
+	alg straw
+	hash 0	# rjenkins1
+	item host0 weight 3.000
+	item fake weight 2.000
+}
+
+# rules
+rule data {
+	ruleset 0
+	type replicated
+	min_size 1
+	max_size 10
+	step take cluster0
+	step chooseleaf firstn 0 type host
+	step emit
+}
+rule metadata {
+	ruleset 1
+	type replicated
+	min_size 1
+	max_size 10
+	step take cluster0
+	step chooseleaf firstn 0 type host
+	step emit
+}
+rule rbd {
+	ruleset 2
+	type replicated
+	min_size 1
+	max_size 10
+	step take cluster0
+	step chooseleaf firstn 0 type host
+	step emit
+}
+
+# end crush map
diff --git a/src/test/cli/crushtool/simple.template.adj.two b/src/test/cli/crushtool/simple.template.adj.two
new file mode 100644
index 0000000..21c29a3
--- /dev/null
+++ b/src/test/cli/crushtool/simple.template.adj.two
@@ -0,0 +1,64 @@
+# begin crush map
+
+# devices
+device 0 device0
+
+# types
+type 0 device
+type 1 host
+type 2 cluster
+
+# buckets
+host host0 {
+	id -2		# do not change unnecessarily
+	# weight 1.000
+	alg straw
+	hash 0	# rjenkins1
+	item device0 weight 1.000
+}
+host fake {
+	id -3		# do not change unnecessarily
+	# weight 2.000
+	alg straw
+	hash 0	# rjenkins1
+	item device0 weight 2.000
+}
+cluster cluster0 {
+	id -1		# do not change unnecessarily
+	# weight 3.000
+	alg straw
+	hash 0	# rjenkins1
+	item host0 weight 1.000
+	item fake weight 2.000
+}
+
+# rules
+rule data {
+	ruleset 0
+	type replicated
+	min_size 1
+	max_size 10
+	step take cluster0
+	step chooseleaf firstn 0 type host
+	step emit
+}
+rule metadata {
+	ruleset 1
+	type replicated
+	min_size 1
+	max_size 10
+	step take cluster0
+	step chooseleaf firstn 0 type host
+	step emit
+}
+rule rbd {
+	ruleset 2
+	type replicated
+	min_size 1
+	max_size 10
+	step take cluster0
+	step chooseleaf firstn 0 type host
+	step emit
+}
+
+# end crush map
diff --git a/src/test/cli/crushtool/test-map-bobtail-tunables.t b/src/test/cli/crushtool/test-map-bobtail-tunables.t
index 7c38260..77f2ce8 100644
--- a/src/test/cli/crushtool/test-map-bobtail-tunables.t
+++ b/src/test/cli/crushtool/test-map-bobtail-tunables.t
@@ -1,4 +1,4 @@
-  $ crushtool -i "$TESTDIR/test-map-a.crushmap" --test --show-statistics --rule 0 --set-choose-local-tries 0 --set-choose-local-fallback-tries 0 --set-choose-total-tries 50 --set-chooseleaf-descend-once 1
+  $ crushtool -i "$TESTDIR/test-map-a.crushmap" --test --show-mappings --show-statistics --rule 0 --set-choose-local-tries 0 --set-choose-local-fallback-tries 0 --set-choose-total-tries 50 --set-chooseleaf-descend-once 1
   crushtool successfully built or modified map.  Use '-o <file>' to write it out.
   rule 0 (data), x = 0..1023, numrep = 1..10
   CRUSH rule 0 x 0 [36]
diff --git a/src/test/cli/crushtool/test-map-firefly-tunables.t b/src/test/cli/crushtool/test-map-firefly-tunables.t
index 481b6fd..a75e89f 100644
--- a/src/test/cli/crushtool/test-map-firefly-tunables.t
+++ b/src/test/cli/crushtool/test-map-firefly-tunables.t
@@ -1,4 +1,4 @@
-  $ crushtool -i "$TESTDIR/test-map-vary-r.crushmap" --test --show-statistics --rule 0 --set-choose-local-tries 0 --set-choose-local-fallback-tries 0 --set-choose-total-tries 50 --set-chooseleaf-descend-once 1 --set-chooseleaf-vary-r 1 --weight 12 0 --weight 20 0 --weight 30 0
+  $ crushtool -i "$TESTDIR/test-map-vary-r.crushmap" --test --show-mappings --show-statistics --rule 0 --set-choose-local-tries 0 --set-choose-local-fallback-tries 0 --set-choose-total-tries 50 --set-chooseleaf-descend-once 1 --set-chooseleaf-vary-r 1 --weight 12 0 --weight 20 0 --weight 30 0
   crushtool successfully built or modified map.  Use '-o <file>' to write it out.
   rule 0 (data), x = 0..1023, numrep = 1..10
   CRUSH rule 0 x 0 [101]
diff --git a/src/test/cli/crushtool/test-map-firstn-indep.t b/src/test/cli/crushtool/test-map-firstn-indep.t
new file mode 100644
index 0000000..1b8c736
--- /dev/null
+++ b/src/test/cli/crushtool/test-map-firstn-indep.t
@@ -0,0 +1,14 @@
+  $ crushtool -c "$TESTDIR/test-map-firstn-indep.txt" -o "$TESTDIR/test-map-firstn-indep.crushmap"
+  $ crushtool -i "$TESTDIR/test-map-firstn-indep.crushmap" --test --rule 0 --x 1 --show-bad-mappings
+  bad mapping rule 0 x 1 num_rep 9 result [93,80,88,87,56,50,53,72]
+  bad mapping rule 0 x 1 num_rep 10 result [93,80,88,87,56,50,53,72]
+  $ crushtool -i "$TESTDIR/test-map-firstn-indep.crushmap" --test --rule 1 --x 1 --show-bad-mappings
+  bad mapping rule 1 x 1 num_rep 3 result [93,56]
+  bad mapping rule 1 x 1 num_rep 4 result [93,56]
+  bad mapping rule 1 x 1 num_rep 5 result [93,56]
+  bad mapping rule 1 x 1 num_rep 6 result [93,56]
+  bad mapping rule 1 x 1 num_rep 7 result [93,56]
+  bad mapping rule 1 x 1 num_rep 8 result [93,56]
+  bad mapping rule 1 x 1 num_rep 9 result [93,56]
+  bad mapping rule 1 x 1 num_rep 10 result [93,56]
+  $ rm -f "$TESTDIR/test-map-firstn-indep.crushmap"
diff --git a/src/test/cli/crushtool/test-map-firstn-indep.txt b/src/test/cli/crushtool/test-map-firstn-indep.txt
new file mode 100644
index 0000000..4534eab
--- /dev/null
+++ b/src/test/cli/crushtool/test-map-firstn-indep.txt
@@ -0,0 +1,443 @@
+# begin crush map
+tunable choose_local_tries 0
+tunable choose_local_fallback_tries 0
+tunable choose_total_tries 50
+tunable chooseleaf_descend_once 1
+
+# devices
+device 0 device0
+device 1 device1
+device 2 device2
+device 3 device3
+device 4 device4
+device 5 device5
+device 6 device6
+device 7 device7
+device 8 device8
+device 9 device9
+device 10 device10
+device 11 device11
+device 12 device12
+device 13 device13
+device 14 device14
+device 15 device15
+device 16 device16
+device 17 device17
+device 18 device18
+device 19 device19
+device 20 device20
+device 21 device21
+device 22 device22
+device 23 device23
+device 24 device24
+device 25 device25
+device 26 device26
+device 27 device27
+device 28 device28
+device 29 device29
+device 30 device30
+device 31 device31
+device 32 device32
+device 33 device33
+device 34 device34
+device 35 device35
+device 36 device36
+device 37 device37
+device 38 device38
+device 39 device39
+device 40 device40
+device 41 device41
+device 42 device42
+device 43 device43
+device 44 device44
+device 45 device45
+device 46 device46
+device 47 device47
+device 48 device48
+device 49 device49
+device 50 device50
+device 51 device51
+device 52 device52
+device 53 device53
+device 54 device54
+device 55 device55
+device 56 device56
+device 57 device57
+device 58 device58
+device 59 device59
+device 60 device60
+device 61 device61
+device 62 device62
+device 63 device63
+device 64 device64
+device 65 device65
+device 66 device66
+device 67 device67
+device 68 device68
+device 69 device69
+device 70 device70
+device 71 device71
+device 72 device72
+device 73 device73
+device 74 device74
+device 75 device75
+device 76 device76
+device 77 device77
+device 78 device78
+device 79 device79
+device 80 device80
+device 81 device81
+device 82 device82
+device 83 device83
+device 84 device84
+device 85 device85
+device 86 device86
+device 87 device87
+device 88 device88
+device 89 device89
+device 90 device90
+device 91 device91
+device 92 device92
+device 93 device93
+device 94 device94
+device 95 device95
+device 96 device96
+device 97 device97
+device 98 device98
+device 99 device99
+
+# types
+type 0 device
+type 1 host
+type 2 rack
+type 3 default
+
+# buckets
+host host0 {
+	id -1		# do not change unnecessarily
+	# weight 4.000
+	alg straw
+	hash 0	# rjenkins1
+	item device0 weight 1.000
+	item device1 weight 1.000
+	item device2 weight 1.000
+	item device3 weight 1.000
+}
+host host1 {
+	id -2		# do not change unnecessarily
+	# weight 4.000
+	alg straw
+	hash 0	# rjenkins1
+	item device4 weight 1.000
+	item device5 weight 1.000
+	item device6 weight 1.000
+	item device7 weight 1.000
+}
+host host2 {
+	id -3		# do not change unnecessarily
+	# weight 4.000
+	alg straw
+	hash 0	# rjenkins1
+	item device8 weight 1.000
+	item device9 weight 1.000
+	item device10 weight 1.000
+	item device11 weight 1.000
+}
+host host3 {
+	id -4		# do not change unnecessarily
+	# weight 4.000
+	alg straw
+	hash 0	# rjenkins1
+	item device12 weight 1.000
+	item device13 weight 1.000
+	item device14 weight 1.000
+	item device15 weight 1.000
+}
+host host4 {
+	id -5		# do not change unnecessarily
+	# weight 4.000
+	alg straw
+	hash 0	# rjenkins1
+	item device16 weight 1.000
+	item device17 weight 1.000
+	item device18 weight 1.000
+	item device19 weight 1.000
+}
+host host5 {
+	id -6		# do not change unnecessarily
+	# weight 4.000
+	alg straw
+	hash 0	# rjenkins1
+	item device20 weight 1.000
+	item device21 weight 1.000
+	item device22 weight 1.000
+	item device23 weight 1.000
+}
+host host6 {
+	id -7		# do not change unnecessarily
+	# weight 4.000
+	alg straw
+	hash 0	# rjenkins1
+	item device24 weight 1.000
+	item device25 weight 1.000
+	item device26 weight 1.000
+	item device27 weight 1.000
+}
+host host7 {
+	id -8		# do not change unnecessarily
+	# weight 4.000
+	alg straw
+	hash 0	# rjenkins1
+	item device28 weight 1.000
+	item device29 weight 1.000
+	item device30 weight 1.000
+	item device31 weight 1.000
+}
+host host8 {
+	id -9		# do not change unnecessarily
+	# weight 4.000
+	alg straw
+	hash 0	# rjenkins1
+	item device32 weight 1.000
+	item device33 weight 1.000
+	item device34 weight 1.000
+	item device35 weight 1.000
+}
+host host9 {
+	id -10		# do not change unnecessarily
+	# weight 4.000
+	alg straw
+	hash 0	# rjenkins1
+	item device36 weight 1.000
+	item device37 weight 1.000
+	item device38 weight 1.000
+	item device39 weight 1.000
+}
+host host10 {
+	id -11		# do not change unnecessarily
+	# weight 4.000
+	alg straw
+	hash 0	# rjenkins1
+	item device40 weight 1.000
+	item device41 weight 1.000
+	item device42 weight 1.000
+	item device43 weight 1.000
+}
+host host11 {
+	id -12		# do not change unnecessarily
+	# weight 4.000
+	alg straw
+	hash 0	# rjenkins1
+	item device44 weight 1.000
+	item device45 weight 1.000
+	item device46 weight 1.000
+	item device47 weight 1.000
+}
+host host12 {
+	id -13		# do not change unnecessarily
+	# weight 4.000
+	alg straw
+	hash 0	# rjenkins1
+	item device48 weight 1.000
+	item device49 weight 1.000
+	item device50 weight 1.000
+	item device51 weight 1.000
+}
+host host13 {
+	id -14		# do not change unnecessarily
+	# weight 4.000
+	alg straw
+	hash 0	# rjenkins1
+	item device52 weight 1.000
+	item device53 weight 1.000
+	item device54 weight 1.000
+	item device55 weight 1.000
+}
+host host14 {
+	id -15		# do not change unnecessarily
+	# weight 4.000
+	alg straw
+	hash 0	# rjenkins1
+	item device56 weight 1.000
+	item device57 weight 1.000
+	item device58 weight 1.000
+	item device59 weight 1.000
+}
+host host15 {
+	id -16		# do not change unnecessarily
+	# weight 4.000
+	alg straw
+	hash 0	# rjenkins1
+	item device60 weight 1.000
+	item device61 weight 1.000
+	item device62 weight 1.000
+	item device63 weight 1.000
+}
+host host16 {
+	id -17		# do not change unnecessarily
+	# weight 4.000
+	alg straw
+	hash 0	# rjenkins1
+	item device64 weight 1.000
+	item device65 weight 1.000
+	item device66 weight 1.000
+	item device67 weight 1.000
+}
+host host17 {
+	id -18		# do not change unnecessarily
+	# weight 4.000
+	alg straw
+	hash 0	# rjenkins1
+	item device68 weight 1.000
+	item device69 weight 1.000
+	item device70 weight 1.000
+	item device71 weight 1.000
+}
+host host18 {
+	id -19		# do not change unnecessarily
+	# weight 4.000
+	alg straw
+	hash 0	# rjenkins1
+	item device72 weight 1.000
+	item device73 weight 1.000
+	item device74 weight 1.000
+	item device75 weight 1.000
+}
+host host19 {
+	id -20		# do not change unnecessarily
+	# weight 4.000
+	alg straw
+	hash 0	# rjenkins1
+	item device76 weight 1.000
+	item device77 weight 1.000
+	item device78 weight 1.000
+	item device79 weight 1.000
+}
+host host20 {
+	id -21		# do not change unnecessarily
+	# weight 4.000
+	alg straw
+	hash 0	# rjenkins1
+	item device80 weight 1.000
+	item device81 weight 1.000
+	item device82 weight 1.000
+	item device83 weight 1.000
+}
+host host21 {
+	id -22		# do not change unnecessarily
+	# weight 4.000
+	alg straw
+	hash 0	# rjenkins1
+	item device84 weight 1.000
+	item device85 weight 1.000
+	item device86 weight 1.000
+	item device87 weight 1.000
+}
+host host22 {
+	id -23		# do not change unnecessarily
+	# weight 4.000
+	alg straw
+	hash 0	# rjenkins1
+	item device88 weight 1.000
+	item device89 weight 1.000
+	item device90 weight 1.000
+	item device91 weight 1.000
+}
+host host23 {
+	id -24		# do not change unnecessarily
+	# weight 4.000
+	alg straw
+	hash 0	# rjenkins1
+	item device92 weight 1.000
+	item device93 weight 1.000
+	item device94 weight 1.000
+	item device95 weight 1.000
+}
+host host24 {
+	id -25		# do not change unnecessarily
+	# weight 4.000
+	alg straw
+	hash 0	# rjenkins1
+	item device96 weight 1.000
+	item device97 weight 1.000
+	item device98 weight 1.000
+	item device99 weight 1.000
+}
+rack rack0 {
+	id -26		# do not change unnecessarily
+	# weight 40.000
+	alg straw
+	hash 0	# rjenkins1
+	item host0 weight 4.000
+	item host1 weight 4.000
+	item host2 weight 4.000
+	item host3 weight 4.000
+	item host4 weight 4.000
+	item host5 weight 4.000
+	item host6 weight 4.000
+	item host7 weight 4.000
+	item host8 weight 4.000
+	item host9 weight 4.000
+}
+rack rack1 {
+	id -27		# do not change unnecessarily
+	# weight 40.000
+	alg straw
+	hash 0	# rjenkins1
+	item host10 weight 4.000
+	item host11 weight 4.000
+	item host12 weight 4.000
+	item host13 weight 4.000
+	item host14 weight 4.000
+	item host15 weight 4.000
+	item host16 weight 4.000
+	item host17 weight 4.000
+	item host18 weight 4.000
+	item host19 weight 4.000
+}
+rack rack2 {
+	id -28		# do not change unnecessarily
+	# weight 20.000
+	alg straw
+	hash 0	# rjenkins1
+	item host20 weight 4.000
+	item host21 weight 4.000
+	item host22 weight 4.000
+	item host23 weight 4.000
+	item host24 weight 4.000
+}
+
+default root {
+	id -31		# do not change unnecessarily
+	# weight 100.000
+	alg straw
+	hash 0	# rjenkins1
+	item rack1 weight 40.000
+	item rack1 weight 40.000
+	item rack2 weight 20.000
+}
+
+# rules
+rule myrule {
+	ruleset 0
+	type replicated
+	min_size 1
+	max_size 10
+	step take root
+	step choose firstn 2 type rack
+	step chooseleaf indep 4 type host
+	step emit
+}
+
+rule myrule1 {
+	ruleset 1
+	type replicated
+	min_size 1
+	max_size 10
+	step take root
+	step choose firstn 2 type rack
+	step chooseleaf indep 1 type host
+	step emit
+}
+
+# end crush map
diff --git a/src/test/cli/crushtool/test-map-indep.t b/src/test/cli/crushtool/test-map-indep.t
index 5f6dbb3..f4ee371 100644
--- a/src/test/cli/crushtool/test-map-indep.t
+++ b/src/test/cli/crushtool/test-map-indep.t
@@ -1,4 +1,4 @@
-  $ crushtool -i "$TESTDIR/test-map-indep.crushmap" --test --show-statistics --rule 1 --set-choose-local-tries 0 --set-choose-local-fallback-tries 0 --set-choose-total-tries 50 --set-chooseleaf-descend-once 2
+  $ crushtool -i "$TESTDIR/test-map-indep.crushmap" --test --show-mappings --show-statistics --rule 1 --set-choose-local-tries 0 --set-choose-local-fallback-tries 0 --set-choose-total-tries 50 --set-chooseleaf-descend-once 2
   crushtool successfully built or modified map.  Use '-o <file>' to write it out.
   rule 1 (metadata), x = 0..1023, numrep = 1..10
   CRUSH rule 1 x 0 [36]
diff --git a/src/test/cli/crushtool/test-map-legacy-tunables.t b/src/test/cli/crushtool/test-map-legacy-tunables.t
index 12bf604..fe28c70 100644
--- a/src/test/cli/crushtool/test-map-legacy-tunables.t
+++ b/src/test/cli/crushtool/test-map-legacy-tunables.t
@@ -1,4 +1,4 @@
-  $ crushtool -i "$TESTDIR/test-map-a.crushmap" --test --show-statistics --rule 0
+  $ crushtool -i "$TESTDIR/test-map-a.crushmap" --test --show-mappings --show-statistics --rule 0
   rule 0 (data), x = 0..1023, numrep = 1..10
   CRUSH rule 0 x 0 [36]
   CRUSH rule 0 x 1 [876]
diff --git a/src/test/cli/crushtool/test-map-tries-vs-retries.t b/src/test/cli/crushtool/test-map-tries-vs-retries.t
index 8eac255..2a49838 100644
--- a/src/test/cli/crushtool/test-map-tries-vs-retries.t
+++ b/src/test/cli/crushtool/test-map-tries-vs-retries.t
@@ -1,4 +1,4 @@
-  $ crushtool -i "$TESTDIR/test-map-tries-vs-retries.crushmap" --test --show-statistics --weight 0 0 --weight 8 0
+  $ crushtool -i "$TESTDIR/test-map-tries-vs-retries.crushmap" --test --show-mappings --show-statistics --weight 0 0 --weight 8 0
   rule 0 (replicated_ruleset), x = 0..1023, numrep = 1..10
   CRUSH rule 0 x 0 [7]
   CRUSH rule 0 x 1 [10]
diff --git a/src/test/cli/crushtool/test-map-vary-r-0.t b/src/test/cli/crushtool/test-map-vary-r-0.t
index 663ef65..eefd862 100644
--- a/src/test/cli/crushtool/test-map-vary-r-0.t
+++ b/src/test/cli/crushtool/test-map-vary-r-0.t
@@ -1,4 +1,4 @@
-  $ crushtool -i "$TESTDIR/test-map-vary-r.crushmap" --test --show-statistics --rule 3 --set-chooseleaf-vary-r 0 --weight 0 0 --weight 4 0 --weight 9 0
+  $ crushtool -i "$TESTDIR/test-map-vary-r.crushmap" --test --show-mappings --show-statistics --rule 3 --set-chooseleaf-vary-r 0 --weight 0 0 --weight 4 0 --weight 9 0
   crushtool successfully built or modified map.  Use '-o <file>' to write it out.
   rule 3 (delltestrule), x = 0..1023, numrep = 2..4
   CRUSH rule 3 x 0 [94,85]
diff --git a/src/test/cli/crushtool/test-map-vary-r-1.t b/src/test/cli/crushtool/test-map-vary-r-1.t
index 4ac4c22..a21b9d5 100644
--- a/src/test/cli/crushtool/test-map-vary-r-1.t
+++ b/src/test/cli/crushtool/test-map-vary-r-1.t
@@ -1,4 +1,4 @@
-  $ crushtool -i "$TESTDIR/test-map-vary-r.crushmap" --test --show-statistics --rule 3 --set-chooseleaf-vary-r 1 --weight 0 0 --weight 4 0 --weight 9 0
+  $ crushtool -i "$TESTDIR/test-map-vary-r.crushmap" --test --show-mappings --show-statistics --rule 3 --set-chooseleaf-vary-r 1 --weight 0 0 --weight 4 0 --weight 9 0
   crushtool successfully built or modified map.  Use '-o <file>' to write it out.
   rule 3 (delltestrule), x = 0..1023, numrep = 2..4
   CRUSH rule 3 x 0 [94,6]
diff --git a/src/test/cli/crushtool/test-map-vary-r-2.t b/src/test/cli/crushtool/test-map-vary-r-2.t
index c9e78c6..eaf0542 100644
--- a/src/test/cli/crushtool/test-map-vary-r-2.t
+++ b/src/test/cli/crushtool/test-map-vary-r-2.t
@@ -1,4 +1,4 @@
-  $ crushtool -i "$TESTDIR/test-map-vary-r.crushmap" --test --show-statistics --rule 3 --set-chooseleaf-vary-r 2 --weight 0 0 --weight 4 0 --weight 9 0
+  $ crushtool -i "$TESTDIR/test-map-vary-r.crushmap" --test --show-mappings --show-statistics --rule 3 --set-chooseleaf-vary-r 2 --weight 0 0 --weight 4 0 --weight 9 0
   crushtool successfully built or modified map.  Use '-o <file>' to write it out.
   rule 3 (delltestrule), x = 0..1023, numrep = 2..4
   CRUSH rule 3 x 0 [94,45]
diff --git a/src/test/cli/crushtool/test-map-vary-r-3.t b/src/test/cli/crushtool/test-map-vary-r-3.t
index ad02e73..31943b2 100644
--- a/src/test/cli/crushtool/test-map-vary-r-3.t
+++ b/src/test/cli/crushtool/test-map-vary-r-3.t
@@ -1,4 +1,4 @@
-  $ crushtool -i "$TESTDIR/test-map-vary-r.crushmap" --test --show-statistics --rule 3 --set-chooseleaf-vary-r 3 --weight 0 0 --weight 4 0 --weight 9 0
+  $ crushtool -i "$TESTDIR/test-map-vary-r.crushmap" --test --show-mappings --show-statistics --rule 3 --set-chooseleaf-vary-r 3 --weight 0 0 --weight 4 0 --weight 9 0
   crushtool successfully built or modified map.  Use '-o <file>' to write it out.
   rule 3 (delltestrule), x = 0..1023, numrep = 2..4
   CRUSH rule 3 x 0 [94,85]
diff --git a/src/test/cli/crushtool/test-map-vary-r-4.t b/src/test/cli/crushtool/test-map-vary-r-4.t
index 059da77..24cf0ba 100644
--- a/src/test/cli/crushtool/test-map-vary-r-4.t
+++ b/src/test/cli/crushtool/test-map-vary-r-4.t
@@ -1,4 +1,4 @@
-  $ crushtool -i "$TESTDIR/test-map-vary-r.crushmap" --test --show-statistics --rule 3 --set-chooseleaf-vary-r 4 --weight 0 0 --weight 4 0 --weight 9 0
+  $ crushtool -i "$TESTDIR/test-map-vary-r.crushmap" --test --show-mappings --show-statistics --rule 3 --set-chooseleaf-vary-r 4 --weight 0 0 --weight 4 0 --weight 9 0
   crushtool successfully built or modified map.  Use '-o <file>' to write it out.
   rule 3 (delltestrule), x = 0..1023, numrep = 2..4
   CRUSH rule 3 x 0 [94,85]
diff --git a/src/test/cli/crushtool/tree.template b/src/test/cli/crushtool/tree.template
new file mode 100644
index 0000000..9808578
Binary files /dev/null and b/src/test/cli/crushtool/tree.template differ
diff --git a/src/test/cli/crushtool/tree.template.final b/src/test/cli/crushtool/tree.template.final
new file mode 100644
index 0000000..6af0701
--- /dev/null
+++ b/src/test/cli/crushtool/tree.template.final
@@ -0,0 +1,70 @@
+# begin crush map
+
+# devices
+device 0 device0
+device 1 device1
+device 2 device2
+device 3 device3
+device 4 device4
+device 5 device5
+device 6 device6
+device 7 device7
+
+# types
+type 0 device
+type 1 host
+type 2 cluster
+
+# buckets
+host host0 {
+	id -2		# do not change unnecessarily
+	# weight 8.000
+	alg tree	# do not change pos for existing items unnecessarily
+	hash 0	# rjenkins1
+	item device0 weight 1.000 pos 0
+	item device1 weight 1.000 pos 1
+	item device2 weight 1.000 pos 2
+	item device3 weight 1.000 pos 3
+	item device4 weight 1.000 pos 4
+	item device5 weight 1.000 pos 5
+	item device6 weight 1.000 pos 6
+	item device7 weight 1.000 pos 7
+}
+cluster cluster0 {
+	id -1		# do not change unnecessarily
+	# weight 8.000
+	alg tree	# do not change pos for existing items unnecessarily
+	hash 0	# rjenkins1
+	item host0 weight 8.000 pos 0
+}
+
+# rules
+rule data {
+	ruleset 0
+	type replicated
+	min_size 1
+	max_size 10
+	step take cluster0
+	step chooseleaf firstn 0 type host
+	step emit
+}
+rule metadata {
+	ruleset 1
+	type replicated
+	min_size 1
+	max_size 10
+	step take cluster0
+	step chooseleaf firstn 0 type host
+	step emit
+}
+rule rbd {
+	ruleset 2
+	type replicated
+	min_size 1
+	max_size 10
+	step take cluster0
+	step chooseleaf firstn 0 type host
+	step emit
+}
+
+# end crush map
diff --git a/src/test/cli/osdmaptool/create-print.t b/src/test/cli/osdmaptool/create-print.t
index 9ebd274..b642cfb 100644
--- a/src/test/cli/osdmaptool/create-print.t
+++ b/src/test/cli/osdmaptool/create-print.t
@@ -11,6 +11,7 @@
   tunable choose_local_fallback_tries 0
   tunable choose_total_tries 50
   tunable chooseleaf_descend_once 1
+  tunable straw_calc_version 1
   
   # devices
   device 0 osd.0
diff --git a/src/test/cli/osdmaptool/create-racks.t b/src/test/cli/osdmaptool/create-racks.t
index 33fa9ee..11e3223 100644
--- a/src/test/cli/osdmaptool/create-racks.t
+++ b/src/test/cli/osdmaptool/create-racks.t
@@ -10,6 +10,7 @@
   tunable choose_local_fallback_tries 0
   tunable choose_total_tries 50
   tunable chooseleaf_descend_once 1
+  tunable straw_calc_version 1
   
   # devices
   device 0 device0
diff --git a/src/test/cli/osdmaptool/crush.t b/src/test/cli/osdmaptool/crush.t
index 5833da8..584da09 100644
--- a/src/test/cli/osdmaptool/crush.t
+++ b/src/test/cli/osdmaptool/crush.t
@@ -6,5 +6,5 @@
   osdmaptool: exported crush map to oc
   $ osdmaptool --import-crush oc myosdmap
   osdmaptool: osdmap file 'myosdmap'
-  osdmaptool: imported 486 byte crush map from oc
+  osdmaptool: imported 487 byte crush map from oc
   osdmaptool: writing epoch 3 to myosdmap
diff --git a/src/test/cli/osdmaptool/help.t b/src/test/cli/osdmaptool/help.t
index 2c5a41d..02f56ef 100644
--- a/src/test/cli/osdmaptool/help.t
+++ b/src/test/cli/osdmaptool/help.t
@@ -4,6 +4,7 @@
      --export-crush <file>   write osdmap's crush map to <file>
      --import-crush <file>   replace osdmap's crush map with <file>
      --test-map-pgs [--pool <poolid>] map all pgs
+     --test-map-pgs-dump [--pool <poolid>] map all pgs
      --mark-up-in            mark osds up and in (but do not persist)
      --clear-temp            clear pg_temp and primary_temp
      --test-random           do random placements
diff --git a/src/test/cli/osdmaptool/missing-argument.t b/src/test/cli/osdmaptool/missing-argument.t
index d0740ab..87ab3eb 100644
--- a/src/test/cli/osdmaptool/missing-argument.t
+++ b/src/test/cli/osdmaptool/missing-argument.t
@@ -4,6 +4,7 @@
      --export-crush <file>   write osdmap's crush map to <file>
      --import-crush <file>   replace osdmap's crush map with <file>
      --test-map-pgs [--pool <poolid>] map all pgs
+     --test-map-pgs-dump [--pool <poolid>] map all pgs
      --mark-up-in            mark osds up and in (but do not persist)
      --clear-temp            clear pg_temp and primary_temp
      --test-random           do random placements
diff --git a/src/test/cli/osdmaptool/test-map-pgs.t b/src/test/cli/osdmaptool/test-map-pgs.t
index b64f2d9..222bd76 100644
--- a/src/test/cli/osdmaptool/test-map-pgs.t
+++ b/src/test/cli/osdmaptool/test-map-pgs.t
@@ -24,7 +24,7 @@
   pool 1 pg_num 8000
   pool 2 pg_num 8000
   $ TOTAL=$((POOL_COUNT * $PG_NUM))
-  $ PATTERN=$(echo "size $SIZE\t$TOTAL")
+  $ PATTERN=$(echo "size $SIZE.$TOTAL")
   $ grep "$PATTERN" $OUT || cat "$OUT"
   size 3\t24000 (esc)
   $ STATS_CRUSH=$(grep '^ avg ' "$OUT")
@@ -39,7 +39,7 @@
   pool 1 pg_num 8000
   pool 2 pg_num 8000
   $ TOTAL=$((POOL_COUNT * $PG_NUM))
-  $ PATTERN=$(echo "size $SIZE\t$TOTAL")
+  $ PATTERN=$(echo "size $SIZE.$TOTAL")
   $ grep "$PATTERN" $OUT || cat "$OUT"
   size 3\t24000 (esc)
   $ STATS_RANDOM=$(grep '^ avg ' "$OUT")
diff --git a/src/test/common/histogram.cc b/src/test/common/histogram.cc
index 2fd3cfe..765f4c9 100644
--- a/src/test/common/histogram.cc
+++ b/src/test/common/histogram.cc
@@ -47,61 +47,64 @@ TEST(Histogram, Set) {
 }
 
 TEST(Histogram, Position) {
-  {
-    pow2_hist_t h;
-    uint64_t lb, ub;
-    h.add(0);
-    ASSERT_EQ(-1, h.get_position_micro(-20, &lb, &ub));
-  }
-  {
-    pow2_hist_t h;
-    h.add(0);
-    uint64_t lb, ub;
-    h.get_position_micro(0, &lb, &ub);
-    ASSERT_EQ(0u, lb);
-    ASSERT_EQ(1000000u, ub);
-    h.add(0);
-    h.add(0);
-    h.add(0);
-    h.get_position_micro(0, &lb, &ub);
-    ASSERT_EQ(0u, lb);
-    ASSERT_EQ(1000000u, ub);
-  }
-  {
-    pow2_hist_t h;
-    h.add(1);
-    h.add(1);
-    uint64_t lb, ub;
-    h.get_position_micro(0, &lb, &ub);
-    ASSERT_EQ(0u, lb);
-    ASSERT_EQ(0u, ub);
-    h.add(0);
-    h.get_position_micro(0, &lb, &ub);
-    ASSERT_EQ(0u, lb);
-    ASSERT_EQ(333333u, ub);
-    h.get_position_micro(1, &lb, &ub);
-    ASSERT_EQ(333333u, lb);
-    ASSERT_EQ(1000000u, ub);
-  }
-  {
-    pow2_hist_t h;
-    h.h.resize(10, 0);
-    h.h[0] = 1;
-    h.h[5] = 1;
-    uint64_t lb, ub;
-    h.get_position_micro(4, &lb, &ub);
-    ASSERT_EQ(500000u, lb);
-    ASSERT_EQ(500000u, ub);
-  }
-  {
-    pow2_hist_t h;
-    h.h.resize(10, 0);
-    h.h[0] = UINT_MAX;
-    h.h[5] = UINT_MAX;
-    uint64_t lb, ub;
-    ASSERT_EQ(500000u, lb);
-    ASSERT_EQ(500000u, ub);
-  }
+  pow2_hist_t h;
+  uint64_t lb, ub;
+  h.add(0);
+  ASSERT_EQ(-1, h.get_position_micro(-20, &lb, &ub));
+}
+
+TEST(Histogram, Position1) {
+  pow2_hist_t h;
+  h.add(0);
+  uint64_t lb, ub;
+  h.get_position_micro(0, &lb, &ub);
+  ASSERT_EQ(0u, lb);
+  ASSERT_EQ(1000000u, ub);
+  h.add(0);
+  h.add(0);
+  h.add(0);
+  h.get_position_micro(0, &lb, &ub);
+  ASSERT_EQ(0u, lb);
+  ASSERT_EQ(1000000u, ub);
+}
+
+TEST(Histogram, Position2) {
+  pow2_hist_t h;
+  h.add(1);
+  h.add(1);
+  uint64_t lb, ub;
+  h.get_position_micro(0, &lb, &ub);
+  ASSERT_EQ(0u, lb);
+  ASSERT_EQ(0u, ub);
+  h.add(0);
+  h.get_position_micro(0, &lb, &ub);
+  ASSERT_EQ(0u, lb);
+  ASSERT_EQ(333333u, ub);
+  h.get_position_micro(1, &lb, &ub);
+  ASSERT_EQ(333333u, lb);
+  ASSERT_EQ(1000000u, ub);
+}
+
+TEST(Histogram, Position3) {
+  pow2_hist_t h;
+  h.h.resize(10, 0);
+  h.h[0] = 1;
+  h.h[5] = 1;
+  uint64_t lb, ub;
+  h.get_position_micro(4, &lb, &ub);
+  ASSERT_EQ(500000u, lb);
+  ASSERT_EQ(500000u, ub);
+}
+
+TEST(Histogram, Position4) {
+  pow2_hist_t h;
+  h.h.resize(10, 0);
+  h.h[0] = UINT_MAX;
+  h.h[5] = UINT_MAX;
+  uint64_t lb, ub;
+  h.get_position_micro(4, &lb, &ub);
+  ASSERT_EQ(0u, lb);
+  ASSERT_EQ(0u, ub);
 }
 
 TEST(Histogram, Decay) {
diff --git a/src/test/common/test_io_priority.cc b/src/test/common/test_io_priority.cc
new file mode 100644
index 0000000..b2d4e26
--- /dev/null
+++ b/src/test/common/test_io_priority.cc
@@ -0,0 +1,51 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 Red Hat <contact at redhat.com>
+ *
+ * Author: Loic Dachary <loic at dachary.org>
+ *
+ *  This library is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU Lesser General Public
+ *  License as published by the Free Software Foundation; either
+ *  version 2.1 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <errno.h>
+#include <gtest/gtest.h>
+
+#include "common/io_priority.h"
+
+TEST(io_priority, ceph_ioprio_string_to_class) {
+  ASSERT_EQ(IOPRIO_CLASS_IDLE, ceph_ioprio_string_to_class("idle"));
+  ASSERT_EQ(IOPRIO_CLASS_IDLE, ceph_ioprio_string_to_class("IDLE"));
+
+  ASSERT_EQ(IOPRIO_CLASS_BE, ceph_ioprio_string_to_class("be"));
+  ASSERT_EQ(IOPRIO_CLASS_BE, ceph_ioprio_string_to_class("BE"));
+  ASSERT_EQ(IOPRIO_CLASS_BE, ceph_ioprio_string_to_class("besteffort"));
+  ASSERT_EQ(IOPRIO_CLASS_BE, ceph_ioprio_string_to_class("BESTEFFORT"));
+  ASSERT_EQ(IOPRIO_CLASS_BE, ceph_ioprio_string_to_class("best effort"));
+  ASSERT_EQ(IOPRIO_CLASS_BE, ceph_ioprio_string_to_class("BEST EFFORT"));
+
+  ASSERT_EQ(IOPRIO_CLASS_RT, ceph_ioprio_string_to_class("rt"));
+  ASSERT_EQ(IOPRIO_CLASS_RT, ceph_ioprio_string_to_class("RT"));
+  ASSERT_EQ(IOPRIO_CLASS_RT, ceph_ioprio_string_to_class("realtime"));
+  ASSERT_EQ(IOPRIO_CLASS_RT, ceph_ioprio_string_to_class("REALTIME"));
+  ASSERT_EQ(IOPRIO_CLASS_RT, ceph_ioprio_string_to_class("real time"));
+  ASSERT_EQ(IOPRIO_CLASS_RT, ceph_ioprio_string_to_class("REAL TIME"));
+
+  ASSERT_EQ(-EINVAL, ceph_ioprio_string_to_class("invalid"));
+}
+
+/*
+ * Local Variables:
+ * compile-command: "cd ../.. ; 
+ *   make -j4 unittest_io_priority &&
+ *   libtool --mode=execute valgrind --tool=memcheck --leak-check=full \
+ *      ./unittest_io_priority
+ *   "
+ * End:
+ */
diff --git a/src/test/crush/TestCrushWrapper.cc b/src/test/crush/TestCrushWrapper.cc
index 34d6401..f5dcfa1 100644
--- a/src/test/crush/TestCrushWrapper.cc
+++ b/src/test/crush/TestCrushWrapper.cc
@@ -67,6 +67,166 @@ TEST(CrushWrapper, get_immediate_parent) {
   delete c;
 }
 
+TEST(CrushWrapper, straw_zero) {
+  // zero weight items should have no effect on placement: root0 holds all n
+  // items (the last one has weight 0), root1 only the first n-1 of them.
+  CrushWrapper c;  // on the stack: released even when an ASSERT_* returns early
+  const int ROOT_TYPE = 1;
+  c.set_type_name(ROOT_TYPE, "root");
+  const int OSD_TYPE = 0;
+  c.set_type_name(OSD_TYPE, "osd");
+
+  const int n = 5;
+  int items[n], weights[n];
+  for (int i=0; i <n; ++i) {
+    items[i] = i;
+    weights[i] = 0x10000 * (n-i-1);
+  }
+
+  c.set_max_devices(n);
+
+  string root_name0("root0");
+  int root0;
+  EXPECT_EQ(0, c.add_bucket(0, CRUSH_BUCKET_STRAW, CRUSH_HASH_RJENKINS1,
+			    ROOT_TYPE, n, items, weights, &root0));
+  EXPECT_EQ(0, c.set_item_name(root0, root_name0));
+
+  string name0("rule0");
+  int ruleset0 = c.add_simple_ruleset(name0, root_name0, "osd",
+				      "firstn", pg_pool_t::TYPE_REPLICATED);
+  EXPECT_EQ(0, ruleset0);
+
+  string root_name1("root1");
+  int root1;
+  EXPECT_EQ(0, c.add_bucket(0, CRUSH_BUCKET_STRAW, CRUSH_HASH_RJENKINS1,
+			    ROOT_TYPE, n-1, items, weights, &root1));
+  EXPECT_EQ(0, c.set_item_name(root1, root_name1));
+
+  string name1("rule1");
+  int ruleset1 = c.add_simple_ruleset(name1, root_name1, "osd",
+				      "firstn", pg_pool_t::TYPE_REPLICATED);
+  EXPECT_EQ(1, ruleset1);
+
+  vector<unsigned> reweight(n, 0x10000);
+  for (int i=0; i<10000; ++i) {
+    vector<int> out0, out1;
+    c.do_rule(ruleset0, i, out0, 1, reweight);
+    ASSERT_EQ(1u, out0.size());
+    c.do_rule(ruleset1, i, out1, 1, reweight);
+    ASSERT_EQ(1u, out1.size());
+    ASSERT_EQ(out0[0], out1[0]);
+    //cout << i << "\t" << out0 << "\t" << out1 << std::endl;
+  }
+}
+
+TEST(CrushWrapper, straw_same) {
+  // items with the same weight should map about the same as items
+  // with very similar weights.
+  //
+  // give the 0 vector a paired stair pattern, with dup weights.  note
+  // that the original straw flaw does not appear when there are 2 of
+  // the initial weight, but it does when there is just 1.
+  //
+  // give the 1 vector a similar stair pattern, but make the same
+  // steps weights slightly different (no dups).  this works.
+  //
+  // compare the result and verify that the resulting mapping is
+  // almost identical.
+
+  CrushWrapper c;  // on the stack: released even when an ASSERT_* returns early
+  const int ROOT_TYPE = 1;
+  c.set_type_name(ROOT_TYPE, "root");
+  const int OSD_TYPE = 0;
+  c.set_type_name(OSD_TYPE, "osd");
+
+  const int n = 10;
+  int items[n], weights[n];
+  for (int i=0; i <n; ++i) {
+    items[i] = i;
+    weights[i] = 0x10000 * ((i+1)/2 + 1);
+  }
+
+  c.set_max_devices(n);
+
+  string root_name0("root0");
+  int root0;
+  EXPECT_EQ(0, c.add_bucket(0, CRUSH_BUCKET_STRAW, CRUSH_HASH_RJENKINS1,
+			    ROOT_TYPE, n, items, weights, &root0));
+  EXPECT_EQ(0, c.set_item_name(root0, root_name0));
+
+  string name0("rule0");
+  int ruleset0 = c.add_simple_ruleset(name0, root_name0, "osd",
+				      "firstn", pg_pool_t::TYPE_REPLICATED);
+  EXPECT_EQ(0, ruleset0);
+
+  for (int i=0; i <n; ++i) {
+    items[i] = i;
+    weights[i] = 0x10000 * ((i+1)/2 + 1) + (i%2)*100;
+  }
+
+  string root_name1("root1");
+  int root1;
+  EXPECT_EQ(0, c.add_bucket(0, CRUSH_BUCKET_STRAW, CRUSH_HASH_RJENKINS1,
+			    ROOT_TYPE, n, items, weights, &root1));
+  EXPECT_EQ(0, c.set_item_name(root1, root_name1));
+
+  string name1("rule1");
+  int ruleset1 = c.add_simple_ruleset(name1, root_name1, "osd",
+				      "firstn", pg_pool_t::TYPE_REPLICATED);
+  EXPECT_EQ(1, ruleset1);
+
+  if (0) {
+    crush_bucket_straw *sb0 = reinterpret_cast<crush_bucket_straw*>(c.get_crush_map()->buckets[-1-root0]);
+    crush_bucket_straw *sb1 = reinterpret_cast<crush_bucket_straw*>(c.get_crush_map()->buckets[-1-root1]);
+
+    for (int i=0; i<n; ++i) {
+      cout << i
+	   << "\t" << sb0->item_weights[i]
+	   << "\t" << sb1->item_weights[i]
+	   << "\t"
+	   << "\t" << sb0->straws[i]
+	   << "\t" << sb1->straws[i]
+	   << std::endl;
+    }
+  }
+
+  if (0) {
+    JSONFormatter jf(true);
+    jf.open_object_section("crush");
+    c.dump(&jf);
+    jf.close_section();
+    jf.flush(cout);
+  }
+
+  vector<int> sum0(n, 0), sum1(n, 0);
+  vector<unsigned> reweight(n, 0x10000);
+  int different = 0;
+  int max = 100000;
+  for (int i=0; i<max; ++i) {
+    vector<int> out0, out1;
+    c.do_rule(ruleset0, i, out0, 1, reweight);
+    ASSERT_EQ(1u, out0.size());
+    c.do_rule(ruleset1, i, out1, 1, reweight);
+    ASSERT_EQ(1u, out1.size());
+    sum0[out0[0]]++;
+    sum1[out1[0]]++;
+    if (out0[0] != out1[0])
+      different++;
+  }
+  for (int i=0; i<n; ++i) {
+    cout << i
+	 << "\t" << ((double)weights[i] / (double)weights[0])
+	 << "\t" << sum0[i] << "\t" << ((double)sum0[i]/(double)sum0[0])
+	 << "\t" << sum1[i] << "\t" << ((double)sum1[i]/(double)sum1[0])
+	 << std::endl;
+  }
+  double ratio = ((double)different / (double)max);
+  cout << different << " of " << max << " = "
+       << ratio
+       << " different" << std::endl;
+  ASSERT_LT(ratio, .001);
+}
+
 TEST(CrushWrapper, move_bucket) {
   CrushWrapper *c = new CrushWrapper;
 
@@ -290,6 +450,116 @@ TEST(CrushWrapper, update_item) {
   delete c;
 }
 
+TEST(CrushWrapper, adjust_item_weight) {
+  CrushWrapper c;  // on the stack so it is released on every exit path
+
+  const int ROOT_TYPE = 2;
+  c.set_type_name(ROOT_TYPE, "root");
+  const int HOST_TYPE = 1;
+  c.set_type_name(HOST_TYPE, "host");
+  const int OSD_TYPE = 0;
+  c.set_type_name(OSD_TYPE, "osd");
+
+  int rootno;
+  c.add_bucket(0, CRUSH_BUCKET_STRAW, CRUSH_HASH_RJENKINS1,
+	       ROOT_TYPE, 0, NULL, NULL, &rootno);
+  c.set_item_name(rootno, "default");
+
+  const string HOST0("host0");
+  int host0;
+  c.add_bucket(0, CRUSH_BUCKET_STRAW, CRUSH_HASH_RJENKINS1,
+	       HOST_TYPE, 0, NULL, NULL, &host0);
+  c.set_item_name(host0, HOST0);
+
+  const string FAKE("fake");
+  int hostfake;
+  c.add_bucket(0, CRUSH_BUCKET_STRAW, CRUSH_HASH_RJENKINS1,
+	       HOST_TYPE, 0, NULL, NULL, &hostfake);
+  c.set_item_name(hostfake, FAKE);
+
+  int item = 0;
+
+  // construct crush map
+
+  {
+    map<string,string> loc;
+    loc["host"] = "host0";
+    float host_weight = 2.0;
+    int bucket_id = 0;
+
+    item = 0;
+    EXPECT_EQ(0, c.insert_item(g_ceph_context, item, 1.0,
+			       "osd." + stringify(item), loc));
+    item = 1;
+    EXPECT_EQ(0, c.insert_item(g_ceph_context, item, 1.0,
+			       "osd." + stringify(item), loc));
+
+    bucket_id = c.get_item_id("host0");
+    EXPECT_TRUE(c.bucket_exists(bucket_id));
+    EXPECT_EQ(host_weight, c.get_bucket_weightf(bucket_id));
+
+  }
+
+  {
+    map<string,string> loc;
+    loc["host"] = "fake";
+    float host_weight = 2.0;
+    int bucket_id = 0;
+
+    item = 0;
+    EXPECT_EQ(0, c.insert_item(g_ceph_context, item, 1.0,
+			       "osd." + stringify(item), loc));
+    item = 1;
+    EXPECT_EQ(0, c.insert_item(g_ceph_context, item, 1.0,
+			       "osd." + stringify(item), loc));
+
+    bucket_id = c.get_item_id("fake");
+    EXPECT_TRUE(c.bucket_exists(bucket_id));
+    EXPECT_EQ(host_weight, c.get_bucket_weightf(bucket_id));
+  }
+
+  //
+  //   When there is:
+  //
+  //   default --> host0 --> osd.0 1.0
+  //           |         |
+  //           |         +-> osd.1 1.0
+  //           |
+  //           +-> fake  --> osd.0 1.0
+  //                     |
+  //                     +-> osd.1 1.0
+  //
+  //   Trying to adjust osd.0 weight to 2.0 in all buckets
+  //   Trying to adjust osd.1 weight to 2.0 in host=fake
+  //
+  //   So the crush map will be:
+  //
+  //   default --> host0 --> osd.0 2.0
+  //           |         |
+  //           |         +-> osd.1 1.0
+  //           |
+  //           +-> fake  --> osd.0 2.0
+  //                     |
+  //                     +-> osd.1 2.0
+  //
+
+  float original_weight = 1.0;
+  float modified_weight = 2.0;
+  map<string,string> loc_one, loc_two;
+  loc_one["host"] = "host0";
+  loc_two["host"] = "fake";
+
+  item = 0;
+  EXPECT_EQ(2, c.adjust_item_weightf(g_ceph_context, item, modified_weight));
+  EXPECT_EQ(modified_weight, c.get_item_weightf_in_loc(item, loc_one));
+  EXPECT_EQ(modified_weight, c.get_item_weightf_in_loc(item, loc_two));
+
+  item = 1;
+  EXPECT_EQ(1, c.adjust_item_weightf_in_loc(g_ceph_context, item, modified_weight, loc_two));
+  EXPECT_EQ(original_weight, c.get_item_weightf_in_loc(item, loc_one));
+  EXPECT_EQ(modified_weight, c.get_item_weightf_in_loc(item, loc_two));
+}
+
 TEST(CrushWrapper, insert_item) {
   CrushWrapper *c = new CrushWrapper;
 
diff --git a/src/test/crush/indep.cc b/src/test/crush/indep.cc
index 896e58f..dd0b542 100644
--- a/src/test/crush/indep.cc
+++ b/src/test/crush/indep.cc
@@ -51,18 +51,21 @@ CrushWrapper *build_indep_map(CephContext *cct, int num_rack, int num_host,
       }
     }
   }
-
-  crush_rule *rule = crush_make_rule(4, 0, 123, 1, 20);
-  assert(rule);
-  crush_rule_set_step(rule, 0, CRUSH_RULE_SET_CHOOSELEAF_TRIES, 10, 0);
-  crush_rule_set_step(rule, 1, CRUSH_RULE_TAKE, rootno, 0);
-  crush_rule_set_step(rule, 2,
-		      CRUSH_RULE_CHOOSELEAF_INDEP,
-		      CRUSH_CHOOSE_N,
-		      1);
-  crush_rule_set_step(rule, 3, CRUSH_RULE_EMIT, 0, 0);
-  int rno = crush_add_rule(c->crush, rule, -1);
-  c->set_rule_name(rno, "data");
+  int ret;
+  int ruleno = 0;
+  int ruleset = 0;
+  ruleno = ruleset;
+  ret = c->add_rule(4, ruleset, 123, 1, 20, ruleno);
+  assert(ret == ruleno);
+  ret = c->set_rule_step(ruleno, 0, CRUSH_RULE_SET_CHOOSELEAF_TRIES, 10, 0);
+  assert(ret == 0);
+  ret = c->set_rule_step(ruleno, 1, CRUSH_RULE_TAKE, rootno, 0);
+  assert(ret == 0);
+  ret = c->set_rule_step(ruleno, 2, CRUSH_RULE_CHOOSELEAF_INDEP, CRUSH_CHOOSE_N, 1);
+  assert(ret == 0);
+  ret = c->set_rule_step(ruleno, 3, CRUSH_RULE_EMIT, 0, 0);
+  assert(ret == 0);
+  c->set_rule_name(ruleno, "data");
 
   if (false) {
     Formatter *f = new_formatter("json-pretty");
@@ -140,7 +143,7 @@ TEST(CRUSH, indep_out_alt) {
   c->dump_tree(weight, &cout, NULL);
 
   // need more retries to get 9/9 hosts for x in 0..99
-  c->crush->choose_total_tries = 100;
+  c->set_choose_total_tries(100);
   for (int x = 0; x < 100; ++x) {
     vector<int> out;
     c->do_rule(0, x, out, 9, weight);
@@ -166,7 +169,7 @@ TEST(CRUSH, indep_out_contig) {
     weight[i] = 0;
   c->dump_tree(weight, &cout, NULL);
 
-  c->crush->choose_total_tries = 100;
+  c->set_choose_total_tries(100);
   for (int x = 0; x < 100; ++x) {
     vector<int> out;
     c->do_rule(0, x, out, 7, weight);
@@ -185,7 +188,7 @@ TEST(CRUSH, indep_out_contig) {
 
 TEST(CRUSH, indep_out_progressive) {
   CrushWrapper *c = build_indep_map(g_ceph_context, 3, 3, 3);
-  c->crush->choose_total_tries = 100;
+  c->set_choose_total_tries(100);
   vector<__u32> tweight(c->get_max_devices(), 0x10000);
   c->dump_tree(tweight, &cout, NULL);
 
diff --git a/src/test/erasure-code/Makefile.am b/src/test/erasure-code/Makefile.am
index fdbe003..c91eef9 100644
--- a/src/test/erasure-code/Makefile.am
+++ b/src/test/erasure-code/Makefile.am
@@ -9,6 +9,14 @@ ceph_erasure_code_benchmark_LDADD += -ldl
 endif
 bin_DEBUGPROGRAMS += ceph_erasure_code_benchmark
 
+ceph_erasure_code_non_regression_SOURCES = \
+	test/erasure-code/ceph_erasure_code_non_regression.cc
+ceph_erasure_code_non_regression_LDADD = $(LIBOSD) $(LIBCOMMON) $(BOOST_PROGRAM_OPTIONS_LIBS) $(CEPH_GLOBAL)
+if LINUX
+ceph_erasure_code_non_regression_LDADD += -ldl
+endif
+noinst_PROGRAMS += ceph_erasure_code_non_regression
+
 ceph_erasure_code_SOURCES = \
 	test/erasure-code/ceph_erasure_code.cc
 ceph_erasure_code_LDADD = $(LIBOSD) $(LIBCOMMON) $(BOOST_PROGRAM_OPTIONS_LIBS) $(CEPH_GLOBAL)
diff --git a/src/test/erasure-code/TestErasureCodeJerasure.cc b/src/test/erasure-code/TestErasureCodeJerasure.cc
index 5c637da..4b768a8 100644
--- a/src/test/erasure-code/TestErasureCodeJerasure.cc
+++ b/src/test/erasure-code/TestErasureCodeJerasure.cc
@@ -288,36 +288,6 @@ TEST(ErasureCodeTest, create_ruleset)
     }
   }
 
-  //
-  // The ruleid may be different from the ruleset when a crush rule is
-  // removed because the removed ruleid will be reused but the removed
-  // ruleset will not be reused. 
-  //
-  // This also asserts that the create_ruleset() method returns a
-  // ruleset and not a ruleid http://tracker.ceph.com/issues/9044
-  //
-  {
-    stringstream ss;
-    ErasureCodeJerasureReedSolomonVandermonde jerasure;
-    map<std::string,std::string> parameters;
-    parameters["k"] = "2";
-    parameters["m"] = "2";
-    parameters["w"] = "8";
-    jerasure.init(parameters);
-    int FIRST = jerasure.create_ruleset("FIRST", *c, &ss);
-    int SECOND = jerasure.create_ruleset("SECOND", *c, &ss);
-    int FIRST_ruleid = c->get_rule_id("FIRST");
-    EXPECT_EQ(0, c->remove_rule(FIRST_ruleid));
-    int ruleset = jerasure.create_ruleset("myrule", *c, &ss);
-    EXPECT_NE(FIRST, ruleset);
-    EXPECT_NE(SECOND, ruleset);
-    EXPECT_NE(ruleset, c->get_rule_id("myrule"));
-    int SECOND_ruleid = c->get_rule_id("SECOND");
-    EXPECT_EQ(0, c->remove_rule(SECOND_ruleid));
-    int myrule_ruleid = c->get_rule_id("myrule");
-    EXPECT_EQ(0, c->remove_rule(myrule_ruleid));
-  }
-
   {
     stringstream ss;
     ErasureCodeJerasureReedSolomonVandermonde jerasure;
diff --git a/src/test/erasure-code/ceph_erasure_code_non_regression.cc b/src/test/erasure-code/ceph_erasure_code_non_regression.cc
new file mode 100644
index 0000000..c04accf
--- /dev/null
+++ b/src/test/erasure-code/ceph_erasure_code_non_regression.cc
@@ -0,0 +1,325 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph distributed storage system
+ *
+ * Red Hat (C) 2014 Red Hat <contact at redhat.com>
+ *
+ * Author: Loic Dachary <loic at dachary.org>
+ *
+ *  This library is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU Lesser General Public
+ *  License as published by the Free Software Foundation; either
+ *  version 2.1 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <errno.h>
+#include <boost/scoped_ptr.hpp>
+#include <boost/lexical_cast.hpp>
+#include <boost/program_options/option.hpp>
+#include <boost/program_options/options_description.hpp>
+#include <boost/program_options/variables_map.hpp>
+#include <boost/program_options/cmdline.hpp>
+#include <boost/program_options/parsers.hpp>
+#include <boost/algorithm/string.hpp>
+
+#include "global/global_context.h"
+#include "global/global_init.h"
+#include "common/errno.h"
+#include "common/ceph_argparse.h"
+#include "common/config.h"
+#include "erasure-code/ErasureCodePlugin.h"
+
+namespace po = boost::program_options;
+using namespace std;
+
+class ErasureCodeNonRegression {
+  unsigned stripe_width;
+  string plugin;
+  bool create;
+  bool check;
+  string base;
+  string directory;
+  map<string,string> parameters;
+public:
+  int setup(int argc, char** argv);
+  int run();
+  int run_create();
+  int run_check();
+  int decode_erasures(ErasureCodeInterfaceRef erasure_code,
+		      set<int> erasures,
+		      map<int,bufferlist> chunks);
+  string content_path();
+  string chunk_path(unsigned int chunk);
+};
+
+int ErasureCodeNonRegression::setup(int argc, char** argv) {
+  // Parse the command line into the members; returns 0 on success, 1 otherwise.
+  po::options_description desc("Allowed options");
+  desc.add_options()
+    ("help,h", "produce help message")
+    ("stripe-width,s", po::value<int>()->default_value(4 * 1024),
+     "stripe_width, i.e. the size of the buffer to be encoded")
+    ("plugin,p", po::value<string>()->default_value("jerasure"),
+     "erasure code plugin name")
+    ("base", po::value<string>()->default_value("."),
+     "prefix all paths with base")
+    ("parameter,P", po::value<vector<string> >(),
+     "parameters")
+    ("create", "create the erasure coded content in the directory")
+    ("check", "check the content in the directory matches the chunks and vice versa")
+    ;
+
+  po::variables_map vm;
+  po::parsed_options parsed =
+    po::command_line_parser(argc, argv).options(desc).allow_unregistered().run();
+  po::store(
+    parsed,
+    vm);
+  po::notify(vm);
+
+  vector<const char *> ceph_options, def_args;
+  vector<string> ceph_option_strings = po::collect_unrecognized(
+    parsed.options, po::include_positional);
+  ceph_options.reserve(ceph_option_strings.size());
+  for (vector<string>::iterator i = ceph_option_strings.begin();
+       i != ceph_option_strings.end();
+       ++i) {
+    ceph_options.push_back(i->c_str());
+  }
+
+  global_init(
+    &def_args, ceph_options, CEPH_ENTITY_TYPE_CLIENT,
+    CODE_ENVIRONMENT_UTILITY,
+    CINIT_FLAG_NO_DEFAULT_CONFIG_FILE);
+  common_init_finish(g_ceph_context);
+  g_ceph_context->_conf->apply_changes(NULL);
+
+  if (vm.count("help")) {
+    cout << desc << std::endl;
+    return 1;
+  }
+
+  stripe_width = vm["stripe-width"].as<int>();
+  plugin = vm["plugin"].as<string>();
+  base = vm["base"].as<string>();
+  check = vm.count("check") > 0;
+  create = vm.count("create") > 0;
+
+  if (!check && !create) {
+    cerr << "must specify either --check or --create" << endl;
+    return 1;
+  }
+
+  { // "stipe-width" (sic) is baked into the directory name; renaming it would orphan content created earlier
+    stringstream path;
+    path << base << "/" << "plugin=" << plugin << " stipe-width=" << stripe_width;
+    directory = path.str();
+  }
+
+  if (vm.count("parameter")) {
+    const vector<string> &p = vm["parameter"].as< vector<string> >();
+    for (vector<string>::const_iterator i = p.begin();
+	 i != p.end();
+	 ++i) {
+      std::vector<std::string> strs;
+      boost::split(strs, *i, boost::is_any_of("="));
+      if (strs.size() != 2) {
+	cerr << "--parameter " << *i << " ignored because it does not contain exactly one =" << endl;
+      } else {
+	parameters[strs[0]] = strs[1];
+      }
+      if (strs[0] != "directory")
+	directory += " " + *i;
+    }
+  }
+  if (parameters.count("directory") == 0)
+    parameters["directory"] = ".libs";
+
+  return 0;
+}
+
+int ErasureCodeNonRegression::run()
+  {
+  int ret = 0;
+  if(create && (ret = run_create()))
+    return ret;
+  if(check && (ret = run_check()))
+    return ret;
+  return ret;
+}
+
+int ErasureCodeNonRegression::run_create()
+{
+  ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance();
+  ErasureCodeInterfaceRef erasure_code;
+  stringstream messages;
+  int code = instance.factory(plugin, parameters, &erasure_code, messages);
+  if (code) {
+    cerr << messages.str() << endl;
+    return code;
+  }
+
+  if (::mkdir(directory.c_str(), 0755)) {
+    cerr << "mkdir(" << directory << "): " << cpp_strerror(errno) << endl;
+    return 1;
+  }
+  unsigned payload_chunk_size = 37;
+  string payload;
+  for (unsigned j = 0; j < payload_chunk_size; ++j)
+    payload.push_back('a' + (rand() % 26));
+  bufferlist in;
+  for (unsigned j = 0; j < stripe_width; j += payload_chunk_size)
+    in.append(payload);
+  if (stripe_width < in.length())
+    in.splice(stripe_width, in.length() - stripe_width);
+  if (in.write_file(content_path().c_str()))
+    return 1;
+  set<int> want_to_encode;
+  for (unsigned int i = 0; i < erasure_code->get_chunk_count(); i++) {
+    want_to_encode.insert(i);
+  }
+  map<int,bufferlist> encoded;
+  code = erasure_code->encode(want_to_encode, in, &encoded);
+  if (code)
+    return code;
+  for (map<int,bufferlist>::iterator chunk = encoded.begin();
+       chunk != encoded.end();
+       chunk++) {
+    if (chunk->second.write_file(chunk_path(chunk->first).c_str()))
+      return 1;
+  }
+  return 0;
+}
+
+int ErasureCodeNonRegression::decode_erasures(ErasureCodeInterfaceRef erasure_code,
+					      set<int> erasures,
+					      map<int,bufferlist> chunks)
+{
+  map<int,bufferlist> available;
+  for (map<int,bufferlist>::iterator chunk = chunks.begin();
+       chunk != chunks.end();
+       ++chunk) {
+    if (erasures.count(chunk->first) == 0)
+      available[chunk->first] = chunk->second;
+      
+  }
+  map<int,bufferlist> decoded;
+  int code = erasure_code->decode(erasures, available, &decoded);
+  if (code)
+    return code;
+  for (set<int>::iterator erasure = erasures.begin();
+       erasure != erasures.end();
+       ++erasure) {
+    if (!chunks[*erasure].contents_equal(decoded[*erasure])) {
+      cerr << "chunk " << *erasure << " incorrectly recovered" << endl;
+      return 1;
+    }
+  }
+  return 0;
+}
+
+int ErasureCodeNonRegression::run_check()
+{
+  ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance();
+  ErasureCodeInterfaceRef erasure_code;
+  stringstream messages;
+  int code = instance.factory(plugin, parameters, &erasure_code, messages);
+  if (code) {
+    cerr << messages.str() << endl;
+    return code;
+  }
+  string errors;
+  bufferlist in;
+  if (in.read_file(content_path().c_str(), &errors)) {
+    cerr << errors << endl;
+    return 1;
+  }
+  set<int> want_to_encode;
+  for (unsigned int i = 0; i < erasure_code->get_chunk_count(); i++) {
+    want_to_encode.insert(i);
+  }
+
+  map<int,bufferlist> encoded;
+  code = erasure_code->encode(want_to_encode, in, &encoded);
+  if (code)
+    return code;
+
+  for (map<int,bufferlist>::iterator chunk = encoded.begin();
+       chunk != encoded.end();
+       chunk++) {
+    bufferlist existing;
+    if (existing.read_file(chunk_path(chunk->first).c_str(), &errors)) {
+      cerr << errors << endl;
+      return 1;
+    }
+    bufferlist &old = chunk->second;
+    if (existing.length() != old.length() ||
+	memcmp(existing.c_str(), old.c_str(), old.length())) {
+      cerr << "chunk " << chunk->first << " encodes differently" << endl;
+      return 1;
+    }
+  }
+
+  // erasing a single chunk is likely to use a specific code path in every plugin
+  set<int> erasures;
+  erasures.clear();
+  erasures.insert(0);
+  code = decode_erasures(erasure_code, erasures, encoded);
+  if (code)
+    return code;
+
+  if (erasure_code->get_chunk_count() - erasure_code->get_data_chunk_count() > 1) {
+    // erasing two chunks is likely to be the general case
+    erasures.clear();
+    erasures.insert(0);
+    erasures.insert(erasure_code->get_chunk_count() - 1);
+    code = decode_erasures(erasure_code, erasures, encoded);
+    if (code)
+      return code;
+  }
+  
+  return 0;
+}
+
+string ErasureCodeNonRegression::content_path()
+{
+  stringstream path;
+  path << directory << "/content";
+  return path.str();
+}
+
+string ErasureCodeNonRegression::chunk_path(unsigned int chunk)
+{
+  stringstream path;
+  path << directory << "/" << chunk;
+  return path.str();
+}
+
+int main(int argc, char** argv) {
+  ErasureCodeNonRegression non_regression;
+  int err = non_regression.setup(argc, argv);
+  if (err)
+    return err;
+  return non_regression.run();
+}
+
+/*
+ * Local Variables:
+ * compile-command: "cd ../.. ; make -j4 &&
+ *   make ceph_erasure_code_non_regression &&
+ *   libtool --mode=execute valgrind --tool=memcheck --leak-check=full \
+ *      ./ceph_erasure_code_non_regression \
+ *      --plugin jerasure \
+ *      --parameter directory=.libs \
+ *      --parameter technique=reed_sol_van \
+ *      --parameter k=2 \
+ *      --parameter m=2 \
+ *      --base /tmp/ceph_erasure_code_non_regression \
+ *      --stripe-width 3181 \
+ *      --create \
+ *      --check
+ * "
+ * End:
+ */
diff --git a/src/test/libcephfs/test.cc b/src/test/libcephfs/test.cc
index 9d917f5..6baadae 100644
--- a/src/test/libcephfs/test.cc
+++ b/src/test/libcephfs/test.cc
@@ -1138,6 +1138,7 @@ TEST(LibCephFS, GetOsdCrushLocation) {
     }
   }
 
+  ceph_close(cmount, fd);
   ceph_shutdown(cmount);
 }
 
diff --git a/src/test/librados/misc.cc b/src/test/librados/misc.cc
index ea990b5..38443ce 100644
--- a/src/test/librados/misc.cc
+++ b/src/test/librados/misc.cc
@@ -299,7 +299,8 @@ TEST_F(LibRadosMisc, Exec) {
   bufferlist::iterator iter = bl.begin();
   uint64_t all_features;
   ::decode(all_features, iter);
-  ASSERT_EQ(all_features, (uint64_t)RBD_FEATURES_ALL);
+  // make sure *some* features are specified; don't care which ones
+  ASSERT_NE(all_features, 0);
 }
 
 TEST_F(LibRadosMiscPP, ExecPP) {
@@ -311,7 +312,8 @@ TEST_F(LibRadosMiscPP, ExecPP) {
   bufferlist::iterator iter = out.begin();
   uint64_t all_features;
   ::decode(all_features, iter);
-  ASSERT_EQ(all_features, (uint64_t)RBD_FEATURES_ALL);
+  // make sure *some* features are specified; don't care which ones
+  ASSERT_NE(all_features, 0);
 }
 
 TEST_F(LibRadosMiscPP, Operate1PP) {
diff --git a/src/test/librados/snapshots.cc b/src/test/librados/snapshots.cc
index 020af11..01ab62e 100644
--- a/src/test/librados/snapshots.cc
+++ b/src/test/librados/snapshots.cc
@@ -145,6 +145,24 @@ TEST_F(LibRadosSnapshotsPP, SnapGetNamePP) {
   EXPECT_EQ(0, ioctx.snap_remove("snapfoo"));
 }
 
+TEST_F(LibRadosSnapshotsPP, SnapCreateRemovePP) {
+  // reproduces http://tracker.ceph.com/issues/10262
+  bufferlist bl;
+  bl.append("foo");
+  ASSERT_EQ(0, ioctx.write("foo", bl, bl.length(), 0));
+  ASSERT_EQ(0, ioctx.snap_create("snapfoo"));
+  ASSERT_EQ(0, ioctx.remove("foo"));
+  ASSERT_EQ(0, ioctx.snap_create("snapbar"));
+  // create + remove in a single compound op; stack-allocated so it cannot leak
+  librados::ObjectWriteOperation op;
+  op.create(false);
+  op.remove();
+  ASSERT_EQ(0, ioctx.operate("foo", &op));
+
+  EXPECT_EQ(0, ioctx.snap_remove("snapfoo"));
+  EXPECT_EQ(0, ioctx.snap_remove("snapbar"));
+}
+
 TEST_F(LibRadosSnapshotsSelfManaged, Snap) {
   std::vector<uint64_t> my_snaps;
   my_snaps.push_back(-2);
diff --git a/src/test/librados/tier.cc b/src/test/librados/tier.cc
index 4267389..a89d68b 100644
--- a/src/test/librados/tier.cc
+++ b/src/test/librados/tier.cc
@@ -966,6 +966,82 @@ TEST_F(LibRadosTwoPoolsPP, EvictSnap) {
   }
 }
 
+// this test case reproduces http://tracker.ceph.com/issues/8629
+TEST_F(LibRadosTwoPoolsPP, EvictSnap2) {
+  // create object
+  {
+    bufferlist bl;
+    bl.append("hi there");
+    ObjectWriteOperation op;
+    op.write_full(bl);
+    ASSERT_EQ(0, ioctx.operate("foo", &op));
+  }
+  // create a snapshot, clone
+  vector<uint64_t> my_snaps(1);
+  ASSERT_EQ(0, ioctx.selfmanaged_snap_create(&my_snaps[0]));
+  ASSERT_EQ(0, ioctx.selfmanaged_snap_set_write_ctx(my_snaps[0],
+							 my_snaps));
+  {
+    bufferlist bl;
+    bl.append("ciao!");
+    ObjectWriteOperation op;
+    op.write_full(bl);
+    ASSERT_EQ(0, ioctx.operate("foo", &op));
+  }
+  // configure cache
+  bufferlist inbl;
+  ASSERT_EQ(0, cluster.mon_command(
+    "{\"prefix\": \"osd tier add\", \"pool\": \"" + pool_name +
+    "\", \"tierpool\": \"" + cache_pool_name +
+    "\", \"force_nonempty\": \"--force-nonempty\" }",
+    inbl, NULL, NULL));
+  ASSERT_EQ(0, cluster.mon_command(
+    "{\"prefix\": \"osd tier set-overlay\", \"pool\": \"" + pool_name +
+    "\", \"overlaypool\": \"" + cache_pool_name + "\"}",
+    inbl, NULL, NULL));
+  ASSERT_EQ(0, cluster.mon_command(
+    "{\"prefix\": \"osd tier cache-mode\", \"pool\": \"" + cache_pool_name +
+    "\", \"mode\": \"writeback\"}",
+    inbl, NULL, NULL));
+
+  // wait for maps to settle
+  cluster.wait_for_latest_osdmap();
+
+  // read, trigger a promote on the head
+  {
+    bufferlist bl;
+    ASSERT_EQ(1, ioctx.read("foo", bl, 1, 0));
+    ASSERT_EQ('c', bl[0]);
+  }
+
+  // evict
+  {
+    ObjectReadOperation op;
+    op.cache_evict();
+    librados::AioCompletion *completion = cluster.aio_create_completion();
+    ASSERT_EQ(0, cache_ioctx.aio_operate(
+      "foo", completion, &op,
+      librados::OPERATION_IGNORE_CACHE, NULL));
+    completion->wait_for_safe();
+    ASSERT_EQ(0, completion->get_return_value());
+    completion->release();
+  }
+
+  // verify the snapdir is not present in the cache pool
+  {
+    ObjectReadOperation op;
+    librados::snap_set_t snapset;
+    op.list_snaps(&snapset, NULL);
+    ioctx.snap_set_read(librados::SNAP_DIR);
+    librados::AioCompletion *completion = cluster.aio_create_completion();
+    ASSERT_EQ(0, ioctx.aio_operate("foo", completion, &op,
+				   librados::OPERATION_IGNORE_CACHE, NULL));
+    completion->wait_for_safe();
+    ASSERT_EQ(-ENOENT, completion->get_return_value());
+    completion->release();
+  }
+}
+
 TEST_F(LibRadosTwoPoolsPP, TryFlush) {
   // configure cache
   bufferlist inbl;
diff --git a/src/test/librbd/test_librbd.cc b/src/test/librbd/test_librbd.cc
index 7f35418..c37d884 100644
--- a/src/test/librbd/test_librbd.cc
+++ b/src/test/librbd/test_librbd.cc
@@ -21,6 +21,7 @@
 #include "global/global_context.h"
 #include "global/global_init.h"
 #include "common/ceph_argparse.h"
+#include "common/config.h"
 
 #include "gtest/gtest.h"
 
@@ -40,6 +41,8 @@
 #include "include/interval_set.h"
 #include "include/stringify.h"
 
+#include <boost/scope_exit.hpp>
+
 using namespace std;
 
 static int get_features(bool *old_format, uint64_t *features)
@@ -67,6 +70,8 @@ static int create_image_full(rados_ioctx_t ioctx, const char *name,
 {
   if (old_format) {
     return rbd_create(ioctx, name, size, order);
+  } else if ((features & RBD_FEATURE_STRIPINGV2) != 0) {
+    return rbd_create3(ioctx, name, size, features, order, 65536, 16);
   } else {
     return rbd_create2(ioctx, name, size, features, order);
   }
@@ -1859,6 +1864,107 @@ TEST(LibRBD, ZeroLengthRead)
   ASSERT_EQ(0, destroy_one_pool(pool_name, &cluster));
 }
 
+TEST(LibRBD, LargeCacheRead)  // single rbd_read whose requested length exceeds rbd_cache_size
+{
+  if (!g_conf->rbd_cache) {  // the test only runs when the rbd cache option is enabled
+    std::cout << "SKIPPING due to disabled cache" << std::endl;
+    return;
+  }
+  // scratch pool and ioctx used by this test only
+  rados_t cluster;
+  rados_ioctx_t ioctx;
+  string pool_name = get_temp_pool_name();
+  ASSERT_EQ("", create_one_pool(pool_name, &cluster));
+  rados_ioctx_create(cluster, pool_name.c_str(), &ioctx);
+  // force a 16 MiB cache; the scope-exit block writes the saved value back when this scope is left
+  uint64_t orig_cache_size = g_conf->rbd_cache_size;
+  g_conf->set_val("rbd_cache_size", "16777216");
+  BOOST_SCOPE_EXIT( (orig_cache_size) ) {
+    g_conf->set_val("rbd_cache_size", stringify(orig_cache_size).c_str());
+  } BOOST_SCOPE_EXIT_END;
+  ASSERT_EQ(16777216, g_conf->rbd_cache_size);  // confirm the override took effect
+  // the image is exactly one byte larger than the configured cache
+  rbd_image_t image;
+  int order = 0;
+  const char *name = "testimg";
+  uint64_t size = g_conf->rbd_cache_size + 1;
+  // NOTE(review): order is passed by pointer and read back below; presumably create_image stores the order it used - confirm
+  ASSERT_EQ(0, create_image(ioctx, name, size, &order));
+  ASSERT_EQ(0, rbd_open(ioctx, name, &image, NULL));
+  // fill the whole image with '1' in chunks of (1 << order) bytes; the final chunk is shortened to fit
+  std::string buffer(1 << order, '1');
+  for (size_t offs = 0; offs < size; offs += buffer.size()) {
+    size_t len = std::min<uint64_t>(buffer.size(), size - offs);
+    ASSERT_EQ(static_cast<ssize_t>(len),
+	      rbd_write(image, offs, len, buffer.c_str()));
+  }
+  // NOTE(review): presumably discards cached data so the read below is not served from the cache - confirm
+  ASSERT_EQ(0, rbd_invalidate_cache(image));
+  // request `size` bytes starting at offset 1024; the assertion expects only size-1024 bytes back
+  buffer.resize(size);
+  ASSERT_EQ(static_cast<ssize_t>(size-1024), rbd_read(image, 1024, size, &buffer[0]));
+
+  ASSERT_EQ(0, rbd_close(image));
+  // tear down the ioctx and the scratch pool
+  rados_ioctx_destroy(ioctx);
+  ASSERT_EQ(0, destroy_one_pool(pool_name, &cluster));
+}
+
+TEST(LibRBD, TestPendingAio)  // closes an image right after queuing aio reads, then checks the completions
+{
+  rados_t cluster;
+  rados_ioctx_t ioctx;
+  string pool_name = get_temp_pool_name();
+  ASSERT_EQ("", create_one_pool(pool_name, &cluster));
+  rados_ioctx_create(cluster, pool_name.c_str(), &ioctx);
+  // image is created with only RBD_FEATURE_LAYERING requested
+  int features = RBD_FEATURE_LAYERING;
+  rbd_image_t image;
+  int order = 0;
+
+  std::string name = "testimg";
+
+  uint64_t size = 4 << 20;  // 4 MiB image
+  ASSERT_EQ(0, create_image_full(ioctx, name.c_str(), size, &order,
+				 false, features));  // NOTE(review): `false` is presumably the old_format flag - confirm
+  ASSERT_EQ(0, rbd_open(ioctx, name.c_str(), &image, NULL));
+  // payload: random bytes with values in [33, 125]
+  char test_data[TEST_IO_SIZE];
+  for (size_t i = 0; i < TEST_IO_SIZE; ++i) {
+    test_data[i] = (char) (rand() % (126 - 33) + 33);
+  }
+  // phase 1: queue num_aios writes at random offsets, then wait for and release each completion
+  size_t num_aios = 256;
+  rbd_completion_t comps[num_aios];  // NOTE(review): num_aios is not a constant expression, so this is a variable-length array (compiler extension in C++)
+  for (size_t i = 0; i < num_aios; ++i) {
+    ASSERT_EQ(0, rbd_aio_create_completion(NULL, NULL, &comps[i]));
+    uint64_t offset = rand() % (size - TEST_IO_SIZE);  // keeps offset + TEST_IO_SIZE inside the image
+    ASSERT_EQ(0, rbd_aio_write(image, offset, TEST_IO_SIZE, test_data,
+                               comps[i]));
+  }
+  for (size_t i = 0; i < num_aios; ++i) {
+    ASSERT_EQ(0, rbd_aio_wait_for_complete(comps[i]));
+    rbd_aio_release(comps[i]);
+  }
+  ASSERT_EQ(0, rbd_invalidate_cache(image));
+  // phase 2: queue num_aios reads and do NOT wait for them; all reads target the same test_data buffer and the data is never checked
+  for (size_t i = 0; i < num_aios; ++i) {
+    ASSERT_EQ(0, rbd_aio_create_completion(NULL, NULL, &comps[i]));
+    uint64_t offset = rand() % (size - TEST_IO_SIZE);
+    ASSERT_LE(0, rbd_aio_read(image, offset, TEST_IO_SIZE, test_data,
+                              comps[i]));
+  }
+  // close with the reads still outstanding; the assertions below expect every completion to be complete once rbd_close has returned
+  ASSERT_EQ(0, rbd_close(image));
+  for (size_t i = 0; i < num_aios; ++i) {
+    ASSERT_EQ(1, rbd_aio_is_complete(comps[i]));
+    rbd_aio_release(comps[i]);
+  }
+
+  rados_ioctx_destroy(ioctx);
+  ASSERT_EQ(0, destroy_one_pool(pool_name, &cluster));
+}
+
 int main(int argc, char **argv)
 {
   ::testing::InitGoogleTest(&argc, argv);
diff --git a/src/test/mon/mon-test-helpers.sh b/src/test/mon/mon-test-helpers.sh
index d228569..052b1ca 100644
--- a/src/test/mon/mon-test-helpers.sh
+++ b/src/test/mon/mon-test-helpers.sh
@@ -59,8 +59,9 @@ function run_mon() {
 function kill_daemons() {
     local dir=$1
     for pidfile in $(find $dir | grep pidfile) ; do
+        pid=$(cat $pidfile)
         for try in 0 1 1 1 2 3 ; do
-            kill -9 $(cat $pidfile 2> /dev/null) 2> /dev/null || break
+            kill -9 $pid 2> /dev/null || break
             sleep $try
         done
     done
diff --git a/src/tools/crushtool.cc b/src/tools/crushtool.cc
index 8dcd79c..863bd94 100644
--- a/src/tools/crushtool.cc
+++ b/src/tools/crushtool.cc
@@ -118,6 +118,7 @@ void usage()
   cout << "   --show utilization-all\n";
   cout << "                         include zero weight items\n";
   cout << "   --show-statistics     show chi squared statistics\n";
+  cout << "   --show-mappings       show mappings\n";
   cout << "   --show-bad-mappings   show bad mappings\n";
   cout << "   --show-choose-tries   show choose tries histogram\n";
   cout << "   --set-choose-local-tries N\n";
@@ -190,6 +191,7 @@ int main(int argc, const char **argv)
   int choose_total_tries = -1;
   int chooseleaf_descend_once = -1;
   int chooseleaf_vary_r = -1;
+  int straw_calc_version = -1;
 
   CrushWrapper crush;
 
@@ -233,6 +235,9 @@ int main(int argc, const char **argv)
     } else if (ceph_argparse_flag(args, i, "--show_statistics", (char*)NULL)) {
       display = true;
       tester.set_output_statistics(true);
+    } else if (ceph_argparse_flag(args, i, "--show_mappings", (char*)NULL)) {
+      display = true;
+      tester.set_output_mappings(true);
     } else if (ceph_argparse_flag(args, i, "--show_bad_mappings", (char*)NULL)) {
       display = true;
       tester.set_output_bad_mappings(true);
@@ -263,6 +268,9 @@ int main(int argc, const char **argv)
     } else if (ceph_argparse_withint(args, i, &chooseleaf_vary_r, &err,
 				     "--set_chooseleaf_vary_r", (char*)NULL)) {
       adjust = true;
+    } else if (ceph_argparse_withint(args, i, &straw_calc_version, &err,
+				     "--set_straw_calc_version", (char*)NULL)) {
+      adjust = true;
     } else if (ceph_argparse_flag(args, i, "--reweight", (char*)NULL)) {
       reweight = true;
     } else if (ceph_argparse_withint(args, i, &add_item, &err, "--add_item", (char*)NULL)) {
@@ -581,10 +589,8 @@ int main(int argc, const char **argv)
 	  dout(2) << "  item " << items[j] << " weight " << weights[j] << dendl;
 	}
 
-	crush_bucket *b = crush_make_bucket(buckettype, CRUSH_HASH_DEFAULT, type, j, items, weights);
-	assert(b);
 	int id;
-	int r = crush_add_bucket(crush.crush, 0, b, &id);
+	int r = crush.add_bucket(0, buckettype, CRUSH_HASH_DEFAULT, type, j, items, weights, &id);
 	if (r < 0) {
 	  dout(2) << "Couldn't add bucket: " << cpp_strerror(r) << dendl;
 	}
@@ -712,6 +718,10 @@ int main(int argc, const char **argv)
     crush.set_chooseleaf_vary_r(chooseleaf_vary_r);
     modified = true;
   }
+  if (straw_calc_version >= 0) {
+    crush.set_straw_calc_version(straw_calc_version);
+    modified = true;
+  }
   if (modified) {
     crush.finalize();
 
diff --git a/src/tools/osdmaptool.cc b/src/tools/osdmaptool.cc
index 0db39da..bfeae4f 100644
--- a/src/tools/osdmaptool.cc
+++ b/src/tools/osdmaptool.cc
@@ -35,6 +35,7 @@ void usage()
   cout << "   --export-crush <file>   write osdmap's crush map to <file>" << std::endl;
   cout << "   --import-crush <file>   replace osdmap's crush map with <file>" << std::endl;
   cout << "   --test-map-pgs [--pool <poolid>] map all pgs" << std::endl;
+  cout << "   --test-map-pgs-dump [--pool <poolid>] map all pgs" << std::endl;
   cout << "   --mark-up-in            mark osds up and in (but do not persist)" << std::endl;
   cout << "   --clear-temp            clear pg_temp and primary_temp" << std::endl;
   cout << "   --test-random           do random placements" << std::endl;
@@ -75,6 +76,7 @@ int main(int argc, const char **argv)
   bool mark_up_in = false;
   bool clear_temp = false;
   bool test_map_pgs = false;
+  bool test_map_pgs_dump = false;
   bool test_random = false;
 
   std::string val;
@@ -104,6 +106,8 @@ int main(int argc, const char **argv)
       clear_temp = true;
     } else if (ceph_argparse_flag(args, i, "--test-map-pgs", (char*)NULL)) {
       test_map_pgs = true;
+    } else if (ceph_argparse_flag(args, i, "--test-map-pgs-dump", (char*)NULL)) {
+      test_map_pgs_dump = true;
     } else if (ceph_argparse_flag(args, i, "--test-random", (char*)NULL)) {
       test_random = true;
     } else if (ceph_argparse_flag(args, i, "--clobber", (char*)NULL)) {
@@ -313,7 +317,7 @@ int main(int argc, const char **argv)
          << ") acting (" << acting << ", p" << acting_primary << ")"
          << std::endl;
   }
-  if (test_map_pgs) {
+  if (test_map_pgs || test_map_pgs_dump) {
     if (pool != -1 && !osdmap.have_pg_pool(pool)) {
       cerr << "There is no pool " << pool << std::endl;
       exit(1);
@@ -348,6 +352,9 @@ int main(int argc, const char **argv)
 	}
 	size[osds.size()]++;
 
+	if (test_map_pgs_dump)
+	  cout << pgid << "\t" << osds << "\t" << primary << std::endl;
+
 	for (unsigned i=0; i<osds.size(); i++) {
 	  //cout << " rep " << i << " on " << osds[i] << std::endl;
 	  count[osds[i]]++;
@@ -452,7 +459,7 @@ int main(int argc, const char **argv)
   if (!print && !print_json && !tree && !modified && 
       export_crush.empty() && import_crush.empty() && 
       test_map_pg.empty() && test_map_object.empty() &&
-      !test_map_pgs) {
+      !test_map_pgs && !test_map_pgs_dump) {
     cerr << me << ": no action specified?" << std::endl;
     usage();
   }

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-ceph/ceph.git



More information about the Pkg-ceph-commits mailing list